mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 15:36:28 +00:00 
			
		
		
		
	implement distinct attribute
distinct can return error; facet distinct on numbers; return distinct error; review fixes; make get_facet_value more generic; fixes
This commit is contained in:
		| @@ -19,6 +19,7 @@ use crate::{ | ||||
|  | ||||
| pub const CRITERIA_KEY: &str = "criteria"; | ||||
| pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||
| pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; | ||||
| pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||
| pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; | ||||
| pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; | ||||
| @@ -460,6 +461,18 @@ impl Index { | ||||
|     pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, UPDATED_AT_KEY, &time) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) | ||||
|     } | ||||
|  | ||||
|     pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> { | ||||
|         self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { | ||||
|         self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
|   | ||||
| @@ -17,7 +17,7 @@ use crate::search::facet::FacetIter; | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
| use crate::{FieldsIdsMap, FieldId, Index}; | ||||
| use super::{Criterion, CriterionResult}; | ||||
| use super::{Criterion, CriterionResult, CriterionContext}; | ||||
|  | ||||
| /// Threshold on the number of candidates that will make | ||||
| /// the system to choose between one algorithm or another. | ||||
| @@ -151,7 +151,7 @@ impl<'t> AscDesc<'t> { | ||||
|  | ||||
| impl<'t> Criterion for AscDesc<'t> { | ||||
|     #[logging_timer::time("AscDesc::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|     fn next(&mut self, context: CriterionContext) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         loop { | ||||
|             debug!("Facet {}({}) iteration", | ||||
|                 if self.ascending { "Asc" } else { "Desc" }, self.field_name | ||||
| @@ -163,7 +163,8 @@ impl<'t> Criterion for AscDesc<'t> { | ||||
|                     let bucket_candidates = take(&mut self.bucket_candidates); | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                             let CriterionContext { word_cache, exclude } = context; | ||||
|                             match parent.next(CriterionContext { exclude, word_cache })? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_tree = query_tree; | ||||
|                                     let candidates = match (&self.query_tree, candidates) { | ||||
| @@ -173,7 +174,7 @@ impl<'t> Criterion for AscDesc<'t> { | ||||
|                                         }, | ||||
|                                         (Some(qt), None) => { | ||||
|                                             let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; | ||||
|                                             let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; | ||||
|                                             let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), word_cache)?; | ||||
|                                             candidates.intersect_with(&self.faceted_candidates); | ||||
|                                             candidates | ||||
|                                         }, | ||||
|   | ||||
| @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
| use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; | ||||
| use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context, CriterionContext}; | ||||
|  | ||||
| /// The result of a call to the fetcher. | ||||
| #[derive(Debug, Clone, PartialEq)] | ||||
| @@ -61,7 +61,7 @@ impl<'t> Fetcher<'t> { | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("Fetcher::{}")] | ||||
|     pub fn next(&mut self) -> anyhow::Result<Option<FetcherResult>> { | ||||
|     pub fn next(&mut self, exclude: &RoaringBitmap) -> anyhow::Result<Option<FetcherResult>> { | ||||
|         use Candidates::{Allowed, Forbidden}; | ||||
|         loop { | ||||
|             debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", | ||||
| @@ -90,7 +90,11 @@ impl<'t> Fetcher<'t> { | ||||
|                 Forbidden(_) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(&mut self.wdcache)? { | ||||
|                             let context = CriterionContext { | ||||
|                                 word_cache: &mut self.wdcache, | ||||
|                                 exclude | ||||
|                             }; | ||||
|                             match parent.next(context)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     let candidates = match (&query_tree, candidates) { | ||||
|                                         (_, Some(candidates)) => candidates, | ||||
|   | ||||
| @@ -20,8 +20,13 @@ mod asc_desc; | ||||
| mod proximity; | ||||
| pub mod fetcher; | ||||
|  | ||||
| pub struct CriterionContext<'a, 'b> { | ||||
|     exclude: &'a RoaringBitmap, | ||||
|     word_cache: &'b mut WordDerivationsCache, | ||||
| } | ||||
|  | ||||
| pub trait Criterion { | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>>; | ||||
|     fn next(&mut self, wdcache: CriterionContext) -> anyhow::Result<Option<CriterionResult>>; | ||||
| } | ||||
|  | ||||
| /// The result of a call to the parent criterion. | ||||
|   | ||||
| @@ -8,7 +8,7 @@ use log::debug; | ||||
| use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; | ||||
| use crate::search::query_tree::{maximum_proximity, Operation, Query}; | ||||
| use crate::search::{build_dfa, WordDerivationsCache}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree, CriterionContext}; | ||||
|  | ||||
| pub struct Proximity<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
| @@ -56,8 +56,9 @@ impl<'t> Proximity<'t> { | ||||
|  | ||||
| impl<'t> Criterion for Proximity<'t> { | ||||
|     #[logging_timer::time("Proximity::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|     fn next(&mut self, context: CriterionContext) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         use Candidates::{Allowed, Forbidden}; | ||||
|         let CriterionContext { word_cache, exclude } = context; | ||||
|         loop { | ||||
|             debug!("Proximity at iteration {} (max {:?}) ({:?})", | ||||
|                 self.proximity, | ||||
| @@ -98,7 +99,7 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                                     self.ctx, | ||||
|                                     query_tree, | ||||
|                                     candidates, | ||||
|                                     wdcache, | ||||
|                                     word_cache, | ||||
|                                 )?; | ||||
|                                 self.plane_sweep_cache = Some(cache.into_iter()); | ||||
|  | ||||
| @@ -110,7 +111,7 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                                &query_tree, | ||||
|                                self.proximity, | ||||
|                                &mut self.candidates_cache, | ||||
|                                wdcache, | ||||
|                                word_cache, | ||||
|                            )? | ||||
|                         }; | ||||
|  | ||||
| @@ -140,7 +141,7 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                             &query_tree, | ||||
|                             self.proximity, | ||||
|                             &mut self.candidates_cache, | ||||
|                             wdcache, | ||||
|                             word_cache, | ||||
|                         )?; | ||||
|  | ||||
|                         new_candidates.difference_with(&candidates); | ||||
| @@ -170,11 +171,11 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                 (None, Forbidden(_)) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                             match parent.next(CriterionContext { exclude, word_cache })? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     let candidates = match (&query_tree, candidates) { | ||||
|                                         (_, Some(candidates)) => candidates, | ||||
|                                         (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, | ||||
|                                         (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), word_cache)?, | ||||
|                                         (None, None) => RoaringBitmap::new(), | ||||
|                                     }; | ||||
|  | ||||
|   | ||||
| @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; | ||||
| use crate::search::{word_derivations, WordDerivationsCache}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, CriterionContext}; | ||||
|  | ||||
| pub struct Typo<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
| @@ -51,8 +51,9 @@ impl<'t> Typo<'t> { | ||||
|  | ||||
| impl<'t> Criterion for Typo<'t> { | ||||
|     #[logging_timer::time("Typo::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|     fn next(&mut self, context: CriterionContext) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         use Candidates::{Allowed, Forbidden}; | ||||
|         let CriterionContext { word_cache, exclude } = context; | ||||
|         loop { | ||||
|             debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); | ||||
|  | ||||
| @@ -71,9 +72,9 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                     } else { | ||||
|                         let fst = self.ctx.words_fst(); | ||||
|                         let new_query_tree = if self.number_typos < 2 { | ||||
|                             alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? | ||||
|                             alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? | ||||
|                         } else if self.number_typos == 2 { | ||||
|                             *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; | ||||
|                             *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; | ||||
|                             query_tree.clone() | ||||
|                         } else { | ||||
|                             query_tree.clone() | ||||
| @@ -84,7 +85,7 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                             &new_query_tree, | ||||
|                             self.number_typos, | ||||
|                             &mut self.candidates_cache, | ||||
|                             wdcache, | ||||
|                             word_cache, | ||||
|                         )?; | ||||
|                         new_candidates.intersect_with(&candidates); | ||||
|                         candidates.difference_with(&new_candidates); | ||||
| @@ -109,9 +110,9 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                     } else { | ||||
|                         let fst = self.ctx.words_fst(); | ||||
|                         let new_query_tree = if self.number_typos < 2 { | ||||
|                             alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? | ||||
|                             alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? | ||||
|                         } else if self.number_typos == 2 { | ||||
|                             *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; | ||||
|                             *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; | ||||
|                             query_tree.clone() | ||||
|                         } else { | ||||
|                             query_tree.clone() | ||||
| @@ -122,7 +123,7 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                             &new_query_tree, | ||||
|                             self.number_typos, | ||||
|                             &mut self.candidates_cache, | ||||
|                             wdcache, | ||||
|                             word_cache, | ||||
|                         )?; | ||||
|                         new_candidates.difference_with(&candidates); | ||||
|                         candidates.union_with(&new_candidates); | ||||
| @@ -147,7 +148,7 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                 (None, Forbidden(_)) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                             match parent.next(CriterionContext { exclude, word_cache })? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); | ||||
|                                     self.number_typos = 0; | ||||
| @@ -346,8 +347,12 @@ mod test { | ||||
|  | ||||
|         let mut wdcache = WordDerivationsCache::new(); | ||||
|         let mut criteria = Typo::initial(&context, query_tree, facet_candidates); | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert!(criteria.next(&mut wdcache).unwrap().is_none()); | ||||
|         assert!(criteria.next(sort_context).unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
| @@ -381,7 +386,12 @@ mod test { | ||||
|             bucket_candidates: candidates_1, | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); | ||||
|  | ||||
|         let candidates_2 = ( | ||||
|                 context.word_docids("split").unwrap().unwrap() | ||||
| @@ -403,7 +413,12 @@ mod test { | ||||
|             bucket_candidates: candidates_2, | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
| @@ -421,11 +436,19 @@ mod test { | ||||
|             bucket_candidates: facet_candidates, | ||||
|         }; | ||||
|  | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|         // first iteration, returns the facet candidates | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); | ||||
|         assert_eq!(criteria.next(sort_context).unwrap(), Some(expected)); | ||||
|  | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|         // second iteration, returns None because there is no more things to do | ||||
|         assert!(criteria.next(&mut wdcache).unwrap().is_none()); | ||||
|         assert!(criteria.next(sort_context ).unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
| @@ -459,7 +482,12 @@ mod test { | ||||
|             bucket_candidates: candidates_1 & &facet_candidates, | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); | ||||
|  | ||||
|         let candidates_2 = ( | ||||
|                 context.word_docids("split").unwrap().unwrap() | ||||
| @@ -481,7 +509,12 @@ mod test { | ||||
|             bucket_candidates: candidates_2 & &facet_candidates, | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); | ||||
|         let sort_context = CriterionContext { | ||||
|             word_cache: &mut wdcache, | ||||
|             exclude: &RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); | ||||
|     } | ||||
|  | ||||
| } | ||||
|   | ||||
| @@ -5,8 +5,7 @@ use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
| use super::{resolve_query_tree, Criterion, CriterionResult, Context}; | ||||
| use super::{resolve_query_tree, Criterion, CriterionResult, Context, CriterionContext}; | ||||
|  | ||||
| pub struct Words<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
| @@ -48,7 +47,8 @@ impl<'t> Words<'t> { | ||||
|  | ||||
| impl<'t> Criterion for Words<'t> { | ||||
|     #[logging_timer::time("Words::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|     fn next(&mut self, context: CriterionContext) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         let CriterionContext { word_cache, exclude } = context; | ||||
|         loop { | ||||
|             debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); | ||||
|  | ||||
| @@ -62,7 +62,7 @@ impl<'t> Criterion for Words<'t> { | ||||
|                     })); | ||||
|                 }, | ||||
|                 (Some(qt), Some(candidates)) => { | ||||
|                     let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; | ||||
|                     let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, word_cache)?; | ||||
|                     found_candidates.intersect_with(&candidates); | ||||
|                     candidates.difference_with(&found_candidates); | ||||
|  | ||||
| @@ -100,7 +100,7 @@ impl<'t> Criterion for Words<'t> { | ||||
|                 (None, None) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                             match parent.next(CriterionContext { word_cache, exclude })? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); | ||||
|                                     self.candidates = candidates; | ||||
|   | ||||
							
								
								
									
										192
									
								
								milli/src/search/distinct/facet_distinct.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										192
									
								
								milli/src/search/distinct/facet_distinct.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,192 @@ | ||||
| use std::mem::size_of; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::facet::*; | ||||
| use crate::{facet::FacetType, DocumentId, FieldId, Index}; | ||||
| use super::{Distinct, DocIter}; | ||||
|  | ||||
| pub struct FacetDistinct<'a> { | ||||
|     distinct: FieldId, | ||||
|     index: &'a Index, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
|     facet_type: FacetType, | ||||
| } | ||||
|  | ||||
| impl<'a> FacetDistinct<'a> { | ||||
|     pub fn new( | ||||
|         distinct: FieldId, | ||||
|         index: &'a Index, | ||||
|         txn: &'a heed::RoTxn<'a>, | ||||
|         facet_type: FacetType, | ||||
|     ) -> Self { | ||||
|         Self { | ||||
|             distinct, | ||||
|             index, | ||||
|             txn, | ||||
|             facet_type, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct FacetDistinctIter<'a> { | ||||
|     candidates: RoaringBitmap, | ||||
|     distinct: FieldId, | ||||
|     excluded: RoaringBitmap, | ||||
|     facet_type: FacetType, | ||||
|     index: &'a Index, | ||||
|     iter_offset: usize, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
| } | ||||
|  | ||||
| impl<'a> FacetDistinctIter<'a> { | ||||
|     fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap> | ||||
|     where | ||||
|         KC: heed::BytesEncode<'c>, | ||||
|     { | ||||
|         let facet_docids = self | ||||
|             .index | ||||
|             .facet_field_id_value_docids | ||||
|             .remap_key_type::<KC>() | ||||
|             .get(self.txn, key)? | ||||
|             .expect("Corrupted data: Facet values must exist"); | ||||
|         Ok(facet_docids) | ||||
|     } | ||||
|  | ||||
|     fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { | ||||
|         let iter = get_facet_values::<FieldDocIdFacetStringCodec>( | ||||
|             id, | ||||
|             self.distinct, | ||||
|             self.index, | ||||
|             self.txn, | ||||
|         )?; | ||||
|  | ||||
|         for item in iter { | ||||
|             let ((_, _, value), _) = item?; | ||||
|             let key = (self.distinct, value); | ||||
|             let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?; | ||||
|             self.excluded.union_with(&facet_docids); | ||||
|         } | ||||
|  | ||||
|         self.excluded.remove(id); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> { | ||||
|         let iter = get_facet_values::<FieldDocIdFacetI64Codec>( | ||||
|             id, | ||||
|             self.distinct, | ||||
|             self.index, | ||||
|             self.txn, | ||||
|         )?; | ||||
|  | ||||
|         for item in iter { | ||||
|             let ((_, _, value), _) = item?; | ||||
|             // get facet docids on level 0 | ||||
|             let key = (self.distinct, 0, value, value); | ||||
|             let facet_docids = self.get_facet_docids::<FacetLevelValueI64Codec>(&key)?; | ||||
|             self.excluded.union_with(&facet_docids); | ||||
|         } | ||||
|  | ||||
|         self.excluded.remove(id); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> { | ||||
|         let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id, | ||||
|             self.distinct, | ||||
|             self.index, | ||||
|             self.txn, | ||||
|         )?; | ||||
|  | ||||
|         for item in iter { | ||||
|             let ((_, _, value), _) = item?; | ||||
|             // get facet docids on level 0 | ||||
|             let key = (self.distinct, 0, value, value); | ||||
|             let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?; | ||||
|             self.excluded.union_with(&facet_docids); | ||||
|         } | ||||
|  | ||||
|         self.excluded.remove(id); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> { | ||||
|         // The first step is to remove all the excluded documents from our candidates | ||||
|         self.candidates.difference_with(&self.excluded); | ||||
|  | ||||
|         let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); | ||||
|         match candidates_iter.next() { | ||||
|             Some(id) => { | ||||
|                 match self.facet_type { | ||||
|                     FacetType::String => self.distinct_string(id)?, | ||||
|                     FacetType::Integer => self.distinct_integer(id)?, | ||||
|                     FacetType::Float => self.distinct_float(id)?, | ||||
|                 }; | ||||
|  | ||||
|                 // On every iteration, the first document is always a distinct one, since it | ||||
|                 // hasn't been discarded by the previous difference. | ||||
|                 self.iter_offset += 1; | ||||
|                 Ok(Some(id)) | ||||
|             } | ||||
|             // no more candidate at this offset, return. | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn get_facet_values<'a, KC>( | ||||
|     id: DocumentId, | ||||
|     distinct: FieldId, | ||||
|     index: &Index, | ||||
|     txn: &'a heed::RoTxn, | ||||
| ) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>> | ||||
| where | ||||
|     KC: heed::BytesDecode<'a>, | ||||
| { | ||||
|     const FID_SIZE: usize = size_of::<FieldId>(); | ||||
|     const DOCID_SIZE: usize = size_of::<DocumentId>(); | ||||
|  | ||||
|     let mut key = [0; FID_SIZE + DOCID_SIZE]; | ||||
|     key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); | ||||
|     key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); | ||||
|  | ||||
|     let iter = index | ||||
|         .field_id_docid_facet_values | ||||
|         .prefix_iter(txn, &key)? | ||||
|         .remap_key_type::<KC>(); | ||||
|     Ok(iter) | ||||
| } | ||||
|  | ||||
| impl Iterator for FacetDistinctIter<'_> { | ||||
|     type Item = anyhow::Result<DocumentId>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         self.next_inner().transpose() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl DocIter for FacetDistinctIter<'_> { | ||||
|     fn into_excluded(self) -> RoaringBitmap { | ||||
|         self.excluded | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> Distinct<'_> for FacetDistinct<'a> { | ||||
|     type Iter = FacetDistinctIter<'a>; | ||||
|  | ||||
|     fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { | ||||
|         FacetDistinctIter { | ||||
|             candidates, | ||||
|             distinct: self.distinct, | ||||
|             excluded, | ||||
|             facet_type: self.facet_type, | ||||
|             index: self.index, | ||||
|             iter_offset: 0, | ||||
|             txn: self.txn, | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										109
									
								
								milli/src/search/distinct/map_distinct.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								milli/src/search/distinct/map_distinct.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,109 @@ | ||||
| use std::collections::HashMap; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::{Distinct, DocIter}; | ||||
| use crate::{DocumentId, FieldId, Index}; | ||||
|  | ||||
| pub struct MapDistinct<'a> { | ||||
|     distinct: FieldId, | ||||
|     map: HashMap<String, usize>, | ||||
|     index: &'a Index, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
| } | ||||
|  | ||||
| impl<'a> MapDistinct<'a> { | ||||
|     pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { | ||||
|         let map = HashMap::new(); | ||||
|         Self { | ||||
|             distinct, | ||||
|             map, | ||||
|             index, | ||||
|             txn, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct MapDistinctIter<'a, 'b> { | ||||
|     distinct: FieldId, | ||||
|     map: &'b mut HashMap<String, usize>, | ||||
|     index: &'a Index, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
|     candidates: roaring::bitmap::IntoIter, | ||||
|     excluded: RoaringBitmap, | ||||
| } | ||||
|  | ||||
| impl<'a, 'b> MapDistinctIter<'a, 'b> { | ||||
|     fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> { | ||||
|         let map = &mut self.map; | ||||
|         let mut filter = |value: Value| { | ||||
|             let entry = map.entry(value.to_string()).or_insert(0); | ||||
|             *entry += 1; | ||||
|             *entry <= 1 | ||||
|         }; | ||||
|  | ||||
|         while let Some(id) = self.candidates.next() { | ||||
|             let document = self.index.documents(&self.txn, Some(id))?[0].1; | ||||
|             let value = document | ||||
|                 .get(self.distinct) | ||||
|                 .map(serde_json::from_slice::<Value>) | ||||
|                 .transpose()?; | ||||
|  | ||||
|             let accept = match value { | ||||
|                 Some(value) => { | ||||
|                     match value { | ||||
|                         // Since we can't distinct these values, we always accept them | ||||
|                         Value::Null | Value::Object(_) => true, | ||||
|                         Value::Array(values) => { | ||||
|                             let mut accept = true; | ||||
|                             for value in values { | ||||
|                                 accept &= filter(value); | ||||
|                             } | ||||
|                             accept | ||||
|                         } | ||||
|                         value => filter(value), | ||||
|                     } | ||||
|                 } | ||||
|                 // Accept values by default. | ||||
|                 _ => true, | ||||
|             }; | ||||
|  | ||||
|             if accept { | ||||
|                 return Ok(Some(id)); | ||||
|             } else { | ||||
|                 self.excluded.insert(id); | ||||
|             } | ||||
|         } | ||||
|         Ok(None) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Iterator for MapDistinctIter<'_, '_> { | ||||
|     type Item = anyhow::Result<DocumentId>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         self.next_inner().transpose() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl DocIter for MapDistinctIter<'_, '_> { | ||||
|     fn into_excluded(self) -> RoaringBitmap { | ||||
|         self.excluded | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { | ||||
|     type Iter = MapDistinctIter<'a, 'b>; | ||||
|  | ||||
|     fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { | ||||
|         MapDistinctIter { | ||||
|             distinct: self.distinct, | ||||
|             map: &mut self.map, | ||||
|             index: &self.index, | ||||
|             txn: &self.txn, | ||||
|             candidates: candidates.into_iter(), | ||||
|             excluded, | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										21
									
								
								milli/src/search/distinct/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								milli/src/search/distinct/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| mod facet_distinct; | ||||
| mod map_distinct; | ||||
| mod noop_distinct; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| pub use facet_distinct::FacetDistinct; | ||||
| pub use map_distinct::MapDistinct; | ||||
| pub use noop_distinct::NoopDistinct; | ||||
| use crate::DocumentId; | ||||
|  | ||||
| pub trait DocIter: Iterator<Item=anyhow::Result<DocumentId>> { | ||||
|     /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) | ||||
|     fn into_excluded(self) -> RoaringBitmap; | ||||
| } | ||||
|  | ||||
| pub trait Distinct<'a> { | ||||
|     type Iter: DocIter; | ||||
|  | ||||
|     fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; | ||||
| } | ||||
							
								
								
									
										36
									
								
								milli/src/search/distinct/noop_distinct.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								milli/src/search/distinct/noop_distinct.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::DocumentId; | ||||
| use super::{DocIter, Distinct}; | ||||
|  | ||||
/// A distinct rule that does no filtering: every candidate is returned as is.
pub struct NoopDistinct;
|  | ||||
| pub struct NoopDistinctIter { | ||||
|     candidates: roaring::bitmap::IntoIter, | ||||
|     excluded: RoaringBitmap, | ||||
| } | ||||
|  | ||||
| impl Iterator for NoopDistinctIter { | ||||
|     type Item = anyhow::Result<DocumentId>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         self.candidates.next().map(Result::Ok) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl DocIter for NoopDistinctIter { | ||||
|     fn into_excluded(self) -> RoaringBitmap { | ||||
|         self.excluded | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Distinct<'_> for NoopDistinct { | ||||
|     type Iter = NoopDistinctIter; | ||||
|  | ||||
|     fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { | ||||
|         NoopDistinctIter { | ||||
|             candidates: candidates.into_iter(), | ||||
|             excluded, | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -11,22 +11,24 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; | ||||
| use once_cell::sync::Lazy; | ||||
| use roaring::bitmap::RoaringBitmap; | ||||
|  | ||||
| use crate::search::criteria::fetcher::FetcherResult; | ||||
| use crate::search::criteria::fetcher::{FetcherResult, Fetcher}; | ||||
| use crate::{Index, DocumentId}; | ||||
| use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct}; | ||||
| use self::query_tree::QueryTreeBuilder; | ||||
|  | ||||
| pub use self::facet::FacetIter; | ||||
| pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; | ||||
| pub use self::query_tree::MatchingWords; | ||||
| use self::query_tree::QueryTreeBuilder; | ||||
|  | ||||
| // Building these factories is not free. | ||||
| static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | ||||
| static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true)); | ||||
| static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | ||||
|  | ||||
| mod criteria; | ||||
| mod distinct; | ||||
| mod facet; | ||||
| mod query_tree; | ||||
| mod criteria; | ||||
|  | ||||
| pub struct Search<'a> { | ||||
|     query: Option<String>, | ||||
| @@ -123,33 +125,60 @@ impl<'a> Search<'a> { | ||||
|         }; | ||||
|  | ||||
|         let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; | ||||
|         let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; | ||||
|         let criteria = criteria_builder.build(query_tree, facet_candidates)?; | ||||
|  | ||||
|         match self.index.distinct_attribute(self.rtxn)? { | ||||
|             None => self.perform_sort(NoopDistinct, matching_words, criteria), | ||||
|             Some(name) => { | ||||
|                 let field_ids_map = self.index.fields_ids_map(self.rtxn)?; | ||||
|                 let id = field_ids_map.id(name).expect("distinct not present in field map"); | ||||
|                 let faceted_fields = self.index.faceted_fields(self.rtxn)?; | ||||
|                 match faceted_fields.get(name) { | ||||
|                     Some(facet_type) => { | ||||
|                         let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); | ||||
|                         self.perform_sort(distinct, matching_words, criteria) | ||||
|                     } | ||||
|                     None => { | ||||
|                         let distinct = MapDistinct::new(id, self.index, self.rtxn); | ||||
|                         self.perform_sort(distinct, matching_words, criteria) | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn perform_sort( | ||||
|         &self, | ||||
|         mut distinct: impl for<'c> Distinct<'c>, | ||||
|         matching_words: MatchingWords, | ||||
|         mut criteria: Fetcher, | ||||
|     ) -> anyhow::Result<SearchResult> { | ||||
|  | ||||
|         let mut offset = self.offset; | ||||
|         let mut limit = self.limit; | ||||
|         let mut documents_ids = Vec::new(); | ||||
|         let mut initial_candidates = RoaringBitmap::new(); | ||||
|         while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { | ||||
|         let mut excluded_documents = RoaringBitmap::new(); | ||||
|         let mut documents_ids = Vec::with_capacity(self.limit); | ||||
|  | ||||
|         while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_documents)? { | ||||
|  | ||||
|             debug!("Number of candidates found {}", candidates.len()); | ||||
|  | ||||
|             let mut len = candidates.len() as usize; | ||||
|             let mut candidates = candidates.into_iter(); | ||||
|             let excluded = std::mem::take(&mut excluded_documents); | ||||
|  | ||||
|             let mut candidates = distinct.distinct(candidates, excluded); | ||||
|  | ||||
|             initial_candidates.union_with(&bucket_candidates); | ||||
|  | ||||
|             if offset != 0 { | ||||
|                 candidates.by_ref().take(offset).for_each(drop); | ||||
|                 offset = offset.saturating_sub(len.min(offset)); | ||||
|                 len = len.saturating_sub(len.min(offset)); | ||||
|                 let discarded = candidates.by_ref().take(offset).count(); | ||||
|                 offset = offset.saturating_sub(discarded); | ||||
|             } | ||||
|  | ||||
|             if len != 0 { | ||||
|                 documents_ids.extend(candidates.take(limit)); | ||||
|                 limit = limit.saturating_sub(len.min(limit)); | ||||
|             for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { | ||||
|                 documents_ids.push(candidate?); | ||||
|             } | ||||
|  | ||||
|             if limit == 0 { break } | ||||
|             if documents_ids.len() == self.limit { break } | ||||
|             excluded_documents = candidates.into_excluded(); | ||||
|         } | ||||
|  | ||||
|         Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) | ||||
|   | ||||
| @@ -70,6 +70,7 @@ pub struct Settings<'a, 't, 'u, 'i> { | ||||
|     faceted_fields: Setting<HashMap<String, String>>, | ||||
|     criteria: Setting<Vec<String>>, | ||||
|     stop_words: Setting<BTreeSet<String>>, | ||||
|     distinct_attribute: Setting<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
| @@ -94,6 +95,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|             faceted_fields: Setting::NotSet, | ||||
|             criteria: Setting::NotSet, | ||||
|             stop_words: Setting::NotSet, | ||||
|             distinct_attribute: Setting::NotSet, | ||||
|             update_id, | ||||
|         } | ||||
|     } | ||||
| @@ -142,6 +144,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { | ||||
|         self.distinct_attribute = Setting::Set(distinct_attribute); | ||||
|     } | ||||
|  | ||||
|     pub fn reset_distinct_attribute(&mut self) { | ||||
|         self.distinct_attribute = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> | ||||
|         where | ||||
|             F: Fn(UpdateIndexingStep, u64) + Sync | ||||
| @@ -220,6 +230,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         Ok(true) | ||||
|     } | ||||
|  | ||||
|     fn update_distinct_attribute(&mut self) -> anyhow::Result<bool> { | ||||
|         match self.distinct_attribute { | ||||
|             Setting::Set(ref attr) => { | ||||
|                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; | ||||
|                 fields_ids_map | ||||
|                     .insert(attr) | ||||
|                     .context("field id limit exceeded")?; | ||||
|  | ||||
|                 self.index.put_distinct_attribute(self.wtxn, &attr)?; | ||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||
|             } | ||||
|             Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; }, | ||||
|             Setting::NotSet => return Ok(false), | ||||
|         } | ||||
|         Ok(true) | ||||
|     } | ||||
|  | ||||
|     /// Updates the index's searchable attributes. This causes the field map to be recomputed to | ||||
|     /// reflect the order of the searchable attributes. | ||||
|     fn update_searchable(&mut self) -> anyhow::Result<bool> { | ||||
| @@ -328,6 +355,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         self.update_displayed()?; | ||||
|         let stop_words_updated = self.update_stop_words()?; | ||||
|         let facets_updated = self.update_facets()?; | ||||
|         self.update_distinct_attribute()?; | ||||
|         // update_criteria MUST be called after update_facets, since criterion fields must be set | ||||
|         // as facets. | ||||
|         self.update_criteria()?; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user