mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Merge #535
535: Reintroduce the max values by facet limit r=ManyTheFish a=Kerollmops This PR reintroduces the max values by facet limit; it is related to https://github.com/meilisearch/meilisearch/issues/2349. ~I would like some help deciding whether to keep the default of 100 max values in milli and set up the `FacetDistribution` settings in Meilisearch to use 1000 as the new value; I expose `max_values_by_facet` for this purpose.~ I changed the default value to 1000 and the max to 10000. Thank you `@ManyTheFish` for the help! Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
		| @@ -1,6 +1,6 @@ | |||||||
| use std::collections::{BTreeMap, HashSet}; | use std::collections::{BTreeMap, HashSet}; | ||||||
| use std::ops::Bound::Unbounded; | use std::ops::Bound::Unbounded; | ||||||
| use std::{fmt, mem}; | use std::{cmp, fmt, mem}; | ||||||
|  |  | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| @@ -13,6 +13,14 @@ use crate::heed_codec::facet::{ | |||||||
| use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; | use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; | ||||||
| use crate::{FieldId, Index, Result}; | use crate::{FieldId, Index, Result}; | ||||||
|  |  | ||||||
|  | /// The default number of values by facets that will | ||||||
|  | /// be fetched from the key-value store. | ||||||
|  | const DEFAULT_VALUES_BY_FACET: usize = 1000; | ||||||
|  |  | ||||||
|  | /// The hard limit in the number of values by facets that will be fetched from | ||||||
|  | /// the key-value store. Searching for more values could slow down the engine. | ||||||
|  | const MAX_VALUES_BY_FACET: usize = 10000; | ||||||
|  |  | ||||||
| /// Threshold on the number of candidates that will make | /// Threshold on the number of candidates that will make | ||||||
| /// the system to choose between one algorithm or another. | /// the system to choose between one algorithm or another. | ||||||
| const CANDIDATES_THRESHOLD: u64 = 3000; | const CANDIDATES_THRESHOLD: u64 = 3000; | ||||||
| @@ -20,13 +28,20 @@ const CANDIDATES_THRESHOLD: u64 = 3000; | |||||||
| pub struct FacetDistribution<'a> { | pub struct FacetDistribution<'a> { | ||||||
|     facets: Option<HashSet<String>>, |     facets: Option<HashSet<String>>, | ||||||
|     candidates: Option<RoaringBitmap>, |     candidates: Option<RoaringBitmap>, | ||||||
|  |     max_values_by_facet: usize, | ||||||
|     rtxn: &'a heed::RoTxn<'a>, |     rtxn: &'a heed::RoTxn<'a>, | ||||||
|     index: &'a Index, |     index: &'a Index, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> FacetDistribution<'a> { | impl<'a> FacetDistribution<'a> { | ||||||
|     pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { |     pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { | ||||||
|         FacetDistribution { facets: None, candidates: None, rtxn, index } |         FacetDistribution { | ||||||
|  |             facets: None, | ||||||
|  |             candidates: None, | ||||||
|  |             max_values_by_facet: DEFAULT_VALUES_BY_FACET, | ||||||
|  |             rtxn, | ||||||
|  |             index, | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self { |     pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self { | ||||||
| @@ -34,6 +49,11 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { | ||||||
|  |         self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { |     pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { | ||||||
|         self.candidates = Some(candidates); |         self.candidates = Some(candidates); | ||||||
|         self |         self | ||||||
| @@ -52,6 +72,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             FacetType::Number => { |             FacetType::Number => { | ||||||
|                 let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); |                 let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); | ||||||
|  |  | ||||||
|  |                 let distribution_prelength = distribution.len(); | ||||||
|                 let db = self.index.field_id_docid_facet_f64s; |                 let db = self.index.field_id_docid_facet_f64s; | ||||||
|                 for docid in candidates.into_iter() { |                 for docid in candidates.into_iter() { | ||||||
|                     key_buffer.truncate(mem::size_of::<FieldId>()); |                     key_buffer.truncate(mem::size_of::<FieldId>()); | ||||||
| @@ -64,6 +85,10 @@ impl<'a> FacetDistribution<'a> { | |||||||
|                     for result in iter { |                     for result in iter { | ||||||
|                         let ((_, _, value), ()) = result?; |                         let ((_, _, value), ()) = result?; | ||||||
|                         *distribution.entry(value.to_string()).or_insert(0) += 1; |                         *distribution.entry(value.to_string()).or_insert(0) += 1; | ||||||
|  |  | ||||||
|  |                         if distribution.len() - distribution_prelength == self.max_values_by_facet { | ||||||
|  |                             break; | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
| @@ -86,6 +111,10 @@ impl<'a> FacetDistribution<'a> { | |||||||
|                             .entry(normalized_value) |                             .entry(normalized_value) | ||||||
|                             .or_insert_with(|| (original_value, 0)); |                             .or_insert_with(|| (original_value, 0)); | ||||||
|                         *count += 1; |                         *count += 1; | ||||||
|  |  | ||||||
|  |                         if normalized_distribution.len() == self.max_values_by_facet { | ||||||
|  |                             break; | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
| @@ -116,6 +145,9 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             if !docids.is_empty() { |             if !docids.is_empty() { | ||||||
|                 distribution.insert(value.to_string(), docids.len()); |                 distribution.insert(value.to_string(), docids.len()); | ||||||
|             } |             } | ||||||
|  |             if distribution.len() == self.max_values_by_facet { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -136,6 +168,9 @@ impl<'a> FacetDistribution<'a> { | |||||||
|             if !docids.is_empty() { |             if !docids.is_empty() { | ||||||
|                 distribution.insert(original.to_string(), docids.len()); |                 distribution.insert(original.to_string(), docids.len()); | ||||||
|             } |             } | ||||||
|  |             if distribution.len() == self.max_values_by_facet { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -155,6 +190,9 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         for result in range { |         for result in range { | ||||||
|             let ((_, _, value, _), docids) = result?; |             let ((_, _, value, _), docids) = result?; | ||||||
|             distribution.insert(value.to_string(), docids.len()); |             distribution.insert(value.to_string(), docids.len()); | ||||||
|  |             if distribution.len() == self.max_values_by_facet { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let iter = self |         let iter = self | ||||||
| @@ -168,6 +206,9 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         for result in iter { |         for result in iter { | ||||||
|             let ((_, normalized_value), (original_value, docids)) = result?; |             let ((_, normalized_value), (original_value, docids)) = result?; | ||||||
|             normalized_distribution.insert(normalized_value, (original_value, docids.len())); |             normalized_distribution.insert(normalized_value, (original_value, docids.len())); | ||||||
|  |             if normalized_distribution.len() == self.max_values_by_facet { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let iter = normalized_distribution |         let iter = normalized_distribution | ||||||
| @@ -253,11 +294,12 @@ impl<'a> FacetDistribution<'a> { | |||||||
|  |  | ||||||
| impl fmt::Debug for FacetDistribution<'_> { | impl fmt::Debug for FacetDistribution<'_> { | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|         let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; |         let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; | ||||||
|  |  | ||||||
|         f.debug_struct("FacetDistribution") |         f.debug_struct("FacetDistribution") | ||||||
|             .field("facets", facets) |             .field("facets", facets) | ||||||
|             .field("candidates", candidates) |             .field("candidates", candidates) | ||||||
|  |             .field("max_values_by_facet", max_values_by_facet) | ||||||
|             .finish() |             .finish() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user