mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Implement proximityPrecision setting on milli side
This commit is contained in:
		| @@ -21,6 +21,7 @@ use crate::heed_codec::facet::{ | ||||
| use crate::heed_codec::{ | ||||
|     BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, | ||||
| }; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::readable_slices::ReadableSlices; | ||||
| use crate::{ | ||||
|     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, | ||||
| @@ -72,6 +73,7 @@ pub mod main_key { | ||||
|     pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; | ||||
|     pub const SORT_FACET_VALUES_BY: &str = "sort-facet-values-by"; | ||||
|     pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; | ||||
|     pub const PROXIMITY_PRECISION: &str = "proximity-precision"; | ||||
| } | ||||
|  | ||||
| pub mod db_name { | ||||
| @@ -1466,6 +1468,28 @@ impl Index { | ||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::PAGINATION_MAX_TOTAL_HITS) | ||||
|     } | ||||
|  | ||||
|     pub fn proximity_precision(&self, txn: &RoTxn) -> heed::Result<Option<ProximityPrecision>> { | ||||
|         self.main | ||||
|             .remap_types::<Str, SerdeBincode<ProximityPrecision>>() | ||||
|             .get(txn, main_key::PROXIMITY_PRECISION) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn put_proximity_precision( | ||||
|         &self, | ||||
|         txn: &mut RwTxn, | ||||
|         val: ProximityPrecision, | ||||
|     ) -> heed::Result<()> { | ||||
|         self.main.remap_types::<Str, SerdeBincode<ProximityPrecision>>().put( | ||||
|             txn, | ||||
|             main_key::PROXIMITY_PRECISION, | ||||
|             &val, | ||||
|         ) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_proximity_precision(&self, txn: &mut RwTxn) -> heed::Result<bool> { | ||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION) | ||||
|     } | ||||
|  | ||||
|     /* script  language docids */ | ||||
|     /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. | ||||
|     pub fn script_language_documents_ids( | ||||
|   | ||||
| @@ -1,5 +1,7 @@ | ||||
| use std::cmp; | ||||
|  | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::{relative_from_absolute_position, Position}; | ||||
|  | ||||
| pub const MAX_DISTANCE: u32 = 4; | ||||
| @@ -25,3 +27,11 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { | ||||
| pub fn path_proximity(path: &[Position]) -> u32 { | ||||
|     path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>() | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub enum ProximityPrecision { | ||||
|     #[default] | ||||
|     WordScale, | ||||
|     AttributeScale, | ||||
| } | ||||
|   | ||||
| @@ -10,6 +10,7 @@ use roaring::RoaringBitmap; | ||||
| use super::interner::Interned; | ||||
| use super::Word; | ||||
| use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | ||||
| use crate::{ | ||||
|     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, | ||||
| @@ -263,18 +264,67 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         word2: Interned<String>, | ||||
|         proximity: u8, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         // TODO: if database is empty, search if the word are in the same attribute instead | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|             self.txn, | ||||
|             (proximity, word1, word2), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(word1).as_str(), | ||||
|                 self.word_interner.get(word2).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.word_pair_proximity_docids, | ||||
|             self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|         match self.index.proximity_precision(self.txn)?.unwrap_or_default() { | ||||
|             ProximityPrecision::AttributeScale => { | ||||
|                 // Force proximity to 0 because: | ||||
|                 // in AttributeScale, there are only 2 possible distances: | ||||
|                 // 1. words in same attribute: in that the DB contains (0, word1, word2) | ||||
|                 // 2. words in different attributes: no DB entry for these two words. | ||||
|                 let proximity = 0; | ||||
|                 let docids = if let Some(docids) = | ||||
|                     self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2)) | ||||
|                 { | ||||
|                     docids | ||||
|                         .as_ref() | ||||
|                         .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d)) | ||||
|                         .transpose() | ||||
|                         .map_err(heed::Error::Decoding)? | ||||
|                 } else { | ||||
|                     // Compute the distance at the attribute level and store it in the cache. | ||||
|                     let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { | ||||
|                         fids | ||||
|                     } else { | ||||
|                         self.index.fields_ids_map(self.txn)?.ids().collect() | ||||
|                     }; | ||||
|                     let mut docids = RoaringBitmap::new(); | ||||
|                     for fid in fids { | ||||
|                         // for each field, intersect left word bitmap and right word bitmap, | ||||
|                         // then merge the result in a global bitmap before storing it in the cache. | ||||
|                         let word1_docids = self.get_db_word_fid_docids(word1, fid)?; | ||||
|                         let word2_docids = self.get_db_word_fid_docids(word2, fid)?; | ||||
|                         if let (Some(word1_docids), Some(word2_docids)) = | ||||
|                             (word1_docids, word2_docids) | ||||
|                         { | ||||
|                             docids |= word1_docids & word2_docids; | ||||
|                         } | ||||
|                     } | ||||
|                     let encoded = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||
|                         .map(Cow::into_owned) | ||||
|                         .map(Cow::Owned) | ||||
|                         .map(Some) | ||||
|                         .map_err(heed::Error::Decoding)?; | ||||
|                     self.db_cache | ||||
|                         .word_pair_proximity_docids | ||||
|                         .insert((proximity, word1, word2), encoded); | ||||
|                     Some(docids) | ||||
|                 }; | ||||
|  | ||||
|                 Ok(docids) | ||||
|             } | ||||
|             ProximityPrecision::WordScale => { | ||||
|                 DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                     self.txn, | ||||
|                     (proximity, word1, word2), | ||||
|                     &( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(word2).as_str(), | ||||
|                     ), | ||||
|                     &mut self.db_cache.word_pair_proximity_docids, | ||||
|                     self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_pair_proximity_docids_len( | ||||
| @@ -283,56 +333,95 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         word2: Interned<String>, | ||||
|         proximity: u8, | ||||
|     ) -> Result<Option<u64>> { | ||||
|         // TODO: if database is empty, search if the word are in the same attribute instead | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( | ||||
|             self.txn, | ||||
|             (proximity, word1, word2), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(word1).as_str(), | ||||
|                 self.word_interner.get(word2).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.word_pair_proximity_docids, | ||||
|             self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|         match self.index.proximity_precision(self.txn)?.unwrap_or_default() { | ||||
|             ProximityPrecision::AttributeScale => Ok(self | ||||
|                 .get_db_word_pair_proximity_docids(word1, word2, proximity)? | ||||
|                 .map(|d| d.len())), | ||||
|             ProximityPrecision::WordScale => { | ||||
|                 DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( | ||||
|                     self.txn, | ||||
|                     (proximity, word1, word2), | ||||
|                     &( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(word2).as_str(), | ||||
|                     ), | ||||
|                     &mut self.db_cache.word_pair_proximity_docids, | ||||
|                     self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_prefix_pair_proximity_docids( | ||||
|         &mut self, | ||||
|         word1: Interned<String>, | ||||
|         prefix2: Interned<String>, | ||||
|         proximity: u8, | ||||
|         mut proximity: u8, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         // TODO: if database is empty, search if the word are in the same attribute instead | ||||
|         let docids = match self | ||||
|             .db_cache | ||||
|             .word_prefix_pair_proximity_docids | ||||
|             .entry((proximity, word1, prefix2)) | ||||
|         { | ||||
|             Entry::Occupied(docids) => docids.get().clone(), | ||||
|             Entry::Vacant(entry) => { | ||||
|                 // compute docids using prefix iter and store the result in the cache. | ||||
|                 let key = U8StrStrCodec::bytes_encode(&( | ||||
|                     proximity, | ||||
|                     self.word_interner.get(word1).as_str(), | ||||
|                     self.word_interner.get(prefix2).as_str(), | ||||
|                 )) | ||||
|                 .unwrap() | ||||
|                 .into_owned(); | ||||
|                 let mut prefix_docids = RoaringBitmap::new(); | ||||
|                 let remap_key_type = self | ||||
|                     .index | ||||
|                     .word_pair_proximity_docids | ||||
|                     .remap_key_type::<Bytes>() | ||||
|                     .prefix_iter(self.txn, &key)?; | ||||
|                 for result in remap_key_type { | ||||
|                     let (_, docids) = result?; | ||||
|         let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default(); | ||||
|         if proximity_precision == ProximityPrecision::AttributeScale { | ||||
|             // Force proximity to 0 because: | ||||
|             // in AttributeScale, there are only 2 possible distances: | ||||
|             // 1. words in same attribute: in that the DB contains (0, word1, word2) | ||||
|             // 2. words in different attributes: no DB entry for these two words. | ||||
|             proximity = 0; | ||||
|         } | ||||
|  | ||||
|                     prefix_docids |= docids; | ||||
|         let docids = if let Some(docids) = | ||||
|             self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2)) | ||||
|         { | ||||
|             docids.clone() | ||||
|         } else { | ||||
|             let prefix_docids = match proximity_precision { | ||||
|                 ProximityPrecision::AttributeScale => { | ||||
|                     // Compute the distance at the attribute level and store it in the cache. | ||||
|                     let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { | ||||
|                         fids | ||||
|                     } else { | ||||
|                         self.index.fields_ids_map(self.txn)?.ids().collect() | ||||
|                     }; | ||||
|                     let mut prefix_docids = RoaringBitmap::new(); | ||||
|                     // for each field, intersect left word bitmap and right word bitmap, | ||||
|                     // then merge the result in a global bitmap before storing it in the cache. | ||||
|                     for fid in fids { | ||||
|                         let word1_docids = self.get_db_word_fid_docids(word1, fid)?; | ||||
|                         let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; | ||||
|                         if let (Some(word1_docids), Some(prefix2_docids)) = | ||||
|                             (word1_docids, prefix2_docids) | ||||
|                         { | ||||
|                             prefix_docids |= word1_docids & prefix2_docids; | ||||
|                         } | ||||
|                     } | ||||
|                     prefix_docids | ||||
|                 } | ||||
|                 entry.insert(Some(prefix_docids.clone())); | ||||
|                 Some(prefix_docids) | ||||
|             } | ||||
|                 ProximityPrecision::WordScale => { | ||||
|                     // compute docids using prefix iter and store the result in the cache. | ||||
|                     let key = U8StrStrCodec::bytes_encode(&( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(prefix2).as_str(), | ||||
|                     )) | ||||
|                     .unwrap() | ||||
|                     .into_owned(); | ||||
|                     let mut prefix_docids = RoaringBitmap::new(); | ||||
|                     let remap_key_type = self | ||||
|                         .index | ||||
|                         .word_pair_proximity_docids | ||||
|                         .remap_key_type::<Bytes>() | ||||
|                         .prefix_iter(self.txn, &key)?; | ||||
|                     for result in remap_key_type { | ||||
|                         let (_, docids) = result?; | ||||
|  | ||||
|                         prefix_docids |= docids; | ||||
|                     } | ||||
|                     prefix_docids | ||||
|                 } | ||||
|             }; | ||||
|             self.db_cache | ||||
|                 .word_prefix_pair_proximity_docids | ||||
|                 .insert((proximity, word1, prefix2), Some(prefix_docids.clone())); | ||||
|             Some(prefix_docids) | ||||
|         }; | ||||
|         Ok(docids) | ||||
|     } | ||||
|   | ||||
| @@ -32,6 +32,7 @@ use super::helpers::{ | ||||
|     MergeFn, MergeableReader, | ||||
| }; | ||||
| use super::{helpers, TypedChunk}; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extract data for each databases from obkv documents in parallel. | ||||
| @@ -52,7 +53,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|     dictionary: Option<&[&str]>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
|     exact_attributes: HashSet<FieldId>, | ||||
|     // TODO: add a proximity database deactivation parameter. | ||||
|     proximity_precision: ProximityPrecision, | ||||
| ) -> Result<()> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
| @@ -151,16 +152,17 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     // TODO: Skip this part if deactivated | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_pair_proximity_docids, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordPairProximityDocids, | ||||
|         "word-pair-proximity-docids", | ||||
|     ); | ||||
|     if proximity_precision == ProximityPrecision::WordScale { | ||||
|         spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|             docid_word_positions_chunks.clone(), | ||||
|             indexer, | ||||
|             lmdb_writer_sx.clone(), | ||||
|             extract_word_pair_proximity_docids, | ||||
|             merge_deladd_cbo_roaring_bitmaps, | ||||
|             TypedChunk::WordPairProximityDocids, | ||||
|             "word-pair-proximity-docids", | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|   | ||||
| @@ -352,6 +352,7 @@ where | ||||
|         let dictionary: Option<Vec<_>> = | ||||
|             dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); | ||||
|         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; | ||||
|         let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); | ||||
|  | ||||
|         let pool_params = GrenadParameters { | ||||
|             chunk_compression_type: self.indexer_config.chunk_compression_type, | ||||
| @@ -392,6 +393,7 @@ where | ||||
|                     dictionary.as_deref(), | ||||
|                     max_positions_per_attributes, | ||||
|                     exact_attributes, | ||||
|                     proximity_precision, | ||||
|                 ) | ||||
|             }); | ||||
|  | ||||
|   | ||||
| @@ -12,6 +12,7 @@ use super::IndexerConfig; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::error::UserError; | ||||
| use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::update::index_documents::IndexDocumentsMethod; | ||||
| use crate::update::{IndexDocuments, UpdateIndexingStep}; | ||||
| use crate::{FieldsIdsMap, Index, OrderBy, Result}; | ||||
| @@ -127,7 +128,7 @@ pub struct Settings<'a, 't, 'i> { | ||||
|     max_values_per_facet: Setting<usize>, | ||||
|     sort_facet_values_by: Setting<HashMap<String, OrderBy>>, | ||||
|     pagination_max_total_hits: Setting<usize>, | ||||
|     // TODO: add a proximity database deactivation attribute. | ||||
|     proximity_precision: Setting<ProximityPrecision>, | ||||
| } | ||||
|  | ||||
| impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
| @@ -159,6 +160,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|             max_values_per_facet: Setting::NotSet, | ||||
|             sort_facet_values_by: Setting::NotSet, | ||||
|             pagination_max_total_hits: Setting::NotSet, | ||||
|             proximity_precision: Setting::NotSet, | ||||
|             indexer_config, | ||||
|         } | ||||
|     } | ||||
| @@ -333,6 +335,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         self.pagination_max_total_hits = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_proximity_precision(&mut self, value: ProximityPrecision) { | ||||
|         self.proximity_precision = Setting::Set(value); | ||||
|     } | ||||
|  | ||||
|     pub fn reset_proximity_precision(&mut self) { | ||||
|         self.proximity_precision = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     fn reindex<FP, FA>( | ||||
|         &mut self, | ||||
|         progress_callback: &FP, | ||||
| @@ -862,6 +872,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn update_proximity_precision(&mut self) -> Result<bool> { | ||||
|         let changed = match self.proximity_precision { | ||||
|             Setting::Set(new) => { | ||||
|                 let old = self.index.proximity_precision(self.wtxn)?; | ||||
|                 if old == Some(new) { | ||||
|                     false | ||||
|                 } else { | ||||
|                     self.index.put_proximity_precision(self.wtxn, new)?; | ||||
|                     true | ||||
|                 } | ||||
|             } | ||||
|             Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, | ||||
|             Setting::NotSet => false, | ||||
|         }; | ||||
|  | ||||
|         Ok(changed) | ||||
|     } | ||||
|  | ||||
|     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()> | ||||
|     where | ||||
|         FP: Fn(UpdateIndexingStep) + Sync, | ||||
| @@ -898,6 +926,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         let synonyms_updated = self.update_synonyms()?; | ||||
|         let searchable_updated = self.update_searchable()?; | ||||
|         let exact_attributes_updated = self.update_exact_attributes()?; | ||||
|         let proximity_precision = self.update_proximity_precision()?; | ||||
|  | ||||
|         if stop_words_updated | ||||
|             || non_separator_tokens_updated | ||||
| @@ -907,7 +936,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|             || synonyms_updated | ||||
|             || searchable_updated | ||||
|             || exact_attributes_updated | ||||
|         // TODO: reindex if proximity database is activated | ||||
|             || proximity_precision | ||||
|         { | ||||
|             self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; | ||||
|         } | ||||
| @@ -1733,6 +1762,7 @@ mod tests { | ||||
|                     max_values_per_facet, | ||||
|                     sort_facet_values_by, | ||||
|                     pagination_max_total_hits, | ||||
|                     proximity_precision, | ||||
|                 } = settings; | ||||
|                 assert!(matches!(searchable_fields, Setting::NotSet)); | ||||
|                 assert!(matches!(displayed_fields, Setting::NotSet)); | ||||
| @@ -1754,6 +1784,7 @@ mod tests { | ||||
|                 assert!(matches!(max_values_per_facet, Setting::NotSet)); | ||||
|                 assert!(matches!(sort_facet_values_by, Setting::NotSet)); | ||||
|                 assert!(matches!(pagination_max_total_hits, Setting::NotSet)); | ||||
|                 assert!(matches!(proximity_precision, Setting::NotSet)); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user