mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Remove word pair proximity prefix cache and compute it at search time
This commit is contained in:
		| @@ -83,8 +83,6 @@ pub mod db_name { | |||||||
|     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; |     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; | ||||||
|     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; |     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; | ||||||
|     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; |     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; | ||||||
|     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; |  | ||||||
|     pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; |  | ||||||
|     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; |     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; | ||||||
|     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; |     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; | ||||||
|     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; |     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; | ||||||
| @@ -129,10 +127,6 @@ pub struct Index { | |||||||
|  |  | ||||||
|     /// Maps the proximity between a pair of words with all the docids where this relation appears. |     /// Maps the proximity between a pair of words with all the docids where this relation appears. | ||||||
|     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, |     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||||
|     /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. |  | ||||||
|     pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. |  | ||||||
|     pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|  |  | ||||||
|     /// Maps the word and the position with the docids that corresponds to it. |     /// Maps the word and the position with the docids that corresponds to it. | ||||||
|     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, |     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
| @@ -186,7 +180,7 @@ impl Index { | |||||||
|     ) -> Result<Index> { |     ) -> Result<Index> { | ||||||
|         use db_name::*; |         use db_name::*; | ||||||
|  |  | ||||||
|         options.max_dbs(26); |         options.max_dbs(24); | ||||||
|         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; |         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; | ||||||
|  |  | ||||||
|         let env = options.open(path)?; |         let env = options.open(path)?; | ||||||
| @@ -203,10 +197,6 @@ impl Index { | |||||||
|             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; |             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; | ||||||
|         let script_language_docids = |         let script_language_docids = | ||||||
|             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; |             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; | ||||||
|         let word_prefix_pair_proximity_docids = |  | ||||||
|             env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; |  | ||||||
|         let prefix_word_pair_proximity_docids = |  | ||||||
|             env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; |  | ||||||
|         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; |         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; | ||||||
|         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; |         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; | ||||||
|         let field_id_word_count_docids = |         let field_id_word_count_docids = | ||||||
| @@ -247,8 +237,6 @@ impl Index { | |||||||
|             exact_word_prefix_docids, |             exact_word_prefix_docids, | ||||||
|             word_pair_proximity_docids, |             word_pair_proximity_docids, | ||||||
|             script_language_docids, |             script_language_docids, | ||||||
|             word_prefix_pair_proximity_docids, |  | ||||||
|             prefix_word_pair_proximity_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
|   | |||||||
| @@ -11,7 +11,9 @@ use super::interner::Interned; | |||||||
| use super::Word; | use super::Word; | ||||||
| use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | ||||||
| use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | ||||||
| use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext}; | use crate::{ | ||||||
|  |     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, | ||||||
|  | }; | ||||||
|  |  | ||||||
| /// A cache storing pointers to values in the LMDB databases. | /// A cache storing pointers to values in the LMDB databases. | ||||||
| /// | /// | ||||||
| @@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> { | |||||||
|     pub word_pair_proximity_docids: |     pub word_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||||
|     pub word_prefix_pair_proximity_docids: |     pub word_prefix_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>, | ||||||
|     pub prefix_word_pair_proximity_docids: |     pub prefix_word_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||||
|     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>, |     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>, | ||||||
| @@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|         prefix2: Interned<String>, |         prefix2: Interned<String>, | ||||||
|         proximity: u8, |         proximity: u8, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( |         let docids = match self | ||||||
|             self.txn, |             .db_cache | ||||||
|             (proximity, word1, prefix2), |             .word_prefix_pair_proximity_docids | ||||||
|             &( |             .entry((proximity, word1, prefix2)) | ||||||
|                 proximity, |         { | ||||||
|                 self.word_interner.get(word1).as_str(), |             Entry::Occupied(docids) => docids.get().clone(), | ||||||
|                 self.word_interner.get(prefix2).as_str(), |             Entry::Vacant(entry) => { | ||||||
|             ), |                 // compute docids using prefix iter and store the result in the cache. | ||||||
|             &mut self.db_cache.word_prefix_pair_proximity_docids, |                 let key = U8StrStrCodec::bytes_encode(&( | ||||||
|             self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(), |                     proximity, | ||||||
|         ) |                     self.word_interner.get(word1).as_str(), | ||||||
|  |                     self.word_interner.get(prefix2).as_str(), | ||||||
|  |                 )) | ||||||
|  |                 .unwrap() | ||||||
|  |                 .into_owned(); | ||||||
|  |                 let mut prefix_docids = RoaringBitmap::new(); | ||||||
|  |                 let remap_key_type = self | ||||||
|  |                     .index | ||||||
|  |                     .word_pair_proximity_docids | ||||||
|  |                     .remap_key_type::<ByteSlice>() | ||||||
|  |                     .prefix_iter(self.txn, &key)?; | ||||||
|  |                 for result in remap_key_type { | ||||||
|  |                     let (_, docids) = result?; | ||||||
|  |  | ||||||
|  |                     prefix_docids |= docids; | ||||||
|  |                 } | ||||||
|  |                 entry.insert(Some(prefix_docids.clone())); | ||||||
|  |                 Some(prefix_docids) | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |         Ok(docids) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get_db_prefix_word_pair_proximity_docids( |     pub fn get_db_prefix_word_pair_proximity_docids( | ||||||
|         &mut self, |         &mut self, | ||||||
|         left_prefix: Interned<String>, |         left_prefix: Interned<String>, | ||||||
|         right: Interned<String>, |         right: Interned<String>, | ||||||
|         proximity: u8, |         proximity: u8, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( |         // only accept exact matches on reverted positions | ||||||
|             self.txn, |         self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) | ||||||
|             (proximity, left_prefix, right), |  | ||||||
|             &( |  | ||||||
|                 proximity, |  | ||||||
|                 self.word_interner.get(left_prefix).as_str(), |  | ||||||
|                 self.word_interner.get(right).as_str(), |  | ||||||
|             ), |  | ||||||
|             &mut self.db_cache.prefix_word_pair_proximity_docids, |  | ||||||
|             self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(), |  | ||||||
|         ) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get_db_word_fid_docids( |     pub fn get_db_word_fid_docids( | ||||||
|   | |||||||
| @@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|             word_prefix_docids, |             word_prefix_docids, | ||||||
|             exact_word_prefix_docids, |             exact_word_prefix_docids, | ||||||
|             word_pair_proximity_docids, |             word_pair_proximity_docids, | ||||||
|             word_prefix_pair_proximity_docids, |  | ||||||
|             prefix_word_pair_proximity_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|             field_id_word_count_docids, |             field_id_word_count_docids, | ||||||
| @@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         word_prefix_docids.clear(self.wtxn)?; |         word_prefix_docids.clear(self.wtxn)?; | ||||||
|         exact_word_prefix_docids.clear(self.wtxn)?; |         exact_word_prefix_docids.clear(self.wtxn)?; | ||||||
|         word_pair_proximity_docids.clear(self.wtxn)?; |         word_pair_proximity_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; |  | ||||||
|         prefix_word_pair_proximity_docids.clear(self.wtxn)?; |  | ||||||
|         word_position_docids.clear(self.wtxn)?; |         word_position_docids.clear(self.wtxn)?; | ||||||
|         word_fid_docids.clear(self.wtxn)?; |         word_fid_docids.clear(self.wtxn)?; | ||||||
|         field_id_word_count_docids.clear(self.wtxn)?; |         field_id_word_count_docids.clear(self.wtxn)?; | ||||||
| @@ -132,7 +128,6 @@ mod tests { | |||||||
|         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); |         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); |         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); |         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); |  | ||||||
|         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); |         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); |         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); |         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); | ||||||
|   | |||||||
| @@ -35,13 +35,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; | |||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||||
| use crate::update::{ | use crate::update::{ | ||||||
|     IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, |     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, | ||||||
|     WordPrefixIntegerDocids, WordsPrefixesFst, |  | ||||||
| }; | }; | ||||||
| use crate::{CboRoaringBitmapCodec, Index, Result}; | use crate::{CboRoaringBitmapCodec, Index, Result}; | ||||||
|  |  | ||||||
| static MERGED_DATABASE_COUNT: usize = 7; | static MERGED_DATABASE_COUNT: usize = 7; | ||||||
| static PREFIX_DATABASE_COUNT: usize = 5; | static PREFIX_DATABASE_COUNT: usize = 4; | ||||||
| static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; | static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | ||||||
| @@ -381,7 +380,6 @@ where | |||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         let mut word_pair_proximity_docids = None; |  | ||||||
|         let mut word_position_docids = None; |         let mut word_position_docids = None; | ||||||
|         let mut word_fid_docids = None; |         let mut word_fid_docids = None; | ||||||
|         let mut word_docids = None; |         let mut word_docids = None; | ||||||
| @@ -411,11 +409,6 @@ where | |||||||
|                         word_fid_docids_reader, |                         word_fid_docids_reader, | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 TypedChunk::WordPairProximityDocids(chunk) => { |  | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; |  | ||||||
|                     word_pair_proximity_docids = Some(cloneable_chunk); |  | ||||||
|                     TypedChunk::WordPairProximityDocids(chunk) |  | ||||||
|                 } |  | ||||||
|                 TypedChunk::WordPositionDocids(chunk) => { |                 TypedChunk::WordPositionDocids(chunk) => { | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; |                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; | ||||||
|                     word_position_docids = Some(cloneable_chunk); |                     word_position_docids = Some(cloneable_chunk); | ||||||
| @@ -458,7 +451,6 @@ where | |||||||
|         self.execute_prefix_databases( |         self.execute_prefix_databases( | ||||||
|             word_docids, |             word_docids, | ||||||
|             exact_word_docids, |             exact_word_docids, | ||||||
|             word_pair_proximity_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|         )?; |         )?; | ||||||
| @@ -471,7 +463,6 @@ where | |||||||
|         self, |         self, | ||||||
|         word_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>, |         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>, |  | ||||||
|         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|     ) -> Result<()> |     ) -> Result<()> | ||||||
| @@ -592,32 +583,6 @@ where | |||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { |  | ||||||
|             // Run the word prefix pair proximity docids update operation. |  | ||||||
|             PrefixWordPairsProximityDocids::new( |  | ||||||
|                 self.wtxn, |  | ||||||
|                 self.index, |  | ||||||
|                 self.indexer_config.chunk_compression_type, |  | ||||||
|                 self.indexer_config.chunk_compression_level, |  | ||||||
|             ) |  | ||||||
|             .execute( |  | ||||||
|                 word_pair_proximity_docids, |  | ||||||
|                 &new_prefix_fst_words, |  | ||||||
|                 &common_prefix_fst_words, |  | ||||||
|                 &del_prefix_fst_words, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if (self.should_abort)() { |  | ||||||
|             return Err(Error::InternalError(InternalError::AbortedIndexation)); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         databases_seen += 1; |  | ||||||
|         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { |  | ||||||
|             databases_seen, |  | ||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |  | ||||||
|         }); |  | ||||||
|  |  | ||||||
|         if let Some(word_position_docids) = word_position_docids { |         if let Some(word_position_docids) = word_position_docids { | ||||||
|             // Run the words prefix position docids update operation. |             // Run the words prefix position docids update operation. | ||||||
|             let mut builder = WordPrefixIntegerDocids::new( |             let mut builder = WordPrefixIntegerDocids::new( | ||||||
|   | |||||||
| @@ -8,10 +8,6 @@ pub use self::index_documents::{ | |||||||
|     MergeFn, |     MergeFn, | ||||||
| }; | }; | ||||||
| pub use self::indexer_config::IndexerConfig; | pub use self::indexer_config::IndexerConfig; | ||||||
| pub use self::prefix_word_pairs::{ |  | ||||||
|     PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|     MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
| }; |  | ||||||
| pub use self::settings::{Setting, Settings}; | pub use self::settings::{Setting, Settings}; | ||||||
| pub use self::update_step::UpdateIndexingStep; | pub use self::update_step::UpdateIndexingStep; | ||||||
| pub use self::word_prefix_docids::WordPrefixDocids; | pub use self::word_prefix_docids::WordPrefixDocids; | ||||||
| @@ -24,7 +20,6 @@ pub(crate) mod del_add; | |||||||
| pub(crate) mod facet; | pub(crate) mod facet; | ||||||
| mod index_documents; | mod index_documents; | ||||||
| mod indexer_config; | mod indexer_config; | ||||||
| mod prefix_word_pairs; |  | ||||||
| mod settings; | mod settings; | ||||||
| mod update_step; | mod update_step; | ||||||
| mod word_prefix_docids; | mod word_prefix_docids; | ||||||
|   | |||||||
| @@ -1,418 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::HashSet; |  | ||||||
| use std::io::{BufReader, BufWriter}; |  | ||||||
|  |  | ||||||
| use grenad::CompressionType; |  | ||||||
| use heed::types::ByteSlice; |  | ||||||
|  |  | ||||||
| use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; |  | ||||||
| use crate::{Index, Result}; |  | ||||||
|  |  | ||||||
| mod prefix_word; |  | ||||||
| mod word_prefix; |  | ||||||
|  |  | ||||||
| pub use prefix_word::index_prefix_word_database; |  | ||||||
| pub use word_prefix::index_word_prefix_database; |  | ||||||
|  |  | ||||||
| pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; |  | ||||||
| pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; |  | ||||||
|  |  | ||||||
| pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { |  | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |  | ||||||
|     index: &'i Index, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     max_prefix_length: usize, |  | ||||||
|     chunk_compression_type: CompressionType, |  | ||||||
|     chunk_compression_level: Option<u32>, |  | ||||||
| } |  | ||||||
| impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { |  | ||||||
|     pub fn new( |  | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |  | ||||||
|         index: &'i Index, |  | ||||||
|         chunk_compression_type: CompressionType, |  | ||||||
|         chunk_compression_level: Option<u32>, |  | ||||||
|     ) -> Self { |  | ||||||
|         Self { |  | ||||||
|             wtxn, |  | ||||||
|             index, |  | ||||||
|             max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|             max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|             chunk_compression_type, |  | ||||||
|             chunk_compression_level, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[logging_timer::time("WordPrefixPairProximityDocids::{}")] |  | ||||||
|     pub fn execute<'a>( |  | ||||||
|         self, |  | ||||||
|         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, |  | ||||||
|         new_prefix_fst_words: &'a [String], |  | ||||||
|         common_prefix_fst_words: &[&'a [String]], |  | ||||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         puffin::profile_function!(); |  | ||||||
|  |  | ||||||
|         index_word_prefix_database( |  | ||||||
|             self.wtxn, |  | ||||||
|             self.index.word_pair_proximity_docids, |  | ||||||
|             self.index.word_prefix_pair_proximity_docids, |  | ||||||
|             self.max_proximity, |  | ||||||
|             self.max_prefix_length, |  | ||||||
|             new_word_pair_proximity_docids.clone(), |  | ||||||
|             new_prefix_fst_words, |  | ||||||
|             common_prefix_fst_words, |  | ||||||
|             del_prefix_fst_words, |  | ||||||
|             self.chunk_compression_type, |  | ||||||
|             self.chunk_compression_level, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         index_prefix_word_database( |  | ||||||
|             self.wtxn, |  | ||||||
|             self.index.word_pair_proximity_docids, |  | ||||||
|             self.index.prefix_word_pair_proximity_docids, |  | ||||||
|             self.max_proximity, |  | ||||||
|             self.max_prefix_length, |  | ||||||
|             new_word_pair_proximity_docids, |  | ||||||
|             new_prefix_fst_words, |  | ||||||
|             common_prefix_fst_words, |  | ||||||
|             del_prefix_fst_words, |  | ||||||
|             self.chunk_compression_type, |  | ||||||
|             self.chunk_compression_level, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // This is adapted from `sorter_into_lmdb_database` |  | ||||||
| pub fn insert_into_database( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     database: heed::PolyDatabase, |  | ||||||
|     new_key: &[u8], |  | ||||||
|     new_value: &[u8], |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; |  | ||||||
|     match iter.next().transpose()? { |  | ||||||
|         Some((key, old_val)) if new_key == key => { |  | ||||||
|             let val = |  | ||||||
|                 merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) |  | ||||||
|                     .map_err(|_| { |  | ||||||
|                         // TODO just wrap this error? |  | ||||||
|                         crate::error::InternalError::IndexingMergingKeys { |  | ||||||
|                             process: "get-put-merge", |  | ||||||
|                         } |  | ||||||
|                     })?; |  | ||||||
|             // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour |  | ||||||
|             unsafe { iter.put_current(new_key, &val)? }; |  | ||||||
|         } |  | ||||||
|         _ => { |  | ||||||
|             drop(iter); |  | ||||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, |  | ||||||
| // but it uses `append` if the database is empty, and it assumes that the values in the |  | ||||||
| // writer don't conflict with values in the database. |  | ||||||
| pub fn write_into_lmdb_database_without_merging( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     database: heed::PolyDatabase, |  | ||||||
|     writer: grenad::Writer<BufWriter<std::fs::File>>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; |  | ||||||
|     let reader = grenad::Reader::new(BufReader::new(file))?; |  | ||||||
|     if database.is_empty(wtxn)? { |  | ||||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; |  | ||||||
|         let mut cursor = reader.into_cursor()?; |  | ||||||
|         while let Some((k, v)) = cursor.move_on_next()? { |  | ||||||
|             // safety: the key comes from the grenad reader, not the database |  | ||||||
|             unsafe { out_iter.append(k, v)? }; |  | ||||||
|         } |  | ||||||
|     } else { |  | ||||||
|         let mut cursor = reader.into_cursor()?; |  | ||||||
|         while let Some((k, v)) = cursor.move_on_next()? { |  | ||||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use std::io::Cursor; |  | ||||||
|  |  | ||||||
|     use crate::db_snap; |  | ||||||
|     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; |  | ||||||
|     use crate::index::tests::TempIndex; |  | ||||||
|     use crate::update::IndexDocumentsMethod; |  | ||||||
|  |  | ||||||
|     fn documents_with_enough_different_words_for_prefixes( |  | ||||||
|         prefixes: &[&str], |  | ||||||
|         start_id: usize, |  | ||||||
|     ) -> Vec<crate::Object> { |  | ||||||
|         let mut documents = Vec::new(); |  | ||||||
|         let mut id = start_id; |  | ||||||
|         for prefix in prefixes { |  | ||||||
|             for i in 0..50 { |  | ||||||
|                 documents.push( |  | ||||||
|                     serde_json::json!({ |  | ||||||
|                         "id": id, |  | ||||||
|                         "text": format!("{prefix}{i:x}"), |  | ||||||
|                     }) |  | ||||||
|                     .as_object() |  | ||||||
|                     .unwrap() |  | ||||||
|                     .clone(), |  | ||||||
|                 ); |  | ||||||
|                 id += 1; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         documents |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn add_new_documents() { |  | ||||||
|         let mut index = TempIndex::new(); |  | ||||||
|         index.index_documents_config.words_prefix_threshold = Some(50); |  | ||||||
|         index.index_documents_config.autogenerate_docids = true; |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let batch_reader_from_documents = |documents| { |  | ||||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|             for object in documents { |  | ||||||
|                 builder.append_json_object(&object).unwrap(); |  | ||||||
|             } |  | ||||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); |  | ||||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": "9000", |  | ||||||
|                 "text": "At an amazing and beautiful house" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": "9001", |  | ||||||
|                 "text": "The bell rings at 5 am" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|  |  | ||||||
|         let documents = batch_reader_from_documents(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "initial"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "initial"); |  | ||||||
|  |  | ||||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": "9002", |  | ||||||
|                 "text": "At an extraordinary house" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|         let documents = batch_reader_from_documents(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, word_pair_proximity_docids, "update"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "update"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "update"); |  | ||||||
|     } |  | ||||||
|     #[test] |  | ||||||
|     fn batch_bug_3043() { |  | ||||||
|         // https://github.com/meilisearch/meilisearch/issues/3043 |  | ||||||
|         let mut index = TempIndex::new(); |  | ||||||
|         index.index_documents_config.words_prefix_threshold = Some(50); |  | ||||||
|         index.index_documents_config.autogenerate_docids = true; |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let batch_reader_from_documents = |documents| { |  | ||||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|             for object in documents { |  | ||||||
|                 builder.append_json_object(&object).unwrap(); |  | ||||||
|             } |  | ||||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); |  | ||||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "text": "x y" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "text": "x a y" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|  |  | ||||||
|         let documents = batch_reader_from_documents(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, word_pair_proximity_docids); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn hard_delete_and_reupdate() { |  | ||||||
|         let mut index = TempIndex::new(); |  | ||||||
|         index.index_documents_config.words_prefix_threshold = Some(50); |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_primary_key("id".to_owned()); |  | ||||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let batch_reader_from_documents = |documents| { |  | ||||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|             for object in documents { |  | ||||||
|                 builder.append_json_object(&object).unwrap(); |  | ||||||
|             } |  | ||||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); |  | ||||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": 9000, |  | ||||||
|                 "text": "At an amazing and beautiful house" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": 9001, |  | ||||||
|                 "text": "The bell rings at 5 am" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|  |  | ||||||
|         let documents = batch_reader_from_documents(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "initial"); |  | ||||||
|         db_snap!(index, word_docids, "initial"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "initial"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "initial"); |  | ||||||
|  |  | ||||||
|         index.delete_document("9000"); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "first_delete"); |  | ||||||
|         db_snap!(index, word_docids, "first_delete"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); |  | ||||||
|  |  | ||||||
|         index.delete_documents((0..50).map(|id| id.to_string()).collect()); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "second_delete"); |  | ||||||
|         db_snap!(index, word_docids, "second_delete"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); |  | ||||||
|  |  | ||||||
|         let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); |  | ||||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database |  | ||||||
|  |  | ||||||
|         index.add_documents(batch_reader_from_documents(documents)).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "reupdate"); |  | ||||||
|         db_snap!(index, word_docids, "reupdate"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn replace_hard_deletion() { |  | ||||||
|         let mut index = TempIndex::new(); |  | ||||||
|         index.index_documents_config.words_prefix_threshold = Some(50); |  | ||||||
|         index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_primary_key("id".to_owned()); |  | ||||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let batch_reader_from_documents = |documents| { |  | ||||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|             for object in documents { |  | ||||||
|                 builder.append_json_object(&object).unwrap(); |  | ||||||
|             } |  | ||||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); |  | ||||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": 9000, |  | ||||||
|                 "text": "At an amazing house" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|         documents.push( |  | ||||||
|             serde_json::json!({ |  | ||||||
|                 "id": 9001, |  | ||||||
|                 "text": "The bell rings" |  | ||||||
|             }) |  | ||||||
|             .as_object() |  | ||||||
|             .unwrap() |  | ||||||
|             .clone(), |  | ||||||
|         ); |  | ||||||
|  |  | ||||||
|         let documents = batch_reader_from_documents(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "initial"); |  | ||||||
|         db_snap!(index, word_docids, "initial"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "initial"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "initial"); |  | ||||||
|  |  | ||||||
|         let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); |  | ||||||
|         index.add_documents(batch_reader_from_documents(documents)).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, documents_ids, "replaced"); |  | ||||||
|         db_snap!(index, word_docids, "replaced"); |  | ||||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); |  | ||||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,182 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::{BTreeMap, HashSet}; |  | ||||||
|  |  | ||||||
| use grenad::CompressionType; |  | ||||||
| use heed::types::ByteSlice; |  | ||||||
| use heed::BytesDecode; |  | ||||||
| use log::debug; |  | ||||||
|  |  | ||||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; |  | ||||||
| use crate::update::prefix_word_pairs::{ |  | ||||||
|     insert_into_database, write_into_lmdb_database_without_merging, |  | ||||||
| }; |  | ||||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; |  | ||||||
|  |  | ||||||
| #[allow(clippy::too_many_arguments)] |  | ||||||
| #[logging_timer::time] |  | ||||||
| pub fn index_prefix_word_database( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     max_prefix_length: usize, |  | ||||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, |  | ||||||
|     new_prefix_fst_words: &[String], |  | ||||||
|     common_prefix_fst_words: &[&[String]], |  | ||||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, |  | ||||||
|     chunk_compression_type: CompressionType, |  | ||||||
|     chunk_compression_level: Option<u32>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     puffin::profile_function!(); |  | ||||||
|  |  | ||||||
|     let max_proximity = max_proximity - 1; |  | ||||||
|     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); |  | ||||||
|  |  | ||||||
|     let common_prefixes: Vec<_> = common_prefix_fst_words |  | ||||||
|         .iter() |  | ||||||
|         .flat_map(|s| s.iter()) |  | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .filter(|s| s.len() <= max_prefix_length) |  | ||||||
|         .collect(); |  | ||||||
|  |  | ||||||
|     for proximity in 1..max_proximity { |  | ||||||
|         for prefix in common_prefixes.iter() { |  | ||||||
|             let mut prefix_key = vec![proximity]; |  | ||||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); |  | ||||||
|             let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; |  | ||||||
|             // This is the core of the algorithm |  | ||||||
|             execute_on_word_pairs_and_prefixes( |  | ||||||
|                 proximity, |  | ||||||
|                 prefix.as_bytes(), |  | ||||||
|                 // the next two arguments tell how to iterate over the new word pairs |  | ||||||
|                 &mut cursor, |  | ||||||
|                 |cursor| { |  | ||||||
|                     if let Some((key, value)) = cursor.next()? { |  | ||||||
|                         let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) |  | ||||||
|                             .ok_or(heed::Error::Decoding)?; |  | ||||||
|                         Ok(Some((word2, value))) |  | ||||||
|                     } else { |  | ||||||
|                         Ok(None) |  | ||||||
|                     } |  | ||||||
|                 }, |  | ||||||
|                 // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) |  | ||||||
|                 |key, value| { |  | ||||||
|                     insert_into_database( |  | ||||||
|                         wtxn, |  | ||||||
|                         *prefix_word_pair_proximity_docids.as_polymorph(), |  | ||||||
|                         key, |  | ||||||
|                         value, |  | ||||||
|                     ) |  | ||||||
|                 }, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB |  | ||||||
|     let new_prefixes: Vec<_> = new_prefix_fst_words |  | ||||||
|         .iter() |  | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .filter(|s| s.len() <= max_prefix_length) |  | ||||||
|         .collect(); |  | ||||||
|  |  | ||||||
|     // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) |  | ||||||
|     // element in an intermediary grenad |  | ||||||
|     let mut writer = |  | ||||||
|         create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); |  | ||||||
|  |  | ||||||
|     for proximity in 1..max_proximity { |  | ||||||
|         for prefix in new_prefixes.iter() { |  | ||||||
|             let mut prefix_key = vec![proximity]; |  | ||||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); |  | ||||||
|             let mut db_iter = word_pair_proximity_docids |  | ||||||
|                 .as_polymorph() |  | ||||||
|                 .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? |  | ||||||
|                 .remap_key_type::<UncheckedU8StrStrCodec>(); |  | ||||||
|             execute_on_word_pairs_and_prefixes( |  | ||||||
|                 proximity, |  | ||||||
|                 prefix.as_bytes(), |  | ||||||
|                 &mut db_iter, |  | ||||||
|                 |db_iter| { |  | ||||||
|                     db_iter |  | ||||||
|                         .next() |  | ||||||
|                         .transpose() |  | ||||||
|                         .map(|x| x.map(|((_, _, word2), value)| (word2, value))) |  | ||||||
|                         .map_err(|e| e.into()) |  | ||||||
|                 }, |  | ||||||
|                 |key, value| writer.insert(key, value).map_err(|e| e.into()), |  | ||||||
|             )?; |  | ||||||
|             drop(db_iter); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // and then we write the grenad into the DB |  | ||||||
|     // Since the grenad contains only new prefixes, we know in advance that none |  | ||||||
|     // of its elements already exist in the DB, thus there is no need to specify |  | ||||||
|     // how to merge conflicting elements |  | ||||||
|     write_into_lmdb_database_without_merging( |  | ||||||
|         wtxn, |  | ||||||
|         *prefix_word_pair_proximity_docids.as_polymorph(), |  | ||||||
|         writer, |  | ||||||
|     )?; |  | ||||||
|  |  | ||||||
|     // All of the word prefix pairs in the database that have a w2 |  | ||||||
|     // that is contained in the `suppr_pw` set must be removed as well. |  | ||||||
|     if !del_prefix_fst_words.is_empty() { |  | ||||||
|         let mut iter = |  | ||||||
|             prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; |  | ||||||
|         while let Some(((_, prefix, _), _)) = iter.next().transpose()? { |  | ||||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { |  | ||||||
|                 // Delete this entry as the w2 prefix is no more in the words prefix fst. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. |  | ||||||
| /// |  | ||||||
| /// Its arguments are: |  | ||||||
| /// - an iterator over the words following the given `prefix` with the given `proximity` |  | ||||||
| /// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements |  | ||||||
| fn execute_on_word_pairs_and_prefixes<I>( |  | ||||||
|     proximity: u8, |  | ||||||
|     prefix: &[u8], |  | ||||||
|     iter: &mut I, |  | ||||||
|     mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>, |  | ||||||
|     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default(); |  | ||||||
|  |  | ||||||
|     // Memory usage check: |  | ||||||
|     // The content of the loop will be called for each `word2` that follows a word beginning |  | ||||||
|     // with `prefix` with the given proximity. |  | ||||||
|     // In practice, I don't think the batch can ever get too big. |  | ||||||
|     while let Some((word2, docids)) = next_word2_and_docids(iter)? { |  | ||||||
|         let entry = batch.entry(word2.to_owned()).or_default(); |  | ||||||
|         entry.push(Cow::Owned(docids.to_owned())); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     let mut key_buffer = Vec::with_capacity(512); |  | ||||||
|     key_buffer.push(proximity); |  | ||||||
|     key_buffer.extend_from_slice(prefix); |  | ||||||
|     key_buffer.push(0); |  | ||||||
|  |  | ||||||
|     let mut value_buffer = Vec::with_capacity(65_536); |  | ||||||
|  |  | ||||||
|     for (word2, docids) in batch { |  | ||||||
|         key_buffer.truncate(prefix.len() + 2); |  | ||||||
|         value_buffer.clear(); |  | ||||||
|  |  | ||||||
|         key_buffer.extend_from_slice(&word2); |  | ||||||
|         let data = if docids.len() > 1 { |  | ||||||
|             CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; |  | ||||||
|             value_buffer.as_slice() |  | ||||||
|         } else { |  | ||||||
|             &docids[0] |  | ||||||
|         }; |  | ||||||
|         insert(key_buffer.as_slice(), data)?; |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
| @@ -1,728 +0,0 @@ | |||||||
| /*! |  | ||||||
| The word-prefix-pair-proximity-docids database is a database whose keys are of |  | ||||||
| the form `(proximity, word, prefix)` and the values are roaring bitmaps of |  | ||||||
| the documents which contain `word` followed by another word starting with |  | ||||||
| `prefix` at a distance of `proximity`. |  | ||||||
|  |  | ||||||
| The prefixes present in this database are only those that correspond to many |  | ||||||
| different words in the documents. |  | ||||||
|  |  | ||||||
| ## How is it created/updated? (simplified version) |  | ||||||
| To compute it, we have access to (mainly) two inputs: |  | ||||||
|  |  | ||||||
| * a list of sorted prefixes, such as: |  | ||||||
| ```text |  | ||||||
| c |  | ||||||
| ca |  | ||||||
| cat |  | ||||||
| d |  | ||||||
| do |  | ||||||
| dog |  | ||||||
| ``` |  | ||||||
| Note that only prefixes which correspond to more than a certain number of |  | ||||||
| different words from the database are included in this list. |  | ||||||
|  |  | ||||||
| * a sorted list of proximities and word pairs (the proximity is the distance between the two words), |  | ||||||
| associated with a roaring bitmap, such as: |  | ||||||
| ```text |  | ||||||
| 1 good doggo         -> docids1: [8] |  | ||||||
| 1 good door          -> docids2: [7, 19, 20] |  | ||||||
| 1 good ghost         -> docids3: [1] |  | ||||||
| 2 good dog           -> docids4: [2, 5, 6] |  | ||||||
| 2 horror cathedral   -> docids5: [1, 2] |  | ||||||
| ``` |  | ||||||
|  |  | ||||||
| I illustrate a simplified version of the algorithm to create the |  | ||||||
| word-prefix-pair-proximity database below: |  | ||||||
|  |  | ||||||
| 1. **Outer loop:** First, we iterate over each proximity and word pair: |  | ||||||
| ```text |  | ||||||
| proximity: 1 |  | ||||||
| word1    : good |  | ||||||
| word2    : doggo |  | ||||||
| ``` |  | ||||||
| 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are |  | ||||||
| in the list of sorted prefixes. And we insert the key `prefix` |  | ||||||
| and the value (`docids`) to a sorted map which we call the “batch”. For example, |  | ||||||
| at the end of the first outer loop, we may have: |  | ||||||
| ```text |  | ||||||
| Outer loop 1: |  | ||||||
| ------------------------------ |  | ||||||
| proximity: 1 |  | ||||||
| word1    : good |  | ||||||
| word2    : doggo |  | ||||||
| docids   : docids1 |  | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] |  | ||||||
|  |  | ||||||
| batch: [ |  | ||||||
|     d,   -> [docids1] |  | ||||||
|     do   -> [docids1] |  | ||||||
|     dog  -> [docids1] |  | ||||||
| ] |  | ||||||
| ``` |  | ||||||
| 3. For illustration purpose, let's run through a second iteration of the outer loop: |  | ||||||
| ```text |  | ||||||
| Outer loop 2: |  | ||||||
| ------------------------------ |  | ||||||
| proximity: 1 |  | ||||||
| word1    : good |  | ||||||
| word2    : door |  | ||||||
| docids   : docids2 |  | ||||||
|  |  | ||||||
| prefixes: [d, do, doo] |  | ||||||
|  |  | ||||||
| batch: [ |  | ||||||
|     d   -> [docids1, docids2] |  | ||||||
|     do  -> [docids1, docids2] |  | ||||||
|     dog -> [docids1] |  | ||||||
|     doo -> [docids2] |  | ||||||
| ] |  | ||||||
| ``` |  | ||||||
| Notice that there were some conflicts which were resolved by merging the |  | ||||||
| conflicting values together. Also, an additional prefix was added at the |  | ||||||
| end of the batch. |  | ||||||
|  |  | ||||||
| 4. On the third iteration of the outer loop, we have: |  | ||||||
| ```text |  | ||||||
| Outer loop 3: |  | ||||||
| ------------------------------ |  | ||||||
| proximity: 1 |  | ||||||
| word1    : good |  | ||||||
| word2    : ghost |  | ||||||
| ``` |  | ||||||
| Because `word2` begins with a different letter than the previous `word2`, |  | ||||||
| we know that all the prefixes of `word2` are greater than the prefixes of the previous `word2`. |  | ||||||
|  |  | ||||||
| Therefore, we know that we can insert every element from the batch into the |  | ||||||
| database before proceeding any further. This operation is called |  | ||||||
| “flushing the batch”. Flushing the batch should also be done whenever: |  | ||||||
| * `proximity` is different than the previous `proximity`. |  | ||||||
| * `word1` is different than the previous `word1`. |  | ||||||
| * `word2` starts with a different letter than the previous word2 |  | ||||||
|  |  | ||||||
| 6. **Flushing the batch:** to flush the batch, we iterate over its elements: |  | ||||||
| ```text |  | ||||||
| Flushing Batch loop 1: |  | ||||||
| ------------------------------ |  | ||||||
| proximity  : 1 |  | ||||||
| word1      : good |  | ||||||
| prefix     : d |  | ||||||
|  |  | ||||||
| docids   : [docids1, docids2] |  | ||||||
| ``` |  | ||||||
| We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using |  | ||||||
| `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a |  | ||||||
| roaring bitmap of all the document ids where `word1` is followed by `prefix` |  | ||||||
| at a distance of `proximity`. |  | ||||||
| Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` |  | ||||||
| into the database. |  | ||||||
|  |  | ||||||
| 7. That's it! ... except... |  | ||||||
|  |  | ||||||
| ## How is it created/updated (continued) |  | ||||||
|  |  | ||||||
| I lied a little bit about the input data. In reality, we get two sets of the |  | ||||||
| inputs described above, which come from different places: |  | ||||||
|  |  | ||||||
| * For the list of sorted prefixes, we have: |  | ||||||
|     1. `new_prefixes`, which are all the prefixes that were not present in the |  | ||||||
|     database before the insertion of the new documents |  | ||||||
|  |  | ||||||
|     2. `common_prefixes` which are the prefixes that are present both in the |  | ||||||
|     database and in the newly added documents |  | ||||||
|  |  | ||||||
| * For the list of word pairs and proximities, we have: |  | ||||||
|     1. `new_word_pairs`, which is the list of word pairs and their proximities |  | ||||||
|     present in the newly added documents |  | ||||||
|  |  | ||||||
|     2. `word_pairs_db`, which is the list of word pairs from the database. |  | ||||||
|     This list includes all elements in `new_word_pairs` since `new_word_pairs` |  | ||||||
|     was added to the database prior to calling the `WordPrefix::execute` |  | ||||||
|     function. |  | ||||||
|  |  | ||||||
| To update the prefix database correctly, we call the algorithm described earlier first |  | ||||||
| on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). |  | ||||||
| Thus: |  | ||||||
|  |  | ||||||
| 1. For all the word pairs that were already present in the DB, we insert them |  | ||||||
| again with the `new_prefixes`. Calling the algorithm on them with the |  | ||||||
| `common_prefixes` would not result in any new data. |  | ||||||
|  |  | ||||||
| 2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, |  | ||||||
| and then, because they are part of `word_pairs_db`, with the `new_prefixes`. |  | ||||||
|  |  | ||||||
| Note, also, that since we read data from the database when iterating over |  | ||||||
| `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- |  | ||||||
| docids from the batch directly into the database (we would have a concurrent |  | ||||||
| reader and writer). Therefore, when calling the algorithm on |  | ||||||
| `(new_prefixes, word_pairs_db)`, we insert the computed |  | ||||||
| `((proximity, word, prefix), docids)` elements in an intermediary grenad |  | ||||||
| Writer instead of the DB. At the end of the outer loop, we finally read from |  | ||||||
| the grenad and insert its elements in the database. |  | ||||||
| */ |  | ||||||
|  |  | ||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::HashSet; |  | ||||||
|  |  | ||||||
| use grenad::CompressionType; |  | ||||||
| use heed::types::ByteSlice; |  | ||||||
| use heed::BytesDecode; |  | ||||||
| use log::debug; |  | ||||||
|  |  | ||||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; |  | ||||||
| use crate::update::prefix_word_pairs::{ |  | ||||||
|     insert_into_database, write_into_lmdb_database_without_merging, |  | ||||||
| }; |  | ||||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; |  | ||||||
|  |  | ||||||
| #[allow(clippy::too_many_arguments)] |  | ||||||
| #[logging_timer::time] |  | ||||||
| pub fn index_word_prefix_database( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     max_prefix_length: usize, |  | ||||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, |  | ||||||
|     new_prefix_fst_words: &[String], |  | ||||||
|     common_prefix_fst_words: &[&[String]], |  | ||||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, |  | ||||||
|     chunk_compression_type: CompressionType, |  | ||||||
|     chunk_compression_level: Option<u32>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     puffin::profile_function!(); |  | ||||||
|     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); |  | ||||||
|  |  | ||||||
|     // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length |  | ||||||
|     let prefixes = PrefixTrieNode::from_sorted_prefixes( |  | ||||||
|         common_prefix_fst_words |  | ||||||
|             .iter() |  | ||||||
|             .flat_map(|s| s.iter()) |  | ||||||
|             .map(|s| s.as_str()) |  | ||||||
|             .filter(|s| s.len() <= max_prefix_length), |  | ||||||
|     ); |  | ||||||
|  |  | ||||||
|     // If the prefix trie is not empty, then we can iterate over all new |  | ||||||
|     // word pairs to look for new (proximity, word1, common_prefix) elements |  | ||||||
|     // to insert in the DB |  | ||||||
|     if !prefixes.is_empty() { |  | ||||||
|         let mut cursor = new_word_pair_proximity_docids.into_cursor()?; |  | ||||||
|         // This is the core of the algorithm |  | ||||||
|         execute_on_word_pairs_and_prefixes( |  | ||||||
|             // the first two arguments tell how to iterate over the new word pairs |  | ||||||
|             &mut cursor, |  | ||||||
|             |cursor| { |  | ||||||
|                 if let Some((key, value)) = cursor.move_on_next()? { |  | ||||||
|                     let (proximity, word1, word2) = |  | ||||||
|                         UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; |  | ||||||
|                     Ok(Some(((proximity, word1, word2), value))) |  | ||||||
|                 } else { |  | ||||||
|                     Ok(None) |  | ||||||
|                 } |  | ||||||
|             }, |  | ||||||
|             &prefixes, |  | ||||||
|             max_proximity, |  | ||||||
|             // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) |  | ||||||
|             |key, value| { |  | ||||||
|                 insert_into_database( |  | ||||||
|                     wtxn, |  | ||||||
|                     *word_prefix_pair_proximity_docids.as_polymorph(), |  | ||||||
|                     key, |  | ||||||
|                     value, |  | ||||||
|                 ) |  | ||||||
|             }, |  | ||||||
|         )?; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB |  | ||||||
|  |  | ||||||
|     let prefixes = PrefixTrieNode::from_sorted_prefixes( |  | ||||||
|         new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), |  | ||||||
|     ); |  | ||||||
|  |  | ||||||
|     if !prefixes.is_empty() { |  | ||||||
|         let mut db_iter = word_pair_proximity_docids |  | ||||||
|             .remap_key_type::<UncheckedU8StrStrCodec>() |  | ||||||
|             .remap_data_type::<ByteSlice>() |  | ||||||
|             .iter(wtxn)?; |  | ||||||
|  |  | ||||||
|         // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) |  | ||||||
|         // element in an intermediary grenad |  | ||||||
|         let mut writer = |  | ||||||
|             create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); |  | ||||||
|  |  | ||||||
|         execute_on_word_pairs_and_prefixes( |  | ||||||
|             &mut db_iter, |  | ||||||
|             |db_iter| db_iter.next().transpose().map_err(|e| e.into()), |  | ||||||
|             &prefixes, |  | ||||||
|             max_proximity, |  | ||||||
|             |key, value| writer.insert(key, value).map_err(|e| e.into()), |  | ||||||
|         )?; |  | ||||||
|         drop(db_iter); |  | ||||||
|  |  | ||||||
|         // and then we write the grenad into the DB |  | ||||||
|         // Since the grenad contains only new prefixes, we know in advance that none |  | ||||||
|         // of its elements already exist in the DB, thus there is no need to specify |  | ||||||
|         // how to merge conflicting elements |  | ||||||
|         write_into_lmdb_database_without_merging( |  | ||||||
|             wtxn, |  | ||||||
|             *word_prefix_pair_proximity_docids.as_polymorph(), |  | ||||||
|             writer, |  | ||||||
|         )?; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // All of the word prefix pairs in the database that have a w2 |  | ||||||
|     // that is contained in the `suppr_pw` set must be removed as well. |  | ||||||
|     if !del_prefix_fst_words.is_empty() { |  | ||||||
|         let mut iter = |  | ||||||
|             word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; |  | ||||||
|         while let Some(((_, _, prefix), _)) = iter.next().transpose()? { |  | ||||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { |  | ||||||
|                 // Delete this entry as the w2 prefix is no more in the words prefix fst. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. |  | ||||||
| /// |  | ||||||
| /// Its main arguments are: |  | ||||||
| /// 1. a sorted iterator over ((proximity, word1, word2), docids) elements |  | ||||||
| /// 2. a prefix trie |  | ||||||
| /// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements |  | ||||||
| /// |  | ||||||
| /// For more information about what this function does, read the module documentation. |  | ||||||
| fn execute_on_word_pairs_and_prefixes<I>( |  | ||||||
|     iter: &mut I, |  | ||||||
|     mut next_word_pair_proximity: impl for<'a> FnMut( |  | ||||||
|         &'a mut I, |  | ||||||
|     ) -> Result< |  | ||||||
|         Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, |  | ||||||
|     >, |  | ||||||
|     prefixes: &PrefixTrieNode, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut batch = PrefixAndProximityBatch::default(); |  | ||||||
|     let mut prev_word2_start = 0; |  | ||||||
|  |  | ||||||
|     // Optimisation: the index at the root of the prefix trie where to search for |  | ||||||
|     let mut prefix_search_start = PrefixTrieNodeSearchStart(0); |  | ||||||
|  |  | ||||||
|     // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter |  | ||||||
|     let mut empty_prefixes = false; |  | ||||||
|  |  | ||||||
|     let mut prefix_buffer = Vec::with_capacity(8); |  | ||||||
|     let mut merge_buffer = Vec::with_capacity(65_536); |  | ||||||
|  |  | ||||||
|     while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? { |  | ||||||
|         // stop indexing if the proximity is over the threshold |  | ||||||
|         if proximity > max_proximity { |  | ||||||
|             break; |  | ||||||
|         }; |  | ||||||
|         let word2_start_different_than_prev = word2[0] != prev_word2_start; |  | ||||||
|         // if there were no potential prefixes for the previous word2 based on its first letter, |  | ||||||
|         // and if the current word2 starts with the same letter, then there is also no potential |  | ||||||
|         // prefixes for the current word2, and we can skip to the next iteration |  | ||||||
|         if empty_prefixes && !word2_start_different_than_prev { |  | ||||||
|             continue; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // if the proximity is different to the previous one, OR |  | ||||||
|         // if word1 is different than the previous word1, OR |  | ||||||
|         // if the start of word2 is different than the previous start of word2, |  | ||||||
|         // THEN we'll need to flush the batch |  | ||||||
|         let prox_different_than_prev = proximity != batch.proximity; |  | ||||||
|         let word1_different_than_prev = word1 != batch.word1; |  | ||||||
|         if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev |  | ||||||
|         { |  | ||||||
|             batch.flush(&mut merge_buffer, &mut insert)?; |  | ||||||
|             batch.proximity = proximity; |  | ||||||
|             // don't forget to reset the value of batch.word1 and prev_word2_start |  | ||||||
|             if word1_different_than_prev { |  | ||||||
|                 batch.word1.clear(); |  | ||||||
|                 batch.word1.extend_from_slice(word1); |  | ||||||
|             } |  | ||||||
|             if word2_start_different_than_prev { |  | ||||||
|                 prev_word2_start = word2[0]; |  | ||||||
|             } |  | ||||||
|             prefix_search_start.0 = 0; |  | ||||||
|             // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 |  | ||||||
|             empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if !empty_prefixes { |  | ||||||
|             // All conditions are satisfied, we can now insert each new prefix of word2 into the batch |  | ||||||
|             prefix_buffer.clear(); |  | ||||||
|             prefixes.for_each_prefix_of( |  | ||||||
|                 word2, |  | ||||||
|                 &mut prefix_buffer, |  | ||||||
|                 &prefix_search_start, |  | ||||||
|                 |prefix_buffer| { |  | ||||||
|                     batch.insert(prefix_buffer, data.to_vec()); |  | ||||||
|                 }, |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     batch.flush(&mut merge_buffer, &mut insert)?; |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
| /** |  | ||||||
| A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). |  | ||||||
| The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. |  | ||||||
|  |  | ||||||
| It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. |  | ||||||
|  |  | ||||||
| The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content |  | ||||||
| can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: |  | ||||||
| - key   : (proximity, word1, prefix) as bytes |  | ||||||
| - value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes |  | ||||||
| */ |  | ||||||
| #[derive(Default)] |  | ||||||
| struct PrefixAndProximityBatch { |  | ||||||
|     proximity: u8, |  | ||||||
|     word1: Vec<u8>, |  | ||||||
|     #[allow(clippy::type_complexity)] |  | ||||||
|     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl PrefixAndProximityBatch { |  | ||||||
|     /// Insert the new key and value into the batch |  | ||||||
|     /// |  | ||||||
|     /// The key must either exist in the batch or be greater than all existing keys |  | ||||||
|     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) { |  | ||||||
|         match self.batch.iter_mut().find(|el| el.0 == new_key) { |  | ||||||
|             Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), |  | ||||||
|             None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Empties the batch, calling `insert` on each element. |  | ||||||
|     /// |  | ||||||
|     /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. |  | ||||||
|     fn flush( |  | ||||||
|         &mut self, |  | ||||||
|         merge_buffer: &mut Vec<u8>, |  | ||||||
|         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         let PrefixAndProximityBatch { proximity, word1, batch } = self; |  | ||||||
|         if batch.is_empty() { |  | ||||||
|             return Ok(()); |  | ||||||
|         } |  | ||||||
|         merge_buffer.clear(); |  | ||||||
|  |  | ||||||
|         let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); |  | ||||||
|         buffer.push(*proximity); |  | ||||||
|         buffer.extend_from_slice(word1); |  | ||||||
|         buffer.push(0); |  | ||||||
|  |  | ||||||
|         for (key, mergeable_data) in batch.drain(..) { |  | ||||||
|             buffer.truncate(1 + word1.len() + 1); |  | ||||||
|             buffer.extend_from_slice(key.as_slice()); |  | ||||||
|  |  | ||||||
|             let data = if mergeable_data.len() > 1 { |  | ||||||
|                 CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; |  | ||||||
|                 merge_buffer.as_slice() |  | ||||||
|             } else { |  | ||||||
|                 &mergeable_data[0] |  | ||||||
|             }; |  | ||||||
|             insert(buffer.as_slice(), data)?; |  | ||||||
|             merge_buffer.clear(); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
within a set.

## Structure
The trie is made of nodes composed of:
1. a byte character (e.g. 'a')
2. whether the node is an end node or not
3. a list of children nodes, sorted by their byte character

For example, the trie that stores the strings `[ac, ae, ar, cei, cel, ch, r, rel, ri]`
is drawn below. Nodes with a double border are "end nodes".

┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗
│          a           │ │          c           │ ║          r           ║
└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝
╔══════╗╔══════╗╔══════╗ ┌─────────┐  ╔═════════╗ ┌─────────┐ ╔══════════╗
║  c   ║║  e   ║║  r   ║ │    e    │  ║    h    ║ │    e    │ ║    i     ║
╚══════╝╚══════╝╚══════╝ └─────────┘  ╚═════════╝ └─────────┘ ╚══════════╝
                         ╔═══╗ ╔═══╗                 ╔═══╗
                         ║ i ║ ║ l ║                 ║ l ║
                         ╚═══╝ ╚═══╝                 ╚═══╝
*/
#[derive(Default, Debug)]
struct PrefixTrieNode {
    /// Child nodes, each paired with the byte leading to it, in ascending byte order.
    children: Vec<(PrefixTrieNode, u8)>,
    /// True when the bytes on the path from the root to this node spell a stored prefix.
    is_end_node: bool,
}

/// Index among the root's children from which lookups start.
#[derive(Debug)]
struct PrefixTrieNodeSearchStart(usize);

impl PrefixTrieNode {
    /// Returns true if the trie does not store any prefix.
    fn is_empty(&self) -> bool {
        self.children.is_empty()
    }

    /// Returns false if the trie does not contain a prefix of the given word.
    /// Returns true if the trie *may* contain a prefix of the given word.
    ///
    /// Moves the search start to the first node equal to the first letter of the word,
    /// or to 0 otherwise.
    ///
    /// Only the root children at or after the current search start are considered.
    /// An empty word, an empty trie and a search start past the last child all
    /// count as "no prefix": the search start is reset to 0 and false is returned.
    fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
        let byte = match word.first() {
            Some(&byte) => byte,
            None => {
                search_start.0 = 0;
                return false;
            }
        };
        let candidates = match self.children.get(search_start.0..) {
            Some(candidates) => candidates,
            None => {
                search_start.0 = 0;
                return false;
            }
        };

        // Fast path: consecutive words often share their first byte.
        if candidates.first().map_or(false, |(_, c)| *c == byte) {
            return true;
        }

        match candidates.binary_search_by_key(&byte, |x| x.1) {
            Ok(position) => {
                search_start.0 += position;
                true
            }
            Err(_) => {
                search_start.0 = 0;
                false
            }
        }
    }

    /// Builds the trie out of the given prefixes.
    ///
    /// The prefixes must come in ascending byte order: each one is only compared
    /// with the most recently created child of every node it goes through.
    fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
        let mut node = PrefixTrieNode::default();
        for prefix in prefixes {
            node.insert_sorted_prefix(prefix.as_bytes().iter());
        }
        node
    }

    /// Inserts the remaining bytes of a prefix below this node.
    ///
    /// Follows the last child when it carries the next byte, creates a new last
    /// child otherwise, and marks the node reached after the final byte as an end node.
    fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
        if let Some(&c) = prefix.next() {
            if let Some((node, byte)) = self.children.last_mut() {
                if *byte == c {
                    node.insert_sorted_prefix(prefix);
                    return;
                }
            }
            let mut new_node = PrefixTrieNode::default();
            new_node.insert_sorted_prefix(prefix);
            self.children.push((new_node, c));
        } else {
            self.is_end_node = true;
        }
    }

    /// Call the given closure on each prefix of the word contained in the prefix trie.
    ///
    /// The search starts from the given `search_start`, which only restricts the
    /// children of the root. The bytes of the word are pushed onto `buffer` as the
    /// trie is descended, and `do_fn` receives `buffer` every time it spells a stored
    /// prefix. `buffer` is not cleared, neither before nor after the search.
    ///
    /// Nothing is called for an empty word, an empty trie or a search start past
    /// the last child of the root.
    fn for_each_prefix_of(
        &self,
        word: &[u8],
        buffer: &mut Vec<u8>,
        search_start: &PrefixTrieNodeSearchStart,
        mut do_fn: impl FnMut(&mut Vec<u8>),
    ) {
        // Children among which the next byte of the word is looked up.
        let mut candidates = self.children.get(search_start.0..).unwrap_or(&[]);
        for &byte in word {
            buffer.push(byte);
            // Children are sorted: the first one that is not smaller than `byte`
            // is either the match or the proof that there is none.
            match candidates.iter().find(|(_, c)| *c >= byte) {
                Some((child_node, c)) if *c == byte => {
                    if child_node.is_end_node {
                        do_fn(buffer);
                    }
                    candidates = &child_node.children;
                }
                _ => break,
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use roaring::RoaringBitmap;

    use super::*;
    use crate::{CboRoaringBitmapCodec, U8StrStrCodec};

    /// Asserts that the prefixes of `word` found in `trie`, starting the search
    /// at `search_start`, are exactly `expected_prefixes`, in that order.
    fn check_prefixes(
        trie: &PrefixTrieNode,
        search_start: &PrefixTrieNodeSearchStart,
        word: &str,
        expected_prefixes: &[&str],
    ) {
        let mut actual_prefixes = vec![];
        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
            let s = String::from_utf8(x.to_owned()).unwrap();
            actual_prefixes.push(s);
        });
        assert_eq!(actual_prefixes, expected_prefixes);
    }

    #[test]
    fn test_trie() {
        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
            "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
            "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
            "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
            "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
            "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
            "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
            "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
            "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
            "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
            "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
            "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
            "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
            "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
            "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
            "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
        ]));

        let mut search_start = PrefixTrieNodeSearchStart(0);

        let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
        assert!(!is_empty);
        assert_eq!(search_start.0, 2);

        check_prefixes(&trie, &search_start, "affair", &["a"]);
        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);

        let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
        assert!(!is_empty);
        assert_eq!(trie.children[search_start.0].1, b'u');

        check_prefixes(&trie, &search_start, "unique", &["u", "un"]);

        // NOTE: the lookup is expected to fail here, because the search start is
        // already beyond 'a'
        let is_empty = !trie.set_search_start("abba".as_bytes(), &mut search_start);
        assert!(is_empty);
        // search start is reset
        assert_eq!(search_start.0, 0);

        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));
        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
    }

    #[test]
    fn test_execute_on_word_pairs_and_prefixes() {
        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));

        let mut serialised_bitmap123 = vec![];
        let mut bitmap123 = RoaringBitmap::new();
        bitmap123.insert(1);
        bitmap123.insert(2);
        bitmap123.insert(3);
        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);

        let mut serialised_bitmap456 = vec![];
        let mut bitmap456 = RoaringBitmap::new();
        bitmap456.insert(4);
        bitmap456.insert(5);
        bitmap456.insert(6);
        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);

        let mut serialised_bitmap789 = vec![];
        let mut bitmap789 = RoaringBitmap::new();
        bitmap789.insert(7);
        bitmap789.insert(8);
        bitmap789.insert(9);
        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);

        let mut serialised_bitmap_ranges = vec![];
        let mut bitmap_ranges = RoaringBitmap::new();
        bitmap_ranges.insert_range(63_000..65_000);
        bitmap_ranges.insert_range(123_000..128_000);
        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);

        // Sorted by (proximity, word1, word2), like the entries of the database.
        let word_pairs = [
            ((1, "healthy", "arbres"), &serialised_bitmap123),
            ((1, "healthy", "boat"), &serialised_bitmap123),
            ((1, "healthy", "ca"), &serialised_bitmap123),
            ((1, "healthy", "cats"), &serialised_bitmap456),
            ((1, "healthy", "cattos"), &serialised_bitmap123),
            ((1, "jittery", "cat"), &serialised_bitmap123),
            ((1, "jittery", "cata"), &serialised_bitmap456),
            ((1, "jittery", "catb"), &serialised_bitmap789),
            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
            ((2, "healthy", "arbre"), &serialised_bitmap123),
            ((2, "healthy", "arbres"), &serialised_bitmap456),
            ((2, "healthy", "cats"), &serialised_bitmap789),
            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
            ((3, "healthy", "arbre"), &serialised_bitmap456),
            ((3, "healthy", "arbres"), &serialised_bitmap789),
        ];

        // The pairs with a proximity of 3 are above the max proximity given below
        // and must not produce anything.
        let expected_result = [
            ((1, "healthy", "arb"), bitmap123.clone()),
            ((1, "healthy", "arbre"), bitmap123.clone()),
            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
            ((1, "healthy", "catto"), bitmap123.clone()),
            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
            ((2, "healthy", "catto"), bitmap_ranges.clone()),
        ];

        let mut result = vec![];

        let mut iter =
            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
            });
        execute_on_word_pairs_and_prefixes(
            &mut iter,
            |iter| Ok(iter.next()),
            &prefixes,
            2,
            |k, v| {
                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
                Ok(())
            },
        )
        .unwrap();

        // `zip` stops at the shortest side: without this check, missing or extra
        // results would go unnoticed.
        assert_eq!(result.len(), expected_result.len());

        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;

            assert_eq!(actual_word1, expected_word1);
            assert_eq!(actual_prefix, expected_prefix);
            assert_eq!(actual_proximity, expected_proximity);
            assert_eq!(actual_bitmap, expected_bitmap);
        }
    }
}
		Reference in New Issue
	
	Block a user