Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-10-30 23:46:28 +00:00
	Add prefix_word_pair_proximity database
Similar to the word_prefix_pair_proximity database, but its keys are (proximity, prefix, word2) instead of (proximity, word1, prefix).
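As a rough illustration of the difference (a sketch, not code from this commit): both databases are keyed by U8StrStrCodec, which, as the key building in prefix_word.rs below suggests, lays out a one-byte proximity followed by the two strings separated by a 0 byte. Only the roles of the two string slots differ; `build_key` is a hypothetical helper written just for this sketch.

fn main() {
    // Hypothetical helper mirroring the key layout: one proximity byte,
    // then the two strings separated by a 0 byte.
    fn build_key(proximity: u8, left: &str, right: &str) -> Vec<u8> {
        let mut key = Vec::with_capacity(2 + left.len() + right.len());
        key.push(proximity);
        key.extend_from_slice(left.as_bytes());
        key.push(0);
        key.extend_from_slice(right.as_bytes());
        key
    }

    // word_prefix_pair_proximity_docids keys: (proximity, word1, prefix)
    let word_prefix_key = build_key(1, "amazing", "a");
    // prefix_word_pair_proximity_docids keys: (proximity, prefix, word2)
    let prefix_word_key = build_key(2, "a", "amazing");
    assert_ne!(word_prefix_key, prefix_word_key);
}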
Committed by Loïc Lecrenier
parent 1dbbd8694f
commit 264a04922d
			| @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             prefix_word_pair_proximity_docids, | ||||
|             word_position_docids, | ||||
|             field_id_word_count_docids, | ||||
|             word_prefix_position_docids, | ||||
| @@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         docid_word_positions.clear(self.wtxn)?; | ||||
|         word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         prefix_word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_position_docids.clear(self.wtxn)?; | ||||
|         field_id_word_count_docids.clear(self.wtxn)?; | ||||
|         word_prefix_position_docids.clear(self.wtxn)?; | ||||
|   | ||||
| @@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             word_pair_proximity_docids, | ||||
|             field_id_word_count_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             prefix_word_pair_proximity_docids, | ||||
|             word_position_docids, | ||||
|             word_prefix_position_docids, | ||||
|             facet_id_f64_docids, | ||||
| @@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; | ||||
|         } | ||||
|  | ||||
|         // We delete the documents ids from the word prefix pair proximity database docids | ||||
|         // and remove the empty pairs too. | ||||
|         let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>(); | ||||
|         let mut iter = db.iter_mut(self.wtxn)?; | ||||
|         while let Some(result) = iter.next() { | ||||
|             let (key, mut docids) = result?; | ||||
|             let previous_len = docids.len(); | ||||
|             docids -= &self.to_delete_docids; | ||||
|             if docids.is_empty() { | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { iter.del_current()? }; | ||||
|             } else if docids.len() != previous_len { | ||||
|                 let key = key.to_owned(); | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { iter.put_current(&key, &docids)? }; | ||||
|         for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { | ||||
|             // We delete the documents ids from the word prefix pair proximity database docids | ||||
|             // and remove the empty pairs too. | ||||
|             let db = db.remap_key_type::<ByteSlice>(); | ||||
|             let mut iter = db.iter_mut(self.wtxn)?; | ||||
|             while let Some(result) = iter.next() { | ||||
|                 let (key, mut docids) = result?; | ||||
|                 let previous_len = docids.len(); | ||||
|                 docids -= &self.to_delete_docids; | ||||
|                 if docids.is_empty() { | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.del_current()? }; | ||||
|                 } else if docids.len() != previous_len { | ||||
|                     let key = key.to_owned(); | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.put_current(&key, &docids)? }; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // We delete the documents ids that are under the pairs of words, | ||||
|         // it is faster and uses no memory to iterate over all the word pairs than | ||||
|         // to compute the cartesian product of every word of the deleted documents. | ||||
|   | ||||
| @@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||
| use crate::error::UserError; | ||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||
| use crate::update::{ | ||||
|     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, | ||||
|     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||
|     self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, | ||||
|     WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||
| }; | ||||
| use crate::{Index, Result, RoaringBitmapCodec}; | ||||
|  | ||||
| @@ -528,12 +528,7 @@ where | ||||
|  | ||||
|         if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { | ||||
|             // Run the word prefix pair proximity docids update operation. | ||||
|             let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); | ||||
|             builder.chunk_compression_type = self.indexer_config.chunk_compression_type; | ||||
|             builder.chunk_compression_level = self.indexer_config.chunk_compression_level; | ||||
|             builder.max_nb_chunks = self.indexer_config.max_nb_chunks; | ||||
|             builder.max_memory = self.indexer_config.max_memory; | ||||
|             builder.execute( | ||||
|             PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( | ||||
|                 word_pair_proximity_docids, | ||||
|                 &new_prefix_fst_words, | ||||
|                 &common_prefix_fst_words, | ||||
|   | ||||
| @@ -6,10 +6,10 @@ pub use self::index_documents::{ | ||||
|     DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, | ||||
| }; | ||||
| pub use self::indexer_config::IndexerConfig; | ||||
| pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; | ||||
| pub use self::settings::{Setting, Settings}; | ||||
| pub use self::update_step::UpdateIndexingStep; | ||||
| pub use self::word_prefix_docids::WordPrefixDocids; | ||||
| pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; | ||||
| pub use self::words_prefix_position_docids::WordPrefixPositionDocids; | ||||
| pub use self::words_prefixes_fst::WordsPrefixesFst; | ||||
|  | ||||
| @@ -19,9 +19,9 @@ mod delete_documents; | ||||
| mod facets; | ||||
| mod index_documents; | ||||
| mod indexer_config; | ||||
| mod prefix_word_pairs; | ||||
| mod settings; | ||||
| mod update_step; | ||||
| mod word_prefix_docids; | ||||
| mod word_prefix_pair_proximity_docids; | ||||
| mod words_prefix_position_docids; | ||||
| mod words_prefixes_fst; | ||||
|   | ||||
							
								
								
									
milli/src/update/prefix_word_pairs/mod.rs (new file, 216 lines)
							| @@ -0,0 +1,216 @@ | ||||
| use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; | ||||
| use crate::{Index, Result}; | ||||
| use heed::types::ByteSlice; | ||||
| use std::{borrow::Cow, collections::HashSet, io::BufReader}; | ||||
|  | ||||
| mod prefix_word; | ||||
| mod word_prefix; | ||||
|  | ||||
| pub use prefix_word::index_prefix_word_database; | ||||
| pub use word_prefix::index_word_prefix_database; | ||||
|  | ||||
| pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     max_proximity: u8, | ||||
|     max_prefix_length: usize, | ||||
| } | ||||
| impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { | ||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { | ||||
|         Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } | ||||
|     } | ||||
|     /// Set the maximum proximity required to make a prefix be part of the words prefixes | ||||
|     /// database. If two words are further apart than this threshold, the associated documents | ||||
|     /// will not be part of the prefix database. | ||||
|     /// | ||||
|     /// Default value is 4. This value must be lower than or equal to 7 and will be clamped | ||||
|     /// to this bound otherwise. | ||||
|     pub fn max_proximity(&mut self, value: u8) -> &mut Self { | ||||
|         self.max_proximity = value.min(7); | ||||
|         self | ||||
|     } | ||||
|     /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words | ||||
|     /// prefixes database. If the prefix length is higher than the threshold, the associated documents | ||||
|     /// will not be part of the prefix database. | ||||
|     /// | ||||
|     /// Default value is 2. | ||||
|     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { | ||||
|         self.max_prefix_length = value; | ||||
|         self | ||||
|     } | ||||
|     #[logging_timer::time("PrefixWordPairsProximityDocids::{}")] | ||||
|     pub fn execute<'a>( | ||||
|         self, | ||||
|         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         new_prefix_fst_words: &'a [String], | ||||
|         common_prefix_fst_words: &[&'a [String]], | ||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
|     ) -> Result<()> { | ||||
|         index_word_prefix_database( | ||||
|             self.wtxn, | ||||
|             self.index.word_pair_proximity_docids, | ||||
|             self.index.word_prefix_pair_proximity_docids, | ||||
|             self.max_proximity, | ||||
|             self.max_prefix_length, | ||||
|             new_word_pair_proximity_docids.clone(), | ||||
|             new_prefix_fst_words, | ||||
|             common_prefix_fst_words, | ||||
|             del_prefix_fst_words, | ||||
|         )?; | ||||
|  | ||||
|         index_prefix_word_database( | ||||
|             self.wtxn, | ||||
|             self.index.word_pair_proximity_docids, | ||||
|             self.index.prefix_word_pair_proximity_docids, | ||||
|             self.max_proximity, | ||||
|             self.max_prefix_length, | ||||
|             new_word_pair_proximity_docids, | ||||
|             new_prefix_fst_words, | ||||
|             common_prefix_fst_words, | ||||
|             del_prefix_fst_words, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
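Compared to the old WordPrefixPairProximityDocids builder it replaces (see the index_documents diff above), this type no longer exposes chunk compression or memory settings; the only remaining knobs are max_proximity and max_prefix_length. Its use at the call site therefore reduces to a single chained call, roughly as follows (a sketch mirroring the diff above; the deletion-set argument name is taken from the execute signature, not from the truncated call site):

    PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute(
        word_pair_proximity_docids,
        &new_prefix_fst_words,
        &common_prefix_fst_words,
        &del_prefix_fst_words,
    )?;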
|  | ||||
| // This is adapted from `sorter_into_lmdb_database` | ||||
| pub fn insert_into_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     new_key: &[u8], | ||||
|     new_value: &[u8], | ||||
| ) -> Result<()> { | ||||
|     let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; | ||||
|     match iter.next().transpose()? { | ||||
|         Some((key, old_val)) if new_key == key => { | ||||
|             let val = | ||||
|                 merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) | ||||
|                     .map_err(|_| { | ||||
|                         // TODO just wrap this error? | ||||
|                         crate::error::InternalError::IndexingMergingKeys { | ||||
|                             process: "get-put-merge", | ||||
|                         } | ||||
|                     })?; | ||||
|             // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour | ||||
|             unsafe { iter.put_current(new_key, &val)? }; | ||||
|         } | ||||
|         _ => { | ||||
|             drop(iter); | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
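The merge performed above is conceptually a union of the existing and the new document id sets: merge_cbo_roaring_bitmaps decodes both CBO-encoded bitmaps and combines them before the result is written back under new_key. A minimal sketch of that semantics, using the roaring crate directly rather than the crate's own codec:

use roaring::RoaringBitmap;

fn main() {
    let old_docids: RoaringBitmap = [100u32, 101].into_iter().collect();
    let new_docids: RoaringBitmap = [101u32, 202].into_iter().collect();
    // The value stored back under `new_key` holds the union of both sets.
    let merged = &old_docids | &new_docids;
    assert_eq!(merged.len(), 3); // {100, 101, 202}
}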
|  | ||||
| // This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, | ||||
| // but it uses `append` if the database is empty, and it assumes that the values in the | ||||
| // writer don't conflict with values in the database. | ||||
| pub fn write_into_lmdb_database_without_merging( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     writer: grenad::Writer<std::fs::File>, | ||||
| ) -> Result<()> { | ||||
|     let file = writer.into_inner()?; | ||||
|     let reader = grenad::Reader::new(BufReader::new(file))?; | ||||
|     if database.is_empty(wtxn)? { | ||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             // safety: the key comes from the grenad reader, not the database | ||||
|             unsafe { out_iter.append(k, v)? }; | ||||
|         } | ||||
|     } else { | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
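The fast path relies on two properties: a grenad reader yields its entries in ascending key order, and appending through an LMDB cursor is only valid when every new key sorts after the keys already present, which trivially holds when the database starts out empty. When the database is not empty, the function falls back to plain `put` calls and leans on the caller's guarantee that the writer's keys do not conflict with existing ones.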
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use crate::db_snap; | ||||
|     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use std::io::Cursor; | ||||
|  | ||||
|     fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> { | ||||
|         let mut documents = Vec::new(); | ||||
|         for prefix in prefixes { | ||||
|             for i in 0..50 { | ||||
|                 documents.push( | ||||
|                     serde_json::json!({ | ||||
|                         "text": format!("{prefix}{i:x}"), | ||||
|                     }) | ||||
|                     .as_object() | ||||
|                     .unwrap() | ||||
|                     .clone(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|         documents | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_update() { | ||||
|         let mut index = TempIndex::new(); | ||||
|         index.index_documents_config.words_prefix_threshold = Some(50); | ||||
|         index.index_documents_config.autogenerate_docids = true; | ||||
|  | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         let batch_reader_from_documents = |documents| { | ||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||
|             for object in documents { | ||||
|                 builder.append_json_object(&object).unwrap(); | ||||
|             } | ||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() | ||||
|         }; | ||||
|  | ||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); | ||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "At an amazing and beautiful house" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "The bell rings at 5 am" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
|  | ||||
|         let documents = batch_reader_from_documents(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "initial"); | ||||
|  | ||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "At an extraordinary house" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
|         let documents = batch_reader_from_documents(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, word_pair_proximity_docids, "update"); | ||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "update"); | ||||
|         db_snap!(index, prefix_word_pair_proximity_docids, "update"); | ||||
|     } | ||||
| } | ||||
							
								
								
									
milli/src/update/prefix_word_pairs/prefix_word.rs (new file, 178 lines)
							| @@ -0,0 +1,178 @@ | ||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; | ||||
| use crate::update::prefix_word_pairs::{ | ||||
|     insert_into_database, write_into_lmdb_database_without_merging, | ||||
| }; | ||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::BytesDecode; | ||||
| use log::debug; | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashSet}; | ||||
|  | ||||
| #[logging_timer::time] | ||||
| pub fn index_prefix_word_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     max_proximity: u8, | ||||
|     max_prefix_length: usize, | ||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|     new_prefix_fst_words: &[String], | ||||
|     common_prefix_fst_words: &[&[String]], | ||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
| ) -> Result<()> { | ||||
|     let max_proximity = max_proximity - 1; | ||||
|     debug!("Computing and writing the prefix word pair proximity docids into LMDB on disk..."); | ||||
|  | ||||
|     let common_prefixes: Vec<_> = common_prefix_fst_words | ||||
|         .into_iter() | ||||
|         .map(|s| s.into_iter()) | ||||
|         .flatten() | ||||
|         .map(|s| s.as_str()) | ||||
|         .filter(|s| s.len() <= max_prefix_length) | ||||
|         .collect(); | ||||
|  | ||||
|     // We iterate over all the new word pairs to look for new | ||||
|     // (proximity, common_prefix, word2) elements to insert in the DB | ||||
|     for proximity in 1..=max_proximity - 1 { | ||||
|         for prefix in common_prefixes.iter() { | ||||
|             let mut prefix_key = vec![]; | ||||
|             prefix_key.push(proximity); | ||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); | ||||
|             let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; | ||||
|             // This is the core of the algorithm | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 proximity + 1, | ||||
|                 prefix.as_bytes(), | ||||
|                 // the next two arguments tell how to iterate over the new word pairs | ||||
|                 &mut cursor, | ||||
|                 |cursor| { | ||||
|                     if let Some((key, value)) = cursor.next()? { | ||||
|                         let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) | ||||
|                             .ok_or(heed::Error::Decoding)?; | ||||
|                         Ok(Some((word2, value))) | ||||
|                     } else { | ||||
|                         Ok(None) | ||||
|                     } | ||||
|                 }, | ||||
|                 // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) | ||||
|                 |key, value| { | ||||
|                     insert_into_database( | ||||
|                         wtxn, | ||||
|                         *prefix_word_pair_proximity_docids.as_polymorph(), | ||||
|                         key, | ||||
|                         value, | ||||
|                     ) | ||||
|                 }, | ||||
|             )?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB | ||||
|     let new_prefixes: Vec<_> = new_prefix_fst_words | ||||
|         .into_iter() | ||||
|         .map(|s| s.as_str()) | ||||
|         .filter(|s| s.len() <= max_prefix_length) | ||||
|         .collect(); | ||||
|  | ||||
|     // Since we read the DB, we can't write to it directly, so we add each new | ||||
|     // (proximity, prefix, word2) element in an intermediary grenad | ||||
|     let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); | ||||
|  | ||||
|     for proximity in 1..=max_proximity - 1 { | ||||
|         for prefix in new_prefixes.iter() { | ||||
|             let mut prefix_key = vec![]; | ||||
|             prefix_key.push(proximity); | ||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); | ||||
|             let mut db_iter = word_pair_proximity_docids | ||||
|                 .as_polymorph() | ||||
|                 .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? | ||||
|                 .remap_key_type::<UncheckedU8StrStrCodec>(); | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 proximity + 1, | ||||
|                 prefix.as_bytes(), | ||||
|                 &mut db_iter, | ||||
|                 |db_iter| { | ||||
|                     db_iter | ||||
|                         .next() | ||||
|                         .transpose() | ||||
|                         .map(|x| x.map(|((_, _, word2), value)| (word2, value))) | ||||
|                         .map_err(|e| e.into()) | ||||
|                 }, | ||||
|                 |key, value| writer.insert(key, value).map_err(|e| e.into()), | ||||
|             )?; | ||||
|             drop(db_iter); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // and then we write the grenad into the DB | ||||
|     // Since the grenad contains only new prefixes, we know in advance that none | ||||
|     // of its elements already exist in the DB, thus there is no need to specify | ||||
|     // how to merge conflicting elements | ||||
|     write_into_lmdb_database_without_merging( | ||||
|         wtxn, | ||||
|         *prefix_word_pair_proximity_docids.as_polymorph(), | ||||
|         writer, | ||||
|     )?; | ||||
|  | ||||
|     // All of the prefix-word pairs in the database whose prefix is | ||||
|     // contained in the `del_prefix_fst_words` set must be removed as well. | ||||
|     if !del_prefix_fst_words.is_empty() { | ||||
|         let mut iter = | ||||
|             prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; | ||||
|         while let Some(((_, prefix, _), _)) = iter.next().transpose()? { | ||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { | ||||
|                 // Delete this entry, as its prefix is no longer in the words prefix fst. | ||||
|                 unsafe { iter.del_current()? }; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. | ||||
| /// | ||||
| /// Its main arguments are: | ||||
| /// 1. a sorted iterator over the (word2, docids) pairs found under a given (proximity, prefix) key prefix | ||||
| /// 2. a closure to describe how to handle the new computed (proximity, prefix, word2) elements | ||||
| /// | ||||
| /// For more information about what this function does, read the module documentation. | ||||
| fn execute_on_word_pairs_and_prefixes<I>( | ||||
|     proximity: u8, | ||||
|     prefix: &[u8], | ||||
|     iter: &mut I, | ||||
|     mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>, | ||||
|     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, | ||||
| ) -> Result<()> { | ||||
|     let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default(); | ||||
|  | ||||
|     while let Some((word2, data)) = next_word2_and_docids(iter)? { | ||||
|         let entry = batch.entry(word2.to_owned()).or_default(); | ||||
|         entry.push(Cow::Owned(data.to_owned())); | ||||
|     } | ||||
|  | ||||
|     let mut key_buffer = Vec::with_capacity(8); | ||||
|     key_buffer.push(proximity); | ||||
|     key_buffer.extend_from_slice(prefix); | ||||
|     key_buffer.push(0); | ||||
|  | ||||
|     let mut value_buffer = Vec::with_capacity(65_536); | ||||
|  | ||||
|     for (key, values) in batch { | ||||
|         key_buffer.truncate(prefix.len() + 2); | ||||
|         value_buffer.clear(); | ||||
|  | ||||
|         key_buffer.extend_from_slice(&key); | ||||
|         let data = if values.len() > 1 { | ||||
|             CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; | ||||
|             value_buffer.as_slice() | ||||
|         } else { | ||||
|             &values[0] | ||||
|         }; | ||||
|         insert(key_buffer.as_slice(), data)?; | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
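To make the batching above concrete, here is a small self-contained sketch (plain std Rust, not the crate's code; `batch_and_emit` is a name made up for this illustration) of what `execute_on_word_pairs_and_prefixes` does for one (proximity, prefix) pair: group every docids payload seen for the same word2, then emit one key laid out as proximity byte, prefix, 0 separator, word2. The merge with CboRoaringBitmapCodec::merge_into is stubbed out.

use std::collections::BTreeMap;

// `pairs` stands in for the sorted (word2, serialized docids) stream that the
// real code pulls from a grenad cursor or an LMDB prefix iterator.
fn batch_and_emit(
    proximity: u8,
    prefix: &[u8],
    pairs: &[(&[u8], &[u8])],
    mut insert: impl FnMut(&[u8], &[u8]),
) {
    // Group all payloads by word2, as the real code does with its `batch` BTreeMap.
    let mut batch: BTreeMap<Vec<u8>, Vec<Vec<u8>>> = BTreeMap::new();
    for (word2, data) in pairs {
        batch.entry(word2.to_vec()).or_default().push(data.to_vec());
    }
    // Key layout: proximity byte, prefix bytes, a 0 separator, then word2.
    let mut key = vec![proximity];
    key.extend_from_slice(prefix);
    key.push(0);
    let base_len = key.len();
    for (word2, values) in batch {
        key.truncate(base_len);
        key.extend_from_slice(&word2);
        // The real code merges multiple payloads with CboRoaringBitmapCodec::merge_into;
        // this sketch simply forwards the first one.
        insert(key.as_slice(), values[0].as_slice());
    }
}

fn main() {
    let mut emitted = Vec::new();
    batch_and_emit(
        2,
        b"an",
        &[(&b"amazing"[..], &b"docids-a"[..]), (&b"house"[..], &b"docids-b"[..])],
        |k, v| emitted.push((k.to_vec(), v.to_vec())),
    );
    assert_eq!(emitted.len(), 2);
}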
| @@ -0,0 +1,46 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 1  5                a    [101, ] | ||||
| 1  amazing          a    [100, ] | ||||
| 1  an               a    [100, ] | ||||
| 1  and              b    [100, ] | ||||
| 1  and              be   [100, ] | ||||
| 1  at               a    [100, ] | ||||
| 1  rings            a    [101, ] | ||||
| 1  the              b    [101, ] | ||||
| 1  the              be   [101, ] | ||||
| 2  5                a    [101, ] | ||||
| 2  amazing          a    [100, ] | ||||
| 2  amazing          b    [100, ] | ||||
| 2  amazing          be   [100, ] | ||||
| 2  an               a    [100, ] | ||||
| 2  and              a    [100, ] | ||||
| 2  at               a    [100, 101, ] | ||||
| 2  beautiful        a    [100, ] | ||||
| 2  bell             a    [101, ] | ||||
| 2  house            b    [100, ] | ||||
| 2  house            be   [100, ] | ||||
| 2  rings            b    [101, ] | ||||
| 2  rings            be   [101, ] | ||||
| 3  am               a    [101, ] | ||||
| 3  amazing          a    [100, ] | ||||
| 3  an               b    [100, ] | ||||
| 3  an               be   [100, ] | ||||
| 3  and              a    [100, ] | ||||
| 3  at               a    [100, ] | ||||
| 3  at               b    [101, ] | ||||
| 3  at               be   [101, ] | ||||
| 3  beautiful        a    [100, ] | ||||
| 3  house            a    [100, ] | ||||
| 3  rings            a    [101, ] | ||||
| 3  the              a    [101, ] | ||||
| 4  5                b    [101, ] | ||||
| 4  5                be   [101, ] | ||||
| 4  and              a    [100, ] | ||||
| 4  at               b    [100, ] | ||||
| 4  at               be   [100, ] | ||||
| 4  beautiful        a    [100, ] | ||||
| 4  bell             a    [101, ] | ||||
| 4  house            a    [100, ] | ||||
|  | ||||
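Each row of this snapshot is a (proximity, word1, prefix) key followed by its docids bitmap. For example, the row `1  amazing  a  [100, ]` records that in document 100 ("At an amazing and beautiful house" in the test above) the word "amazing" is immediately followed by a word starting with the prefix "a" (here, "and").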
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| fb88e49fd666886731b62baef8f44995 | ||||
| @@ -0,0 +1,41 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 2  a    5                [101, ] | ||||
| 2  a    amazing          [100, ] | ||||
| 2  a    an               [100, 202, ] | ||||
| 2  a    and              [100, ] | ||||
| 2  a    beautiful        [100, ] | ||||
| 2  a    extraordinary    [202, ] | ||||
| 2  am   and              [100, ] | ||||
| 2  an   amazing          [100, ] | ||||
| 2  an   beautiful        [100, ] | ||||
| 2  an   extraordinary    [202, ] | ||||
| 2  b    house            [100, ] | ||||
| 2  b    rings            [101, ] | ||||
| 2  be   house            [100, ] | ||||
| 2  be   rings            [101, ] | ||||
| 3  a    5                [101, ] | ||||
| 3  a    am               [101, ] | ||||
| 3  a    amazing          [100, ] | ||||
| 3  a    an               [100, ] | ||||
| 3  a    and              [100, ] | ||||
| 3  a    at               [100, 202, ] | ||||
| 3  a    beautiful        [100, ] | ||||
| 3  a    extraordinary    [202, ] | ||||
| 3  a    house            [100, 202, ] | ||||
| 3  a    rings            [101, ] | ||||
| 3  am   5                [101, ] | ||||
| 3  am   an               [100, ] | ||||
| 3  am   beautiful        [100, ] | ||||
| 3  an   amazing          [100, ] | ||||
| 3  an   and              [100, ] | ||||
| 3  an   at               [100, 202, ] | ||||
| 3  an   house            [100, 202, ] | ||||
| 3  b    and              [100, ] | ||||
| 3  b    at               [101, ] | ||||
| 3  b    the              [101, ] | ||||
| 3  be   and              [100, ] | ||||
| 3  be   at               [101, ] | ||||
| 3  be   the              [101, ] | ||||
|  | ||||
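This snapshot is read the same way, but with the (proximity, prefix, word2) keys of the new database introduced by this commit: for instance, `2  a  amazing  [100, ]` comes from the pair ("an", "amazing") in document 100, with the prefix "a" standing in for the left-hand word (the stored proximity is the word pair's proximity plus one, as done in prefix_word.rs above).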
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 6965ecd1bf821f1cf921c2ab751b36cf | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| fb88e49fd666886731b62baef8f44995 | ||||
| @@ -1,5 +1,5 @@ | ||||
| /*! | ||||
|  ## What is WordPrefixPairProximityDocids? | ||||
|  ## What is WordPrefix? | ||||
| The word-prefix-pair-proximity-docids database is a database whose keys are of | ||||
| the form `(proximity, word, prefix)` and the values are roaring bitmaps of | ||||
| the documents which contain `word` followed by another word starting with | ||||
| @@ -139,7 +139,7 @@ inputs described above, which come from different places: | ||||
| 
 | ||||
|     2. `word_pairs_db`, which is the list of word pairs from the database. | ||||
|     This list includes all elements in `new_word_pairs` since `new_word_pairs` | ||||
|     was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` | ||||
|     was added to the database prior to calling the `WordPrefix::execute` | ||||
|     function. | ||||
| 
 | ||||
| To update the prefix database correctly, we call the algorithm described earlier first | ||||
| @@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on | ||||
| `((proximity, word, prefix), docids)` elements in an intermediary grenad | ||||
| Writer instead of the DB. At the end of the outer loop, we finally read from | ||||
| the grenad and insert its elements in the database. | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| */ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| use std::io::BufReader; | ||||
| 
 | ||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; | ||||
| use crate::update::prefix_word_pairs::{ | ||||
|     insert_into_database, write_into_lmdb_database_without_merging, | ||||
| }; | ||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::BytesDecode; | ||||
| use log::debug; | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| 
 | ||||
| use crate::update::index_documents::{ | ||||
|     create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, | ||||
| }; | ||||
| use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; | ||||
| 
 | ||||
| pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
| #[logging_timer::time] | ||||
| pub fn index_word_prefix_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     max_proximity: u8, | ||||
|     max_prefix_length: usize, | ||||
| } | ||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|     new_prefix_fst_words: &[String], | ||||
|     common_prefix_fst_words: &[&[String]], | ||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
| ) -> Result<()> { | ||||
|     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); | ||||
| 
 | ||||
| impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|         WordPrefixPairProximityDocids { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             max_proximity: 4, | ||||
|             max_prefix_length: 2, | ||||
|         } | ||||
|     } | ||||
|     // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length | ||||
|     let prefixes = PrefixTrieNode::from_sorted_prefixes( | ||||
|         common_prefix_fst_words | ||||
|             .into_iter() | ||||
|             .map(|s| s.into_iter()) | ||||
|             .flatten() | ||||
|             .map(|s| s.as_str()) | ||||
|             .filter(|s| s.len() <= max_prefix_length), | ||||
|     ); | ||||
| 
 | ||||
|     /// Set the maximum proximity required to make a prefix be part of the words prefixes | ||||
|     /// database. If two words are too far from the threshold the associated documents will | ||||
|     /// not be part of the prefix database. | ||||
|     /// | ||||
|     /// Default value is 4. This value must be lower or equal than 7 and will be clamped | ||||
|     /// to this bound otherwise. | ||||
|     pub fn max_proximity(&mut self, value: u8) -> &mut Self { | ||||
|         self.max_proximity = value.max(7); | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words | ||||
|     /// prefixes database. If the prefix length is higher than the threshold, the associated documents | ||||
|     /// will not be part of the prefix database. | ||||
|     /// | ||||
|     /// Default value is 2. | ||||
|     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { | ||||
|         self.max_prefix_length = value; | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     #[logging_timer::time("WordPrefixPairProximityDocids::{}")] | ||||
|     pub fn execute<'a>( | ||||
|         mut self, | ||||
|         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         new_prefix_fst_words: &'a [String], | ||||
|         common_prefix_fst_words: &[&'a [String]], | ||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
|     ) -> Result<()> { | ||||
|         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); | ||||
| 
 | ||||
|         // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length | ||||
|         let prefixes = PrefixTrieNode::from_sorted_prefixes( | ||||
|             common_prefix_fst_words | ||||
|                 .into_iter() | ||||
|                 .map(|s| s.into_iter()) | ||||
|                 .flatten() | ||||
|                 .map(|s| s.as_str()) | ||||
|                 .filter(|s| s.len() <= self.max_prefix_length), | ||||
|         ); | ||||
| 
 | ||||
|         // If the prefix trie is not empty, then we can iterate over all new | ||||
|         // word pairs to look for new (word1, common_prefix, proximity) elements | ||||
|         // to insert in the DB | ||||
|         if !prefixes.is_empty() { | ||||
|             let mut cursor = new_word_pair_proximity_docids.into_cursor()?; | ||||
|             // This is the core of the algorithm | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 // the first two arguments tell how to iterate over the new word pairs | ||||
|                 &mut cursor, | ||||
|                 |cursor| { | ||||
|                     if let Some((key, value)) = cursor.move_on_next()? { | ||||
|                         let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) | ||||
|                             .ok_or(heed::Error::Decoding)?; | ||||
|                         Ok(Some(((proximity, word1, word2), value))) | ||||
|                     } else { | ||||
|                         Ok(None) | ||||
|                     } | ||||
|                 }, | ||||
|                 &prefixes, | ||||
|                 self.max_proximity, | ||||
|                 // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) | ||||
|                 |key, value| { | ||||
|                     insert_into_database( | ||||
|                         &mut self.wtxn, | ||||
|                         *self.index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|                         key, | ||||
|                         value, | ||||
|                     ) | ||||
|                 }, | ||||
|             )?; | ||||
|         } | ||||
| 
 | ||||
|         // Now we do the same thing with the new prefixes and all word pairs in the DB | ||||
| 
 | ||||
|         let prefixes = PrefixTrieNode::from_sorted_prefixes( | ||||
|             new_prefix_fst_words | ||||
|                 .into_iter() | ||||
|                 .map(|s| s.as_str()) | ||||
|                 .filter(|s| s.len() <= self.max_prefix_length), | ||||
|         ); | ||||
| 
 | ||||
|         if !prefixes.is_empty() { | ||||
|             let mut db_iter = self | ||||
|                 .index | ||||
|                 .word_pair_proximity_docids | ||||
|                 .remap_key_type::<UncheckedU8StrStrCodec>() | ||||
|                 .remap_data_type::<ByteSlice>() | ||||
|                 .iter(self.wtxn)?; | ||||
| 
 | ||||
|             // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) | ||||
|             // element in an intermediary grenad | ||||
|             let mut writer = create_writer( | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
|                 tempfile::tempfile()?, | ||||
|             ); | ||||
| 
 | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 &mut db_iter, | ||||
|                 |db_iter| db_iter.next().transpose().map_err(|e| e.into()), | ||||
|                 &prefixes, | ||||
|                 self.max_proximity, | ||||
|                 |key, value| writer.insert(key, value).map_err(|e| e.into()), | ||||
|             )?; | ||||
|             drop(db_iter); | ||||
| 
 | ||||
|             // and then we write the grenad into the DB | ||||
|             // Since the grenad contains only new prefixes, we know in advance that none | ||||
|             // of its elements already exist in the DB, thus there is no need to specify | ||||
|             // how to merge conflicting elements | ||||
|             write_into_lmdb_database_without_merging( | ||||
|                 self.wtxn, | ||||
|                 *self.index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|                 writer, | ||||
|             )?; | ||||
|         } | ||||
| 
 | ||||
|         // All of the word prefix pairs in the database that have a w2 | ||||
|         // that is contained in the `suppr_pw` set must be removed as well. | ||||
|         if !del_prefix_fst_words.is_empty() { | ||||
|             let mut iter = self | ||||
|                 .index | ||||
|                 .word_prefix_pair_proximity_docids | ||||
|                 .remap_data_type::<ByteSlice>() | ||||
|                 .iter_mut(self.wtxn)?; | ||||
|             while let Some(((_, w2, _), _)) = iter.next().transpose()? { | ||||
|                 if del_prefix_fst_words.contains(w2.as_bytes()) { | ||||
|                     // Delete this entry as the w2 prefix is no more in the words prefix fst. | ||||
|                     unsafe { iter.del_current()? }; | ||||
|     // If the prefix trie is not empty, then we can iterate over all new | ||||
|     // word pairs to look for new (proximity, word1, common_prefix) elements | ||||
|     // to insert in the DB | ||||
|     if !prefixes.is_empty() { | ||||
|         let mut cursor = new_word_pair_proximity_docids.into_cursor()?; | ||||
|         // This is the core of the algorithm | ||||
|         execute_on_word_pairs_and_prefixes( | ||||
|             // the first two arguments tell how to iterate over the new word pairs | ||||
|             &mut cursor, | ||||
|             |cursor| { | ||||
|                 if let Some((key, value)) = cursor.move_on_next()? { | ||||
|                     let (proximity, word1, word2) = | ||||
|                         UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; | ||||
|                     Ok(Some(((proximity, word1, word2), value))) | ||||
|                 } else { | ||||
|                     Ok(None) | ||||
|                 } | ||||
|             }, | ||||
|             &prefixes, | ||||
|             max_proximity, | ||||
|             // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) | ||||
|             |key, value| { | ||||
|                 insert_into_database( | ||||
|                     wtxn, | ||||
|                     *word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|                     key, | ||||
|                     value, | ||||
|                 ) | ||||
|             }, | ||||
|         )?; | ||||
|     } | ||||
| 
 | ||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB | ||||
| 
 | ||||
|     let prefixes = PrefixTrieNode::from_sorted_prefixes( | ||||
|         new_prefix_fst_words | ||||
|             .into_iter() | ||||
|             .map(|s| s.as_str()) | ||||
|             .filter(|s| s.len() <= max_prefix_length), | ||||
|     ); | ||||
| 
 | ||||
|     if !prefixes.is_empty() { | ||||
|         let mut db_iter = word_pair_proximity_docids | ||||
|             .remap_key_type::<UncheckedU8StrStrCodec>() | ||||
|             .remap_data_type::<ByteSlice>() | ||||
|             .iter(wtxn)?; | ||||
| 
 | ||||
|         // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) | ||||
|         // element in an intermediary grenad | ||||
|         let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); | ||||
| 
 | ||||
|         execute_on_word_pairs_and_prefixes( | ||||
|             &mut db_iter, | ||||
|             |db_iter| db_iter.next().transpose().map_err(|e| e.into()), | ||||
|             &prefixes, | ||||
|             max_proximity, | ||||
|             |key, value| writer.insert(key, value).map_err(|e| e.into()), | ||||
|         )?; | ||||
|         drop(db_iter); | ||||
| 
 | ||||
|         // and then we write the grenad into the DB | ||||
|         // Since the grenad contains only new prefixes, we know in advance that none | ||||
|         // of its elements already exist in the DB, thus there is no need to specify | ||||
|         // how to merge conflicting elements | ||||
|         write_into_lmdb_database_without_merging( | ||||
|             wtxn, | ||||
|             *word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             writer, | ||||
|         )?; | ||||
|     } | ||||
| 
 | ||||
|     // All of the word prefix pairs in the database that have a w2 | ||||
|     // that is contained in the `suppr_pw` set must be removed as well. | ||||
|     if !del_prefix_fst_words.is_empty() { | ||||
|         let mut iter = | ||||
|             word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; | ||||
|         while let Some(((_, _, prefix), _)) = iter.next().transpose()? { | ||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { | ||||
|                 // Delete this entry as the w2 prefix is no longer in the words prefix fst. | ||||
|                 unsafe { iter.del_current()? }; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         Ok(()) | ||||
|     } | ||||
| 
 | ||||
|     Ok(()) | ||||
| } | ||||
| 
 | ||||
| /// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. | ||||
| /// | ||||
| /// Its main arguments are: | ||||
| /// 1. a sorted iterator over ((word1, word2, proximity), docids) elements | ||||
| /// 1. a sorted iterator over ((proximity, word1, word2), docids) elements | ||||
| /// 2. a prefix trie | ||||
| /// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements | ||||
| /// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements | ||||
| /// | ||||
| /// For more information about what this function does, read the module documentation. | ||||
| fn execute_on_word_pairs_and_prefixes<I>( | ||||
| @@ -495,61 +436,6 @@ impl PrefixAndProximityBatch { | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| // This is adapted from `sorter_into_lmdb_database` | ||||
| fn insert_into_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     new_key: &[u8], | ||||
|     new_value: &[u8], | ||||
| ) -> Result<()> { | ||||
|     let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; | ||||
|     match iter.next().transpose()? { | ||||
|         Some((key, old_val)) if new_key == key => { | ||||
|             let val = | ||||
|                 merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) | ||||
|                     .map_err(|_| { | ||||
|                         // TODO just wrap this error? | ||||
|                         crate::error::InternalError::IndexingMergingKeys { | ||||
|                             process: "get-put-merge", | ||||
|                         } | ||||
|                     })?; | ||||
|             // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour | ||||
|             unsafe { iter.put_current(new_key, &val)? }; | ||||
|         } | ||||
|         _ => { | ||||
|             drop(iter); | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
| 
 | ||||
| // This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, | ||||
| // but it uses `append` if the database is empty, and it assumes that the values in the | ||||
| // writer don't conflict with values in the database. | ||||
| pub fn write_into_lmdb_database_without_merging( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     writer: grenad::Writer<std::fs::File>, | ||||
| ) -> Result<()> { | ||||
|     let file = writer.into_inner()?; | ||||
|     let reader = grenad::Reader::new(BufReader::new(file))?; | ||||
|     if database.is_empty(wtxn)? { | ||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             // safety: the key comes from the grenad reader, not the database | ||||
|             unsafe { out_iter.append(k, v)? }; | ||||
|         } | ||||
|     } else { | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
| 
 | ||||
| /** A prefix trie. Used to iterate quickly over the prefixes of a word that are | ||||
| within a set. | ||||
| 
 | ||||
| @@ -676,90 +562,9 @@ impl PrefixTrieNode { | ||||
| } | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use std::io::Cursor; | ||||
| 
 | ||||
|     use roaring::RoaringBitmap; | ||||
| 
 | ||||
|     use super::*; | ||||
|     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; | ||||
| 
 | ||||
|     fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> { | ||||
|         let mut documents = Vec::new(); | ||||
|         for prefix in prefixes { | ||||
|             for i in 0..50 { | ||||
|                 documents.push( | ||||
|                     serde_json::json!({ | ||||
|                         "text": format!("{prefix}{i:x}"), | ||||
|                     }) | ||||
|                     .as_object() | ||||
|                     .unwrap() | ||||
|                     .clone(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|         documents | ||||
|     } | ||||
| 
 | ||||
|     #[test] | ||||
|     fn test_update() { | ||||
|         let mut index = TempIndex::new(); | ||||
|         index.index_documents_config.words_prefix_threshold = Some(50); | ||||
|         index.index_documents_config.autogenerate_docids = true; | ||||
| 
 | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_searchable_fields(vec!["text".to_owned()]); | ||||
|             }) | ||||
|             .unwrap(); | ||||
| 
 | ||||
|         let batch_reader_from_documents = |documents| { | ||||
|             let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||
|             for object in documents { | ||||
|                 builder.append_json_object(&object).unwrap(); | ||||
|             } | ||||
|             DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() | ||||
|         }; | ||||
| 
 | ||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); | ||||
|         // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "At an amazing and beautiful house" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "The bell rings at 5 am" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
| 
 | ||||
|         let documents = batch_reader_from_documents(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
| 
 | ||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "initial"); | ||||
| 
 | ||||
|         let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); | ||||
|         documents.push( | ||||
|             serde_json::json!({ | ||||
|                 "text": "At an extraordinary house" | ||||
|             }) | ||||
|             .as_object() | ||||
|             .unwrap() | ||||
|             .clone(), | ||||
|         ); | ||||
|         let documents = batch_reader_from_documents(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
| 
 | ||||
|         db_snap!(index, word_prefix_pair_proximity_docids, "update"); | ||||
|     } | ||||
|     use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; | ||||
|     use roaring::RoaringBitmap; | ||||
| 
 | ||||
|     fn check_prefixes( | ||||
|         trie: &PrefixTrieNode, | ||||
| @@ -899,9 +704,9 @@ mod tests { | ||||
|             &prefixes, | ||||
|             2, | ||||
|             |k, v| { | ||||
|                 let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); | ||||
|                 let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); | ||||
|                 let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); | ||||
|                 result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); | ||||
|                 result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); | ||||
|                 Ok(()) | ||||
|             }, | ||||
|         ) | ||||