mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Fix indexing of word_prefix_fid_docids
This commit is contained in:
		| @@ -36,7 +36,7 @@ use crate::error::{Error, InternalError, UserError}; | ||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||
| use crate::update::{ | ||||
|     self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, | ||||
|     WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||
|     WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, | ||||
| }; | ||||
| use crate::{Index, Result, RoaringBitmapCodec}; | ||||
|  | ||||
| @@ -373,6 +373,7 @@ where | ||||
|         let mut final_documents_ids = RoaringBitmap::new(); | ||||
|         let mut word_pair_proximity_docids = None; | ||||
|         let mut word_position_docids = None; | ||||
|         let mut word_fid_docids = None; | ||||
|         let mut word_docids = None; | ||||
|         let mut exact_word_docids = None; | ||||
|  | ||||
| @@ -406,6 +407,11 @@ where | ||||
|                     word_position_docids = Some(cloneable_chunk); | ||||
|                     TypedChunk::WordPositionDocids(chunk) | ||||
|                 } | ||||
|                 TypedChunk::WordFidDocids(chunk) => { | ||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; | ||||
|                     word_fid_docids = Some(cloneable_chunk); | ||||
|                     TypedChunk::WordFidDocids(chunk) | ||||
|                 } | ||||
|                 otherwise => otherwise, | ||||
|             }; | ||||
|  | ||||
| @@ -449,6 +455,7 @@ where | ||||
|             exact_word_docids, | ||||
|             word_pair_proximity_docids, | ||||
|             word_position_docids, | ||||
|             word_fid_docids, | ||||
|         )?; | ||||
|  | ||||
|         Ok(all_documents_ids.len()) | ||||
| @@ -461,6 +468,7 @@ where | ||||
|         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||
|         word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||
|         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||
|         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||
|     ) -> Result<()> | ||||
|     where | ||||
|         FP: Fn(UpdateIndexingStep) + Sync, | ||||
| @@ -595,17 +603,16 @@ where | ||||
|  | ||||
|         if let Some(word_position_docids) = word_position_docids { | ||||
|             // Run the words prefix position docids update operation. | ||||
|             let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); | ||||
|             let mut builder = WordPrefixIntegerDocids::new( | ||||
|                 self.wtxn, | ||||
|                 self.index.word_prefix_position_docids, | ||||
|                 self.index.word_position_docids, | ||||
|             ); | ||||
|             builder.chunk_compression_type = self.indexer_config.chunk_compression_type; | ||||
|             builder.chunk_compression_level = self.indexer_config.chunk_compression_level; | ||||
|             builder.max_nb_chunks = self.indexer_config.max_nb_chunks; | ||||
|             builder.max_memory = self.indexer_config.max_memory; | ||||
|             if let Some(value) = self.config.words_positions_level_group_size { | ||||
|                 builder.level_group_size(value); | ||||
|             } | ||||
|             if let Some(value) = self.config.words_positions_min_level_size { | ||||
|                 builder.min_level_size(value); | ||||
|             } | ||||
|  | ||||
|             builder.execute( | ||||
|                 word_position_docids, | ||||
|                 &new_prefix_fst_words, | ||||
| @@ -613,6 +620,24 @@ where | ||||
|                 &del_prefix_fst_words, | ||||
|             )?; | ||||
|         } | ||||
|         if let Some(word_fid_docids) = word_fid_docids { | ||||
|             // Run the words prefix fid docids update operation. | ||||
|             let mut builder = WordPrefixIntegerDocids::new( | ||||
|                 self.wtxn, | ||||
|                 self.index.word_prefix_fid_docids, | ||||
|                 self.index.word_fid_docids, | ||||
|             ); | ||||
|             builder.chunk_compression_type = self.indexer_config.chunk_compression_type; | ||||
|             builder.chunk_compression_level = self.indexer_config.chunk_compression_level; | ||||
|             builder.max_nb_chunks = self.indexer_config.max_nb_chunks; | ||||
|             builder.max_memory = self.indexer_config.max_memory; | ||||
|             builder.execute( | ||||
|                 word_fid_docids, | ||||
|                 &new_prefix_fst_words, | ||||
|                 &common_prefix_fst_words, | ||||
|                 &del_prefix_fst_words, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         if (self.should_abort)() { | ||||
|             return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||
|   | ||||
| @@ -14,7 +14,7 @@ pub use self::prefix_word_pairs::{ | ||||
| pub use self::settings::{Setting, Settings}; | ||||
| pub use self::update_step::UpdateIndexingStep; | ||||
| pub use self::word_prefix_docids::WordPrefixDocids; | ||||
| pub use self::words_prefix_position_docids::WordPrefixPositionDocids; | ||||
| pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids; | ||||
| pub use self::words_prefixes_fst::WordsPrefixesFst; | ||||
|  | ||||
| mod available_documents_ids; | ||||
| @@ -27,5 +27,5 @@ mod prefix_word_pairs; | ||||
| mod settings; | ||||
| mod update_step; | ||||
| mod word_prefix_docids; | ||||
| mod words_prefix_position_docids; | ||||
| mod words_prefix_integer_docids; | ||||
| mod words_prefixes_fst; | ||||
|   | ||||
| @@ -1,10 +1,9 @@ | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::num::NonZeroU32; | ||||
| use std::{cmp, str}; | ||||
| use std::str; | ||||
| 
 | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::{BytesDecode, BytesEncode, Database}; | ||||
| use log::debug; | ||||
| 
 | ||||
| use crate::error::SerializationError; | ||||
| @@ -14,57 +13,46 @@ use crate::update::index_documents::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, | ||||
|     CursorClonableMmap, MergeFn, | ||||
| }; | ||||
| use crate::{Index, Result}; | ||||
| use crate::{CboRoaringBitmapCodec, Result}; | ||||
| 
 | ||||
| pub struct WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
| pub struct WordPrefixIntegerDocids<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|     word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     level_group_size: NonZeroU32, | ||||
|     min_level_size: NonZeroU32, | ||||
| } | ||||
| 
 | ||||
| impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
| impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|     ) -> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|         WordPrefixPositionDocids { | ||||
|         prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|         word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|     ) -> WordPrefixIntegerDocids<'t, 'u, 'i> { | ||||
|         WordPrefixIntegerDocids { | ||||
|             wtxn, | ||||
|             index, | ||||
|             prefix_database, | ||||
|             word_database, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             level_group_size: NonZeroU32::new(4).unwrap(), | ||||
|             min_level_size: NonZeroU32::new(5).unwrap(), | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { | ||||
|         self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { | ||||
|         self.min_level_size = value; | ||||
|         self | ||||
|     } | ||||
| 
 | ||||
|     #[logging_timer::time("WordPrefixPositionDocids::{}")] | ||||
|     #[logging_timer::time("WordPrefixIntegerDocids::{}")] | ||||
|     pub fn execute( | ||||
|         self, | ||||
|         new_word_position_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         new_word_integer_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         new_prefix_fst_words: &[String], | ||||
|         common_prefix_fst_words: &[&[String]], | ||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
|     ) -> Result<()> { | ||||
|         debug!("Computing and writing the word levels positions docids into LMDB on disk..."); | ||||
|         debug!("Computing and writing the word levels integers docids into LMDB on disk..."); | ||||
| 
 | ||||
|         let mut prefix_position_docids_sorter = create_sorter( | ||||
|         let mut prefix_integer_docids_sorter = create_sorter( | ||||
|             grenad::SortAlgorithm::Unstable, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|             self.chunk_compression_type, | ||||
| @@ -73,14 +61,14 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|             self.max_memory, | ||||
|         ); | ||||
| 
 | ||||
|         let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; | ||||
|         let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?; | ||||
| 
 | ||||
|         if !common_prefix_fst_words.is_empty() { | ||||
|             // We fetch all the new common prefixes between the previous and new prefix fst.
 | ||||
|             let mut buffer = Vec::new(); | ||||
|             let mut current_prefixes: Option<&&[String]> = None; | ||||
|             let mut prefixes_cache = HashMap::new(); | ||||
|             while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { | ||||
|             while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? { | ||||
|                 let (word, pos) = StrBEU16Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; | ||||
| 
 | ||||
|                 current_prefixes = match current_prefixes.take() { | ||||
| @@ -88,7 +76,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|                     _otherwise => { | ||||
|                         write_prefixes_in_sorter( | ||||
|                             &mut prefixes_cache, | ||||
|                             &mut prefix_position_docids_sorter, | ||||
|                             &mut prefix_integer_docids_sorter, | ||||
|                         )?; | ||||
|                         common_prefix_fst_words | ||||
|                             .iter() | ||||
| @@ -101,6 +89,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|                         if word.starts_with(prefix) { | ||||
|                             buffer.clear(); | ||||
|                             buffer.extend_from_slice(prefix.as_bytes()); | ||||
|                             buffer.push(0); | ||||
|                             buffer.extend_from_slice(&pos.to_be_bytes()); | ||||
|                             match prefixes_cache.get_mut(&buffer) { | ||||
|                                 Some(value) => value.push(data.to_owned()), | ||||
| @@ -113,11 +102,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; | ||||
|             write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_integer_docids_sorter)?; | ||||
|         } | ||||
| 
 | ||||
|         // We fetch the docids associated with the newly added word prefixes only.
 | ||||
|         let db = self.index.word_position_docids.remap_data_type::<ByteSlice>(); | ||||
|         let db = self.word_database.remap_data_type::<ByteSlice>(); | ||||
|         for prefix_bytes in new_prefix_fst_words { | ||||
|             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { | ||||
|                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } | ||||
| @@ -133,19 +122,18 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|                 if word.starts_with(prefix) { | ||||
|                     let key = (prefix, pos); | ||||
|                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); | ||||
|                     prefix_position_docids_sorter.insert(bytes, data)?; | ||||
|                     prefix_integer_docids_sorter.insert(bytes, data)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // We remove all the entries that are no more required in this word prefix position
 | ||||
|         // We remove all the entries that are no longer required in this word prefix integer
 | ||||
|         // docids database.
 | ||||
|         // We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
 | ||||
|         // We also avoid iterating over the whole prefix database (`self.prefix_database`) if we know in
 | ||||
|         // advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
 | ||||
|         // will always be false (i.e. if `del_prefix_fst_words` is empty).
 | ||||
|         if !del_prefix_fst_words.is_empty() { | ||||
|             let mut iter = | ||||
|                 self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); | ||||
|             let mut iter = self.prefix_database.iter_mut(self.wtxn)?.lazily_decode_data(); | ||||
|             while let Some(((prefix, _), _)) = iter.next().transpose()? { | ||||
|                 if del_prefix_fst_words.contains(prefix.as_bytes()) { | ||||
|                     unsafe { iter.del_current()? }; | ||||
| @@ -154,11 +142,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | ||||
|             drop(iter); | ||||
|         } | ||||
| 
 | ||||
|         // We finally write all the word prefix position docids into the LMDB database.
 | ||||
|         // We finally write all the word prefix integer docids into the LMDB database.
 | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_position_docids.as_polymorph(), | ||||
|             prefix_position_docids_sorter, | ||||
|             *self.prefix_database.as_polymorph(), | ||||
|             prefix_integer_docids_sorter, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|         )?; | ||||
| 
 | ||||
		Reference in New Issue
	
	Block a user