mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	introduce exact_word_docids db
This commit is contained in:
		| @@ -59,6 +59,7 @@ pub mod main_key { | ||||
| pub mod db_name { | ||||
|     pub const MAIN: &str = "main"; | ||||
|     pub const WORD_DOCIDS: &str = "word-docids"; | ||||
|     pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; | ||||
|     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; | ||||
|     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; | ||||
|     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; | ||||
| @@ -83,6 +84,10 @@ pub struct Index { | ||||
|  | ||||
|     /// A word and all the documents ids containing the word. | ||||
|     pub word_docids: Database<Str, RoaringBitmapCodec>, | ||||
|  | ||||
|     /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. | ||||
|     pub exact_word_docids: Database<Str, RoaringBitmapCodec>, | ||||
|  | ||||
|     /// A prefix of word and all the documents ids containing this prefix. | ||||
|     pub word_prefix_docids: Database<Str, RoaringBitmapCodec>, | ||||
|  | ||||
| @@ -119,12 +124,13 @@ impl Index { | ||||
|     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> { | ||||
|         use db_name::*; | ||||
|  | ||||
|         options.max_dbs(14); | ||||
|         options.max_dbs(15); | ||||
|         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; | ||||
|  | ||||
|         let env = options.open(path)?; | ||||
|         let main = env.create_poly_database(Some(MAIN))?; | ||||
|         let word_docids = env.create_database(Some(WORD_DOCIDS))?; | ||||
|         let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; | ||||
|         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; | ||||
|         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; | ||||
|         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; | ||||
| @@ -146,6 +152,7 @@ impl Index { | ||||
|             env, | ||||
|             main, | ||||
|             word_docids, | ||||
|             exact_word_docids, | ||||
|             word_prefix_docids, | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|   | ||||
| @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|             env: _env, | ||||
|             main: _main, | ||||
|             word_docids, | ||||
|             exact_word_docids, | ||||
|             word_prefix_docids, | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
| @@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|  | ||||
|         // Clear the other databases. | ||||
|         word_docids.clear(self.wtxn)?; | ||||
|         exact_word_docids.clear(self.wtxn)?; | ||||
|         word_prefix_docids.clear(self.wtxn)?; | ||||
|         docid_word_positions.clear(self.wtxn)?; | ||||
|         word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; | ||||
| use std::collections::HashMap; | ||||
|  | ||||
| use fst::IntoStreamer; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::types::{ByteSlice, Str}; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| @@ -16,7 +16,10 @@ use crate::heed_codec::facet::{ | ||||
| }; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::index::{db_name, main_key}; | ||||
| use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; | ||||
| use crate::{ | ||||
|     DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, | ||||
|     BEU32, | ||||
| }; | ||||
|  | ||||
| pub struct DeleteDocuments<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
| @@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             env: _env, | ||||
|             main: _main, | ||||
|             word_docids, | ||||
|             exact_word_docids, | ||||
|             word_prefix_docids, | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
| @@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|         // We iterate over the words and delete the documents ids | ||||
|         // from the word docids database. | ||||
|         for (word, must_remove) in &mut words { | ||||
|             // We create an iterator to be able to get the content and delete the word docids. | ||||
|             // It's faster to acquire a cursor to get and delete or put, as we avoid traversing | ||||
|             // the LMDB B-Tree two times but only once. | ||||
|             let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; | ||||
|             if let Some((key, mut docids)) = iter.next().transpose()? { | ||||
|                 if key == word.as_str() { | ||||
|                     let previous_len = docids.len(); | ||||
|                     docids -= &self.documents_ids; | ||||
|                     if docids.is_empty() { | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.del_current()? }; | ||||
|                         *must_remove = true; | ||||
|                     } else if docids.len() != previous_len { | ||||
|                         let key = key.to_owned(); | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.put_current(&key, &docids)? }; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             remove_from_word_docids( | ||||
|                 self.wtxn, | ||||
|                 word_docids, | ||||
|                 word.as_str(), | ||||
|                 must_remove, | ||||
|                 &self.documents_ids, | ||||
|             )?; | ||||
|  | ||||
|             remove_from_word_docids( | ||||
|                 self.wtxn, | ||||
|                 exact_word_docids, | ||||
|                 word.as_str(), | ||||
|                 must_remove, | ||||
|                 &self.documents_ids, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         // We construct an FST set that contains the words to delete from the words FST. | ||||
| @@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Removes the given documents ids from the docids bitmap stored under `word`
/// in the given `word -> docids` database.
///
/// When no docids remain for the word, the entry is deleted entirely and
/// `*must_remove` is set to `true` so the caller knows the word must also be
/// removed from the words FST. When some docids remain, the shrunken bitmap is
/// written back in place; when nothing changed, no write happens at all.
fn remove_from_word_docids(
    txn: &mut heed::RwTxn,
    db: &heed::Database<Str, RoaringBitmapCodec>,
    word: &str,
    must_remove: &mut bool,
    to_remove: &RoaringBitmap,
) -> Result<()> {
    // We create an iterator to be able to get the content and delete the word docids.
    // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
    // the LMDB B-Tree two times but only once.
    let mut iter = db.prefix_iter_mut(txn, &word)?;
    if let Some((key, mut docids)) = iter.next().transpose()? {
        // The prefix iterator can yield longer words sharing `word` as a prefix;
        // the exact word (shortest key) sorts first, so only the first entry is
        // checked and only an exact match is touched.
        if key == word {
            let previous_len = docids.len();
            docids -= to_remove;
            if docids.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
                *must_remove = true;
            } else if docids.len() != previous_len {
                let key = key.to_owned();
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.put_current(&key, &docids)? };
            }
        }
    }
    Ok(())
}
|  | ||||
| fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( | ||||
|     wtxn: &'a mut heed::RwTxn, | ||||
|     db: &heed::Database<C, DC>, | ||||
|   | ||||
| @@ -10,17 +10,21 @@ use super::helpers::{ | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::update::index_documents::MergeFn; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Extracts the word and the documents ids where this word appear. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted words and | ||||
| /// documents ids from the given chunk of docid word positions. | ||||
| /// | ||||
| /// The first returned reader in the one for normal word_docids, and the second one is for | ||||
| /// exact_word_docids | ||||
| #[logging_timer::time] | ||||
| pub fn extract_word_docids<R: io::Read + io::Seek>( | ||||
|     docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
| ) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_docids_sorter = create_sorter( | ||||
| @@ -43,5 +47,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>( | ||||
|         word_docids_sorter.insert(word_bytes, &value_buffer)?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(word_docids_sorter, indexer) | ||||
|     let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); | ||||
|     Ok(( | ||||
|         sorter_into_reader(word_docids_sorter, indexer)?, | ||||
|         sorter_into_reader(empty_sorter, indexer)?, | ||||
|     )) | ||||
| } | ||||
|   | ||||
| @@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         "field-id-wordcount-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>( | ||||
|     spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_docids, | ||||
|         merge_roaring_bitmaps, | ||||
|         TypedChunk::WordDocids, | ||||
|         |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { | ||||
|             word_docids_reader, | ||||
|             exact_word_docids_reader, | ||||
|         }, | ||||
|         "word-docids", | ||||
|     ); | ||||
|  | ||||
|   | ||||
| @@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database( | ||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Used when trying to merge readers, but you don't actually care about the values. | ||||
| pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     Ok(Cow::Owned(Vec::new())) | ||||
| } | ||||
|   | ||||
| @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; | ||||
| pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| pub use grenad_helpers::{ | ||||
|     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, | ||||
|     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, | ||||
|     sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, | ||||
|     GrenadParameters, MergeableReader, | ||||
| }; | ||||
|   | ||||
| @@ -20,7 +20,7 @@ pub use self::helpers::{ | ||||
|     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, | ||||
|     sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, | ||||
| }; | ||||
| use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | ||||
| use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; | ||||
| pub use self::transform::{Transform, TransformOutput}; | ||||
| use crate::documents::DocumentBatchReader; | ||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||
| @@ -282,6 +282,7 @@ where | ||||
|         let mut word_pair_proximity_docids = None; | ||||
|         let mut word_position_docids = None; | ||||
|         let mut word_docids = None; | ||||
|         let mut _exact_word_docids = None; | ||||
|  | ||||
|         let mut databases_seen = 0; | ||||
|         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
| @@ -291,10 +292,13 @@ where | ||||
|  | ||||
|         for result in lmdb_writer_rx { | ||||
|             let typed_chunk = match result? { | ||||
|                 TypedChunk::WordDocids(chunk) => { | ||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; | ||||
|                 TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { | ||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; | ||||
|                     word_docids = Some(cloneable_chunk); | ||||
|                     TypedChunk::WordDocids(chunk) | ||||
|                     let cloneable_chunk = | ||||
|                         unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; | ||||
|                     _exact_word_docids = Some(cloneable_chunk); | ||||
|                     TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } | ||||
|                 } | ||||
|                 TypedChunk::WordPairProximityDocids(chunk) => { | ||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; | ||||
| @@ -425,6 +429,10 @@ where | ||||
|         }); | ||||
|  | ||||
|         if let Some(word_docids) = word_docids { | ||||
|             let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); | ||||
|             word_docids_builder.push(word_docids.into_cursor()?); | ||||
|             // TODO: push exact_word_docids | ||||
|             let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; | ||||
|             // Run the word prefix docids update operation. | ||||
|             let mut builder = WordPrefixDocids::new(self.wtxn, self.index); | ||||
|             builder.chunk_compression_type = self.indexer_config.chunk_compression_type; | ||||
| @@ -432,7 +440,7 @@ where | ||||
|             builder.max_nb_chunks = self.indexer_config.max_nb_chunks; | ||||
|             builder.max_memory = self.indexer_config.max_memory; | ||||
|             builder.execute( | ||||
|                 word_docids, | ||||
|                 word_docids_iter, | ||||
|                 &new_prefix_fst_words, | ||||
|                 &common_prefix_fst_words, | ||||
|                 &del_prefix_fst_words, | ||||
|   | ||||
| @@ -3,14 +3,16 @@ use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use grenad::MergerBuilder; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::{BytesDecode, RwTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, | ||||
|     self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, | ||||
|     CursorClonableMmap, | ||||
| }; | ||||
| use super::{ClonableMmap, MergeFn}; | ||||
| use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| use crate::update::index_documents::helpers::as_cloneable_grenad; | ||||
| use crate::{ | ||||
| @@ -25,7 +27,10 @@ pub(crate) enum TypedChunk { | ||||
|     Documents(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdWordcountDocids(grenad::Reader<File>), | ||||
|     NewDocumentsIds(RoaringBitmap), | ||||
|     WordDocids(grenad::Reader<File>), | ||||
|     WordDocids { | ||||
|         word_docids_reader: grenad::Reader<File>, | ||||
|         exact_word_docids_reader: grenad::Reader<File>, | ||||
|     }, | ||||
|     WordPositionDocids(grenad::Reader<File>), | ||||
|     WordPairProximityDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetStringDocids(grenad::Reader<File>), | ||||
| @@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|         TypedChunk::NewDocumentsIds(documents_ids) => { | ||||
|             return Ok((documents_ids, is_merged_database)) | ||||
|         } | ||||
|         TypedChunk::WordDocids(word_docids_iter) => { | ||||
|             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; | ||||
|         TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { | ||||
|             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; | ||||
|             append_entries_into_database( | ||||
|                 word_docids_iter.clone(), | ||||
|                 &index.word_docids, | ||||
| @@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 merge_roaring_bitmaps, | ||||
|             )?; | ||||
|  | ||||
|             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; | ||||
|             append_entries_into_database( | ||||
|                 exact_word_docids_iter.clone(), | ||||
|                 &index.exact_word_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_roaring_bitmaps, | ||||
|             )?; | ||||
|  | ||||
|             // create fst from word docids | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|             let mut cursor = word_docids_iter.into_cursor()?; | ||||
|             while let Some((word, _value)) = cursor.move_on_next()? { | ||||
|                 // This is a lexicographically ordered word position | ||||
|                 // we use the key to construct the words fst. | ||||
|                 builder.insert(word)?; | ||||
|             } | ||||
|             let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; | ||||
|             let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; | ||||
|             let db_fst = index.words_fst(wtxn)?; | ||||
|  | ||||
|             // merge new fst with database fst | ||||
| @@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|     Ok((RoaringBitmap::new(), is_merged_database)) | ||||
| } | ||||
|  | ||||
| fn merge_word_docids_reader_into_fst( | ||||
|     word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||
|     exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||
| ) -> Result<fst::Set<Vec<u8>>> { | ||||
|     let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); | ||||
|     merger_builder.push(word_docids_iter.into_cursor()?); | ||||
|     merger_builder.push(exact_word_docids_iter.into_cursor()?); | ||||
|     let mut iter = merger_builder.build().into_stream_merger_iter()?; | ||||
|     let mut builder = fst::SetBuilder::memory(); | ||||
|  | ||||
|     while let Some((k, _)) = iter.next()? { | ||||
|         builder.insert(k)?; | ||||
|     } | ||||
|  | ||||
|     Ok(builder.into_set()) | ||||
| } | ||||
|  | ||||
| fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { | ||||
|     let new_value = RoaringBitmap::deserialize_from(new_value)?; | ||||
|     let db_value = RoaringBitmap::deserialize_from(db_value)?; | ||||
|   | ||||
| @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|     #[logging_timer::time("WordPrefixDocids::{}")] | ||||
|     pub fn execute( | ||||
|         self, | ||||
|         new_word_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         mut new_word_docids_iter: grenad::MergerIter<CursorClonableMmap, MergeFn>, | ||||
|         new_prefix_fst_words: &[String], | ||||
|         common_prefix_fst_words: &[&[String]], | ||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
| @@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|         ); | ||||
|  | ||||
|         if !common_prefix_fst_words.is_empty() { | ||||
|             let mut new_word_docids_iter = new_word_docids.into_cursor()?; | ||||
|             let mut current_prefixes: Option<&&[String]> = None; | ||||
|             let mut prefixes_cache = HashMap::new(); | ||||
|             while let Some((word, data)) = new_word_docids_iter.move_on_next()? { | ||||
|             while let Some((word, data)) = new_word_docids_iter.next()? { | ||||
|                 current_prefixes = match current_prefixes.take() { | ||||
|                     Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), | ||||
|                     _otherwise => { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user