mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Split position DB into fid and relative position DB
This commit is contained in:
		| @@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{ | |||||||
|     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, |     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, | ||||||
| }; | }; | ||||||
| pub use self::script_language_codec::ScriptLanguageCodec; | pub use self::script_language_codec::ScriptLanguageCodec; | ||||||
| pub use self::str_beu32_codec::StrBEU32Codec; | pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec}; | ||||||
| pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; | pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||||
|   | |||||||
| @@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { | |||||||
|         Some(Cow::Owned(bytes)) |         Some(Cow::Owned(bytes)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | pub struct StrBEU16Codec; | ||||||
|  |  | ||||||
|  | impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { | ||||||
|  |     type DItem = (&'a str, u16); | ||||||
|  |  | ||||||
|  |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|  |         let footer_len = size_of::<u16>(); | ||||||
|  |  | ||||||
|  |         if bytes.len() < footer_len { | ||||||
|  |             return None; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||||
|  |         let word = str::from_utf8(word).ok()?; | ||||||
|  |         let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; | ||||||
|  |  | ||||||
|  |         Some((word, pos)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { | ||||||
|  |     type EItem = (&'a str, u16); | ||||||
|  |  | ||||||
|  |     fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|  |         let pos = pos.to_be_bytes(); | ||||||
|  |  | ||||||
|  |         let mut bytes = Vec::with_capacity(word.len() + pos.len()); | ||||||
|  |         bytes.extend_from_slice(word.as_bytes()); | ||||||
|  |         bytes.extend_from_slice(&pos[..]); | ||||||
|  |  | ||||||
|  |         Some(Cow::Owned(bytes)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|   | |||||||
| @@ -19,12 +19,12 @@ use crate::heed_codec::facet::{ | |||||||
|     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, |     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, | ||||||
|     FieldIdCodec, OrderedF64Codec, |     FieldIdCodec, OrderedF64Codec, | ||||||
| }; | }; | ||||||
| use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec}; | use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; | ||||||
| use crate::{ | use crate::{ | ||||||
|     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, |     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, | ||||||
|     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, |     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, | ||||||
|     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, |     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, | ||||||
|     Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32, |     Search, U8StrStrCodec, BEU16, BEU32, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; | pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; | ||||||
| @@ -76,7 +76,9 @@ pub mod db_name { | |||||||
|     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; |     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; | ||||||
|     pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; |     pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; | ||||||
|     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; |     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; | ||||||
|  |     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; | ||||||
|     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; |     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; | ||||||
|  |     pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids"; | ||||||
|     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; |     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; | ||||||
|     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; |     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; | ||||||
|     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; |     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; | ||||||
| @@ -118,11 +120,16 @@ pub struct Index { | |||||||
|     pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, |     pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||||
|  |  | ||||||
|     /// Maps the word and the position with the docids that corresponds to it. |     /// Maps the word and the position with the docids that corresponds to it. | ||||||
|     pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>, |     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
|  |     /// Maps the word and the field id with the docids that corresponds to it. | ||||||
|  |     pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
|  |  | ||||||
|     /// Maps the field id and the word count with the docids that corresponds to it. |     /// Maps the field id and the word count with the docids that corresponds to it. | ||||||
|     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, |     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, | ||||||
|     /// Maps the position of a word prefix with all the docids where this prefix appears. |     /// Maps the position of a word prefix with all the docids where this prefix appears. | ||||||
|     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>, |     pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
|  |     /// Maps the word prefix and the field id with the docids that correspond to it. | ||||||
|  |     pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
|  |  | ||||||
|     /// Maps the script and language with all the docids that corresponds to it. |     /// Maps the script and language with all the docids that corresponds to it. | ||||||
|     pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>, |     pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>, | ||||||
| @@ -153,7 +160,7 @@ impl Index { | |||||||
|     ) -> Result<Index> { |     ) -> Result<Index> { | ||||||
|         use db_name::*; |         use db_name::*; | ||||||
|  |  | ||||||
|         options.max_dbs(19); |         options.max_dbs(21); | ||||||
|         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; |         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; | ||||||
|  |  | ||||||
|         let env = options.open(path)?; |         let env = options.open(path)?; | ||||||
| @@ -170,8 +177,10 @@ impl Index { | |||||||
|         let prefix_word_pair_proximity_docids = |         let prefix_word_pair_proximity_docids = | ||||||
|             env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; |             env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; | ||||||
|         let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; |         let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; | ||||||
|  |         let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?; | ||||||
|         let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; |         let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; | ||||||
|         let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; |         let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; | ||||||
|  |         let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?; | ||||||
|         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; |         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; | ||||||
|         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; |         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; | ||||||
|         let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; |         let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; | ||||||
| @@ -196,7 +205,9 @@ impl Index { | |||||||
|             word_prefix_pair_proximity_docids, |             word_prefix_pair_proximity_docids, | ||||||
|             prefix_word_pair_proximity_docids, |             prefix_word_pair_proximity_docids, | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|  |             word_fid_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
|  |             word_prefix_fid_docids, | ||||||
|             field_id_word_count_docids, |             field_id_word_count_docids, | ||||||
|             facet_id_f64_docids, |             facet_id_f64_docids, | ||||||
|             facet_id_string_docids, |             facet_id_string_docids, | ||||||
|   | |||||||
| @@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative | |||||||
| pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { | pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { | ||||||
|     (field_id as u32) << 16 | (relative as u32) |     (field_id as u32) << 16 | (relative as u32) | ||||||
| } | } | ||||||
// TODO: this is wrong, but will do for now
/// Compute the "bucketed" relative position from the relative position of a word in its field.
///
/// In a bucketed position, the accuracy of the relative position is reduced exponentially
/// as it gets larger:
/// - positions below 16 are kept intact;
/// - positions in `16..24` all become 24;
/// - positions from 24 upwards are rounded up to the next power of two
///   (a power of two maps to itself);
/// - positions above 32768, whose next power of two does not fit in a `u16`,
///   become `u16::MAX`.
///
/// The rounding is done with integer arithmetic only, so the result never depends on
/// floating-point precision.
pub fn bucketed_position(relative: u16) -> u16 {
    match relative {
        // The first few relative positions are kept intact.
        0..=15 => relative,
        // Relative positions between 16 and 24 all become equal to 24.
        16..=23 => 24,
        // Then, groups of positions are reduced to the smallest power of 2 that is
        // greater than or equal to them, saturating when that power overflows `u16`.
        _ => relative.checked_next_power_of_two().unwrap_or(u16::MAX),
    }
}
|  |  | ||||||
| /// Transform a raw obkv store into a JSON Object. | /// Transform a raw obkv store into a JSON Object. | ||||||
| pub fn obkv_to_json( | pub fn obkv_to_json( | ||||||
|   | |||||||
| @@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> { | |||||||
| struct QueryPositionIterator<'t> { | struct QueryPositionIterator<'t> { | ||||||
|     #[allow(clippy::type_complexity)] |     #[allow(clippy::type_complexity)] | ||||||
|     inner: |     inner: | ||||||
|         Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>, |         Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t> QueryPositionIterator<'t> { | impl<'t> QueryPositionIterator<'t> { | ||||||
| @@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t> Iterator for QueryPositionIterator<'t> { | impl<'t> Iterator for QueryPositionIterator<'t> { | ||||||
|     type Item = heed::Result<(u32, RoaringBitmap)>; |     type Item = heed::Result<(u16, RoaringBitmap)>; | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |     fn next(&mut self) -> Option<Self::Item> { | ||||||
|         // sort inner words from the closest next position to the farthest next position. |         // sort inner words from the closest next position to the farthest next position. | ||||||
| @@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> { | |||||||
| /// A Branch represents a possible alternative of the original query and is built with the Query Tree. | /// A Branch represents a possible alternative of the original query and is built with the Query Tree. | ||||||
| /// This branch allows us to iterate over meta-interval of positions. | /// This branch allows us to iterate over meta-interval of positions. | ||||||
| struct Branch<'t> { | struct Branch<'t> { | ||||||
|     query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>, |     query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>, | ||||||
|     last_result: (u32, RoaringBitmap), |     last_result: (u16, RoaringBitmap), | ||||||
|     branch_size: u32, |     branch_size: u16, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t> Branch<'t> { | impl<'t> Branch<'t> { | ||||||
| @@ -303,7 +303,7 @@ impl<'t> Branch<'t> { | |||||||
|         let mut branch = Self { |         let mut branch = Self { | ||||||
|             query_level_iterator, |             query_level_iterator, | ||||||
|             last_result: (0, RoaringBitmap::new()), |             last_result: (0, RoaringBitmap::new()), | ||||||
|             branch_size: flatten_branch.len() as u32, |             branch_size: flatten_branch.len() as u16, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         branch.update_last_result(); |         branch.update_last_result(); | ||||||
| @@ -342,7 +342,7 @@ impl<'t> Branch<'t> { | |||||||
|                         Some(result) => { |                         Some(result) => { | ||||||
|                             result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) |                             result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) | ||||||
|                         } |                         } | ||||||
|                         None => u32::MAX, |                         None => u16::MAX, | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             }) |             }) | ||||||
| @@ -378,7 +378,8 @@ impl<'t> Branch<'t> { | |||||||
|     fn compute_rank(&self) -> u32 { |     fn compute_rank(&self) -> u32 { | ||||||
|         // we compute a rank from the position. |         // we compute a rank from the position. | ||||||
|         let (pos, _) = self.last_result; |         let (pos, _) = self.last_result; | ||||||
|         pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size |         pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS | ||||||
|  |             / self.branch_size as u32 | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn cmp(&self, other: &Self) -> Ordering { |     fn cmp(&self, other: &Self) -> Ordering { | ||||||
|   | |||||||
| @@ -171,7 +171,7 @@ pub trait Context<'c> { | |||||||
|         &self, |         &self, | ||||||
|         word: &str, |         word: &str, | ||||||
|         in_prefix_cache: bool, |         in_prefix_cache: bool, | ||||||
|     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>; |     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>; | ||||||
|     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; |     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; | ||||||
|     fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>; |     fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>; | ||||||
|     fn field_id_word_count_docids( |     fn field_id_word_count_docids( | ||||||
| @@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|         &self, |         &self, | ||||||
|         word: &str, |         word: &str, | ||||||
|         in_prefix_cache: bool, |         in_prefix_cache: bool, | ||||||
|     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>> |     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>> | ||||||
|     { |     { | ||||||
|         let range = { |         let range = { | ||||||
|             let left = u32::min_value(); |             let left = u16::min_value(); // TODO: this is wrong | ||||||
|             let right = u32::max_value(); |             let right = u16::max_value(); // TODO: this is wrong | ||||||
|             let left = (word, left); |             let left = (word, left); | ||||||
|             let right = (word, right); |             let right = (word, right); | ||||||
|             left..=right |             left..=right | ||||||
| @@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> { |     fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> { | ||||||
|         let key = (word, pos); |         let key = (word, pos as u16); // TODO: this is wrong | ||||||
|         self.index.word_position_docids.get(self.rtxn, &key) |         self.index.word_position_docids.get(self.rtxn, &key) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -899,7 +899,7 @@ pub mod test { | |||||||
|             _word: &str, |             _word: &str, | ||||||
|             _in_prefix_cache: bool, |             _in_prefix_cache: bool, | ||||||
|         ) -> heed::Result< |         ) -> heed::Result< | ||||||
|             Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>, |             Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>, | ||||||
|         > { |         > { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|             word_prefix_pair_proximity_docids, |             word_prefix_pair_proximity_docids, | ||||||
|             prefix_word_pair_proximity_docids, |             prefix_word_pair_proximity_docids, | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|  |             word_fid_docids, | ||||||
|             field_id_word_count_docids, |             field_id_word_count_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
|  |             word_prefix_fid_docids, | ||||||
|             script_language_docids, |             script_language_docids, | ||||||
|             facet_id_f64_docids, |             facet_id_f64_docids, | ||||||
|             facet_id_string_docids, |             facet_id_string_docids, | ||||||
| @@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; |         word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||||
|         prefix_word_pair_proximity_docids.clear(self.wtxn)?; |         prefix_word_pair_proximity_docids.clear(self.wtxn)?; | ||||||
|         word_position_docids.clear(self.wtxn)?; |         word_position_docids.clear(self.wtxn)?; | ||||||
|  |         word_fid_docids.clear(self.wtxn)?; | ||||||
|         field_id_word_count_docids.clear(self.wtxn)?; |         field_id_word_count_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_position_docids.clear(self.wtxn)?; |         word_prefix_position_docids.clear(self.wtxn)?; | ||||||
|  |         word_prefix_fid_docids.clear(self.wtxn)?; | ||||||
|         script_language_docids.clear(self.wtxn)?; |         script_language_docids.clear(self.wtxn)?; | ||||||
|         facet_id_f64_docids.clear(self.wtxn)?; |         facet_id_f64_docids.clear(self.wtxn)?; | ||||||
|         facet_id_exists_docids.clear(self.wtxn)?; |         facet_id_exists_docids.clear(self.wtxn)?; | ||||||
|   | |||||||
| @@ -2,8 +2,8 @@ use std::collections::btree_map::Entry; | |||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::{HashMap, HashSet}; | ||||||
|  |  | ||||||
| use fst::IntoStreamer; | use fst::IntoStreamer; | ||||||
| use heed::types::{ByteSlice, DecodeIgnore, Str}; | use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; | ||||||
| use heed::Database; | use heed::{BytesDecode, BytesEncode, Database, RwIter}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use time::OffsetDateTime; | use time::OffsetDateTime; | ||||||
| @@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|             prefix_word_pair_proximity_docids, |             prefix_word_pair_proximity_docids, | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
|  |             word_fid_docids, | ||||||
|  |             word_prefix_fid_docids, | ||||||
|             facet_id_f64_docids: _, |             facet_id_f64_docids: _, | ||||||
|             facet_id_string_docids: _, |             facet_id_string_docids: _, | ||||||
|             field_id_docid_facet_f64s: _, |             field_id_docid_facet_f64s: _, | ||||||
| @@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { |         for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { | ||||||
|             // We delete the documents ids from the word prefix pair proximity database docids |             // We delete the documents ids from the word prefix pair proximity database docids | ||||||
|             // and remove the empty pairs too. |             // and remove the empty pairs too. | ||||||
|             let db = db.remap_key_type::<ByteSlice>(); |             Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?; | ||||||
|             let mut iter = db.iter_mut(self.wtxn)?; |  | ||||||
|             while let Some(result) = iter.next() { |  | ||||||
|                 let (key, mut docids) = result?; |  | ||||||
|                 let previous_len = docids.len(); |  | ||||||
|                 docids -= &self.to_delete_docids; |  | ||||||
|                 if docids.is_empty() { |  | ||||||
|                     // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                     unsafe { iter.del_current()? }; |  | ||||||
|                 } else if docids.len() != previous_len { |  | ||||||
|                     let key = key.to_owned(); |  | ||||||
|                     // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                     unsafe { iter.put_current(&key, &docids)? }; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|  |         Self::delete_from_db( | ||||||
|         // We delete the documents ids that are under the pairs of words, |             word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|         // it is faster and use no memory to iterate over all the words pairs than |             &self.to_delete_docids, | ||||||
|         // to compute the cartesian product of every words of the deleted documents. |         )?; | ||||||
|         let mut iter = |         Self::delete_from_db( | ||||||
|             word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; |             word_position_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|         while let Some(result) = iter.next() { |             &self.to_delete_docids, | ||||||
|             let (bytes, mut docids) = result?; |         )?; | ||||||
|             let previous_len = docids.len(); |         Self::delete_from_db( | ||||||
|             docids -= &self.to_delete_docids; |             word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|             if docids.is_empty() { |             &self.to_delete_docids, | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |         )?; | ||||||
|                 unsafe { iter.del_current()? }; |         Self::delete_from_db( | ||||||
|             } else if docids.len() != previous_len { |             word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|                 let bytes = bytes.to_owned(); |             &self.to_delete_docids, | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |         )?; | ||||||
|                 unsafe { iter.put_current(&bytes, &docids)? }; |         Self::delete_from_db( | ||||||
|             } |             word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|         } |             &self.to_delete_docids, | ||||||
|  |         )?; | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         // We delete the documents ids that are under the word level position docids. |  | ||||||
|         let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); |  | ||||||
|         while let Some(result) = iter.next() { |  | ||||||
|             let (bytes, mut docids) = result?; |  | ||||||
|             let previous_len = docids.len(); |  | ||||||
|             docids -= &self.to_delete_docids; |  | ||||||
|             if docids.is_empty() { |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } else if docids.len() != previous_len { |  | ||||||
|                 let bytes = bytes.to_owned(); |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.put_current(&bytes, &docids)? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         // We delete the documents ids that are under the word prefix level position docids. |  | ||||||
|         let mut iter = |  | ||||||
|             word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); |  | ||||||
|         while let Some(result) = iter.next() { |  | ||||||
|             let (bytes, mut docids) = result?; |  | ||||||
|             let previous_len = docids.len(); |  | ||||||
|             docids -= &self.to_delete_docids; |  | ||||||
|             if docids.is_empty() { |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } else if docids.len() != previous_len { |  | ||||||
|                 let bytes = bytes.to_owned(); |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.put_current(&bytes, &docids)? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         // Remove the documents ids from the field id word count database. |         // Remove the documents ids from the field id word count database. | ||||||
|         let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; |         Self::delete_from_db( | ||||||
|         while let Some((key, mut docids)) = iter.next().transpose()? { |             field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|             let previous_len = docids.len(); |             &self.to_delete_docids, | ||||||
|             docids -= &self.to_delete_docids; |         )?; | ||||||
|             if docids.is_empty() { |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } else if docids.len() != previous_len { |  | ||||||
|                 let key = key.to_owned(); |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.put_current(&key, &docids)? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { |         if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { | ||||||
|             let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; |             let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; | ||||||
| @@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         // Remove the documents ids from the script language database. |         // Remove the documents ids from the script language database. | ||||||
|         let mut iter = script_language_docids.iter_mut(self.wtxn)?; |         Self::delete_from_db( | ||||||
|         while let Some((key, mut docids)) = iter.next().transpose()? { |             script_language_docids.iter_mut(self.wtxn)?.remap_key_type(), | ||||||
|             let previous_len = docids.len(); |             &self.to_delete_docids, | ||||||
|             docids -= &self.to_delete_docids; |         )?; | ||||||
|             if docids.is_empty() { |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } else if docids.len() != previous_len { |  | ||||||
|                 let key = key.to_owned(); |  | ||||||
|                 // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                 unsafe { iter.put_current(&key, &docids)? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|         // We delete the documents ids that are under the facet field id values. |         // We delete the documents ids that are under the facet field id values. | ||||||
|         remove_docids_from_facet_id_exists_docids( |         remove_docids_from_facet_id_exists_docids( | ||||||
|             self.wtxn, |             self.wtxn, | ||||||
| @@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|             soft_deletion_used: false, |             soft_deletion_used: false, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     fn delete_from_db<C>( | ||||||
|  |         mut iter: RwIter<UnalignedSlice<u8>, C>, | ||||||
|  |         to_delete_docids: &RoaringBitmap, | ||||||
|  |     ) -> Result<()> | ||||||
|  |     where | ||||||
|  |         C: for<'a> BytesDecode<'a, DItem = RoaringBitmap> | ||||||
|  |             + for<'a> BytesEncode<'a, EItem = RoaringBitmap>, | ||||||
|  |     { | ||||||
|  |         while let Some(result) = iter.next() { | ||||||
|  |             let (bytes, mut docids) = result?; | ||||||
|  |             let previous_len = docids.len(); | ||||||
|  |             docids -= to_delete_docids; | ||||||
|  |             if docids.is_empty() { | ||||||
|  |                 // safety: we don't keep references from inside the LMDB database. | ||||||
|  |                 unsafe { iter.del_current()? }; | ||||||
|  |             } else if docids.len() != previous_len { | ||||||
|  |                 let bytes = bytes.to_owned(); | ||||||
|  |                 // safety: we don't keep references from inside the LMDB database. | ||||||
|  |                 unsafe { iter.put_current(&bytes, &docids)? }; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn remove_from_word_prefix_docids( | fn remove_from_word_prefix_docids( | ||||||
|   | |||||||
| @@ -7,14 +7,17 @@ use super::helpers::{ | |||||||
| }; | }; | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||||
| use crate::{DocumentId, Result}; | use crate::{ | ||||||
|  |     absolute_from_relative_position, bucketed_position, relative_from_absolute_position, | ||||||
|  |     DocumentId, Result, | ||||||
|  | }; | ||||||
|  |  | ||||||
| /// Extracts the word positions and the documents ids where this word appear. | /// Extracts the word positions and the documents ids where this word appear. | ||||||
| /// | /// | ||||||
| /// Returns a grenad reader with the list of extracted words at positions and | /// Returns a grenad reader with the list of extracted words at positions and | ||||||
| /// documents ids from the given chunk of docid word positions. | /// documents ids from the given chunk of docid word positions. | ||||||
| #[logging_timer::time] | #[logging_timer::time] | ||||||
| pub fn extract_word_position_docids<R: io::Read + io::Seek>( | pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>( | ||||||
|     docid_word_positions: grenad::Reader<R>, |     docid_word_positions: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
| ) -> Result<grenad::Reader<File>> { | ) -> Result<grenad::Reader<File>> { | ||||||
| @@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>( | |||||||
|         for position in read_u32_ne_bytes(value) { |         for position in read_u32_ne_bytes(value) { | ||||||
|             key_buffer.clear(); |             key_buffer.clear(); | ||||||
|             key_buffer.extend_from_slice(word_bytes); |             key_buffer.extend_from_slice(word_bytes); | ||||||
|  |             let (fid, position) = relative_from_absolute_position(position); | ||||||
|  |             let position = bucketed_position(position); | ||||||
|  |             let position = absolute_from_relative_position(fid, position); | ||||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); |             key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||||
|  |  | ||||||
|             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; |             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(word_position_docids_sorter, indexer) |     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; | ||||||
|  |  | ||||||
|  |     Ok(word_position_docids_reader) | ||||||
| } | } | ||||||
|   | |||||||
| @@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; | |||||||
| use self::extract_geo_points::extract_geo_points; | use self::extract_geo_points::extract_geo_points; | ||||||
| use self::extract_word_docids::extract_word_docids; | use self::extract_word_docids::extract_word_docids; | ||||||
| use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | ||||||
| use self::extract_word_position_docids::extract_word_position_docids; | use self::extract_word_position_docids::extract_word_fid_and_position_docids; | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, |     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, | ||||||
|     GrenadParameters, MergeFn, MergeableReader, |     GrenadParameters, MergeFn, MergeableReader, | ||||||
| @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         docid_word_positions_chunks, |         docid_word_positions_chunks, | ||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         extract_word_position_docids, |         extract_word_fid_and_position_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::WordPositionDocids, |         TypedChunk::WordPositionDocids, | ||||||
|         "word-position-docids", |         "word-position-docids", | ||||||
|   | |||||||
| @@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode}; | |||||||
| use log::debug; | use log::debug; | ||||||
|  |  | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
| use crate::heed_codec::StrBEU32Codec; | use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec}; | ||||||
| use crate::index::main_key::WORDS_PREFIXES_FST_KEY; | use crate::index::main_key::WORDS_PREFIXES_FST_KEY; | ||||||
| use crate::update::index_documents::{ | use crate::update::index_documents::{ | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, |     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, | ||||||
|     CursorClonableMmap, MergeFn, |     CursorClonableMmap, MergeFn, | ||||||
| }; | }; | ||||||
| use crate::{Index, Result}; | use crate::{bucketed_position, relative_from_absolute_position, Index, Result}; | ||||||
|  |  | ||||||
| pub struct WordPrefixPositionDocids<'t, 'u, 'i> { | pub struct WordPrefixPositionDocids<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | |||||||
|             let mut prefixes_cache = HashMap::new(); |             let mut prefixes_cache = HashMap::new(); | ||||||
|             while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { |             while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { | ||||||
|                 let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; |                 let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; | ||||||
|  |                 let (_fid, pos) = relative_from_absolute_position(pos); | ||||||
|  |  | ||||||
|                 current_prefixes = match current_prefixes.take() { |                 current_prefixes = match current_prefixes.take() { | ||||||
|                     Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), |                     Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), | ||||||
| @@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { | |||||||
|             let iter = db |             let iter = db | ||||||
|                 .remap_key_type::<ByteSlice>() |                 .remap_key_type::<ByteSlice>() | ||||||
|                 .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? |                 .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? | ||||||
|                 .remap_key_type::<StrBEU32Codec>(); |                 .remap_key_type::<StrBEU16Codec>(); | ||||||
|             for result in iter { |             for result in iter { | ||||||
|                 let ((word, pos), data) = result?; |                 let ((word, pos), data) = result?; | ||||||
|                 if word.starts_with(prefix) { |                 if word.starts_with(prefix) { | ||||||
|                     let key = (prefix, pos); |                     let key = (prefix, pos); | ||||||
|                     let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); |                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); | ||||||
|                     prefix_position_docids_sorter.insert(bytes, data)?; |                     prefix_position_docids_sorter.insert(bytes, data)?; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user