mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-31 07:56:28 +00:00)
	Merge #308
308: Implement a better parallel indexer r=Kerollmops a=ManyTheFish

Rewrite the indexer:
- enhance memory consumption control
- optimize parallelism using rayon and crossbeam channel
- factorize the different parts and make new DB implementation easier
- optimize and fix prefix databases

Co-authored-by: many <maxime@meilisearch.com>
		| @@ -9,12 +9,13 @@ bstr = "0.2.15" | ||||
| byteorder = "1.4.2" | ||||
| chrono = { version = "0.4.19", features = ["serde"] } | ||||
| concat-arrays = "0.1.2" | ||||
| crossbeam-channel = "0.5.1" | ||||
| csv = "1.1.5" | ||||
| either = "1.6.1" | ||||
| flate2 = "1.0.20" | ||||
| fst = "0.4.5" | ||||
| fxhash = "0.2.1" | ||||
| grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } | ||||
| grenad = "0.3.0" | ||||
| heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||
| human_format = "1.0.3" | ||||
| levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | ||||
|   | ||||
| @@ -2,51 +2,64 @@ use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::{marker, str}; | ||||
|  | ||||
| use super::try_split_at; | ||||
| use crate::error::SerializationError; | ||||
| use crate::heed_codec::RoaringBitmapCodec; | ||||
| use crate::{try_split_array_at, try_split_at, Result}; | ||||
|  | ||||
| /// A codec that encodes a string in front of the value. | ||||
| pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>; | ||||
|  | ||||
| /// A codec that encodes a string in front of a value. | ||||
| /// | ||||
| /// The use case is the facet string levels algorithm, where we must know the | ||||
| /// original string of a normalized facet value; the original values are stored | ||||
| /// in the value side to avoid breaking the lexicographical ordering of the LMDB keys. | ||||
| pub struct FacetStringLevelZeroValueCodec<C>(marker::PhantomData<C>); | ||||
| pub struct StringValueCodec<C>(marker::PhantomData<C>); | ||||
|  | ||||
| impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec<C> | ||||
| impl<'a, C> heed::BytesDecode<'a> for StringValueCodec<C> | ||||
| where | ||||
|     C: heed::BytesDecode<'a>, | ||||
| { | ||||
|     type DItem = (&'a str, C::DItem); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (string_len, bytes) = try_split_at(bytes, 2)?; | ||||
|         let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; | ||||
|  | ||||
|         let (string, bytes) = try_split_at(bytes, string_len as usize)?; | ||||
|         let string = str::from_utf8(string).ok()?; | ||||
|  | ||||
|         let (string, bytes) = decode_prefix_string(bytes)?; | ||||
|         C::bytes_decode(bytes).map(|item| (string, item)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec<C> | ||||
| impl<'a, C> heed::BytesEncode<'a> for StringValueCodec<C> | ||||
| where | ||||
|     C: heed::BytesEncode<'a>, | ||||
| { | ||||
|     type EItem = (&'a str, C::EItem); | ||||
|  | ||||
|     fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let string_len: u16 = string.len().try_into().ok()?; | ||||
|         let value_bytes = C::bytes_encode(&value)?; | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); | ||||
|         bytes.extend_from_slice(&string_len.to_be_bytes()); | ||||
|         bytes.extend_from_slice(string.as_bytes()); | ||||
|         encode_prefix_string(string, &mut bytes).ok()?; | ||||
|         bytes.extend_from_slice(&value_bytes[..]); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { | ||||
|     let (original_length_bytes, bytes) = try_split_array_at(value)?; | ||||
|     let original_length = u16::from_be_bytes(original_length_bytes) as usize; | ||||
|     let (string, bytes) = try_split_at(bytes, original_length)?; | ||||
|     let string = str::from_utf8(string).ok()?; | ||||
|     Some((string, bytes)) | ||||
| } | ||||
|  | ||||
| pub fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Result<()> { | ||||
|     let string_len: u16 = | ||||
|         string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|     buffer.extend_from_slice(&string_len.to_be_bytes()); | ||||
|     buffer.extend_from_slice(string.as_bytes()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use heed::types::Unit; | ||||
| @@ -54,17 +67,15 @@ mod tests { | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
|     use super::*; | ||||
|     use crate::CboRoaringBitmapCodec; | ||||
|  | ||||
|     #[test] | ||||
|     fn deserialize_roaring_bitmaps() { | ||||
|         let string = "abc"; | ||||
|         let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); | ||||
|         let key = (string, docids.clone()); | ||||
|         let bytes = | ||||
|             FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&key).unwrap(); | ||||
|         let bytes = StringValueCodec::<RoaringBitmapCodec>::bytes_encode(&key).unwrap(); | ||||
|         let (out_string, out_docids) = | ||||
|             FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&bytes).unwrap(); | ||||
|             StringValueCodec::<RoaringBitmapCodec>::bytes_decode(&bytes).unwrap(); | ||||
|         assert_eq!((out_string, out_docids), (string, docids)); | ||||
|     } | ||||
|  | ||||
| @@ -72,9 +83,8 @@ mod tests { | ||||
|     fn deserialize_unit() { | ||||
|         let string = "def"; | ||||
|         let key = (string, ()); | ||||
|         let bytes = FacetStringLevelZeroValueCodec::<Unit>::bytes_encode(&key).unwrap(); | ||||
|         let (out_string, out_unit) = | ||||
|             FacetStringLevelZeroValueCodec::<Unit>::bytes_decode(&bytes).unwrap(); | ||||
|         let bytes = StringValueCodec::<Unit>::bytes_encode(&key).unwrap(); | ||||
|         let (out_string, out_unit) = StringValueCodec::<Unit>::bytes_decode(&bytes).unwrap(); | ||||
|         assert_eq!((out_string, out_unit), (string, ())); | ||||
|     } | ||||
| } | ||||
|   | ||||
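To make the new value layout concrete, here is a minimal standalone round-trip sketch of the format that `StringValueCodec` and the `encode_prefix_string`/`decode_prefix_string` helpers use: a 2-byte big-endian string length, the UTF-8 bytes of the original string, then the inner value's bytes. It intentionally avoids the crate's `Result` and the heed traits, so names and error handling are simplified.

```rust
// Standalone sketch of the length-prefixed string layout (simplified error handling).
fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Option<()> {
    let len: u16 = string.len().try_into().ok()?; // strings longer than u16::MAX are rejected
    buffer.extend_from_slice(&len.to_be_bytes());
    buffer.extend_from_slice(string.as_bytes());
    Some(())
}

fn decode_prefix_string(bytes: &[u8]) -> Option<(&str, &[u8])> {
    if bytes.len() < 2 {
        return None;
    }
    let (len_bytes, rest) = bytes.split_at(2);
    let len = u16::from_be_bytes([len_bytes[0], len_bytes[1]]) as usize;
    if rest.len() < len {
        return None;
    }
    let (string, value) = rest.split_at(len);
    Some((std::str::from_utf8(string).ok()?, value))
}

fn main() {
    let mut buffer = Vec::new();
    encode_prefix_string("Bordeaux", &mut buffer).unwrap();
    buffer.extend_from_slice(b"docids go here"); // stands in for the inner codec's bytes
    let (string, value) = decode_prefix_string(&buffer).unwrap();
    assert_eq!((string, value), ("Bordeaux", &b"docids go here"[..]));
}
```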
| @@ -9,7 +9,9 @@ mod field_doc_id_facet_string_codec; | ||||
| pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; | ||||
| pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; | ||||
| pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; | ||||
| pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; | ||||
| pub use self::facet_string_level_zero_value_codec::{ | ||||
|     decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, | ||||
| }; | ||||
| pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; | ||||
| pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; | ||||
| pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; | ||||
|   | ||||
| @@ -52,6 +52,46 @@ impl CboRoaringBitmapCodec { | ||||
|             RoaringBitmap::deserialize_from(bytes) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Merge serialized CboRoaringBitmaps in a buffer. | ||||
|     /// | ||||
|     /// If the merged values' length is under the threshold, the values are | ||||
|     /// serialized directly into the buffer; otherwise a RoaringBitmap is created | ||||
|     /// from the values and serialized into the buffer. | ||||
|     pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> { | ||||
|         let mut roaring = RoaringBitmap::new(); | ||||
|         let mut vec = Vec::new(); | ||||
|  | ||||
|         for bytes in slices { | ||||
|             if bytes.len() <= THRESHOLD * size_of::<u32>() { | ||||
|                 let mut reader = bytes.as_ref(); | ||||
|                 while let Ok(integer) = reader.read_u32::<NativeEndian>() { | ||||
|                     vec.push(integer); | ||||
|                 } | ||||
|             } else { | ||||
|                 roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if roaring.is_empty() { | ||||
|             vec.sort_unstable(); | ||||
|             vec.dedup(); | ||||
|  | ||||
|             if vec.len() <= THRESHOLD { | ||||
|                 for integer in vec { | ||||
|                     buffer.extend_from_slice(&integer.to_ne_bytes()); | ||||
|                 } | ||||
|             } else { | ||||
|                 let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()); | ||||
|                 roaring.serialize_into(buffer)?; | ||||
|             } | ||||
|         } else { | ||||
|             roaring.extend(vec); | ||||
|             roaring.serialize_into(buffer)?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { | ||||
| @@ -106,4 +146,40 @@ mod tests { | ||||
|  | ||||
|         assert!(roaring_size > bo_size); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn merge_cbo_roaring_bitmaps() { | ||||
|         let mut buffer = Vec::new(); | ||||
|  | ||||
|         let small_data = vec![ | ||||
|             RoaringBitmap::from_sorted_iter(1..4), | ||||
|             RoaringBitmap::from_sorted_iter(2..5), | ||||
|             RoaringBitmap::from_sorted_iter(4..6), | ||||
|             RoaringBitmap::from_sorted_iter(1..3), | ||||
|         ]; | ||||
|  | ||||
|         let small_data: Vec<_> = | ||||
|             small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); | ||||
|         CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); | ||||
|         let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); | ||||
|         let expected = RoaringBitmap::from_sorted_iter(1..6); | ||||
|         assert_eq!(bitmap, expected); | ||||
|  | ||||
|         let medium_data = vec![ | ||||
|             RoaringBitmap::from_sorted_iter(1..4), | ||||
|             RoaringBitmap::from_sorted_iter(2..5), | ||||
|             RoaringBitmap::from_sorted_iter(4..8), | ||||
|             RoaringBitmap::from_sorted_iter(0..3), | ||||
|             RoaringBitmap::from_sorted_iter(7..23), | ||||
|         ]; | ||||
|  | ||||
|         let medium_data: Vec<_> = | ||||
|             medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); | ||||
|         buffer.clear(); | ||||
|         CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); | ||||
|  | ||||
|         let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); | ||||
|         let expected = RoaringBitmap::from_sorted_iter(0..23); | ||||
|         assert_eq!(bitmap, expected); | ||||
|     } | ||||
| } | ||||
|   | ||||
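The `merge_into` helper above relies on the "conditional" layout of `CboRoaringBitmapCodec`. Below is a hedged sketch of that layout with an assumed `THRESHOLD` value (the real constant lives elsewhere in this file and is not shown in the diff): small sets are stored as raw native-endian `u32`s, larger ones as a serialized `RoaringBitmap`.

```rust
// Sketch of the conditional encoding; THRESHOLD is an illustrative value only.
use roaring::RoaringBitmap;

const THRESHOLD: usize = 7;

fn serialize_cbo(ids: &[u32], buffer: &mut Vec<u8>) -> std::io::Result<()> {
    if ids.len() <= THRESHOLD {
        // Small sets: storing the integers directly is cheaper than a bitmap header.
        for id in ids {
            buffer.extend_from_slice(&id.to_ne_bytes());
        }
        Ok(())
    } else {
        // Large sets: fall back to the roaring representation.
        let bitmap: RoaringBitmap = ids.iter().copied().collect();
        bitmap.serialize_into(buffer)
    }
}

fn main() -> std::io::Result<()> {
    let mut small = Vec::new();
    serialize_cbo(&[1, 2, 3], &mut small)?; // three raw u32s
    assert_eq!(small.len(), 3 * std::mem::size_of::<u32>());

    let mut large = Vec::new();
    serialize_cbo(&(0..1000).collect::<Vec<u32>>(), &mut large)?; // one serialized bitmap
    Ok(())
}
```

`merge_into` unions its inputs and then re-applies the same rule, which is why the test above can decode the merged buffer with `CboRoaringBitmapCodec::deserialize_from` whichever branch was taken.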
| @@ -93,8 +93,7 @@ pub struct Index { | ||||
|     /// Maps the facet field id, level and the number with the docids that corresponds to it. | ||||
|     pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the facet field id and the string with the original string and docids that corresponds to it. | ||||
|     pub facet_id_string_docids: | ||||
|         Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>, | ||||
|     pub facet_id_string_docids: Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
|  | ||||
|     /// Maps the document id, the facet field id and the numbers. | ||||
|     pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>, | ||||
|   | ||||
| @@ -13,11 +13,9 @@ mod search; | ||||
| pub mod tree_level; | ||||
| pub mod update; | ||||
|  | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashMap}; | ||||
| use std::convert::{TryFrom, TryInto}; | ||||
| use std::hash::BuildHasherDefault; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use fxhash::{FxHasher32, FxHasher64}; | ||||
| pub use grenad::CompressionType; | ||||
| @@ -54,8 +52,6 @@ pub type FieldId = u16; | ||||
| pub type Position = u32; | ||||
| pub type FieldDistribution = BTreeMap<String, u64>; | ||||
|  | ||||
| type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>; | ||||
|  | ||||
| /// Transform a raw obkv store into a JSON Object. | ||||
| pub fn obkv_to_json( | ||||
|     displayed_fields: &[FieldId], | ||||
|   | ||||
| @@ -2,8 +2,8 @@ use std::cmp; | ||||
|  | ||||
| use crate::{Attribute, Position}; | ||||
|  | ||||
| const ONE_ATTRIBUTE: u32 = 1000; | ||||
| const MAX_DISTANCE: u32 = 8; | ||||
| pub const ONE_ATTRIBUTE: u32 = 1000; | ||||
| pub const MAX_DISTANCE: u32 = 8; | ||||
|  | ||||
| pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { | ||||
|     if lhs <= rhs { | ||||
|   | ||||
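These constants are made public because the new extractors encode a word's absolute position as `field_id * ONE_ATTRIBUTE + index_in_field` (see `extract_docid_word_positions` further down). The sketch below illustrates that scheme; the `extract_position` inverse shown here is an assumption for illustration, the real helper is only referenced later in this diff.

```rust
// Sketch of the absolute-position scheme; extract_position here is an assumed inverse.
pub const ONE_ATTRIBUTE: u32 = 1000;

fn absolute_position(field_id: u32, word_index: u32) -> u32 {
    field_id * ONE_ATTRIBUTE + word_index
}

fn extract_position(position: u32) -> (u32, u32) {
    (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
}

fn main() {
    let position = absolute_position(2, 17); // third field, eighteenth word
    assert_eq!(position, 2017);
    assert_eq!(extract_position(position), (2, 17));
}
```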
| @@ -461,13 +461,18 @@ fn query_pair_proximity_docids( | ||||
|     let prefix = right.prefix; | ||||
|     match (&left.kind, &right.kind) { | ||||
|         (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { | ||||
|             if prefix && ctx.in_prefix_cache(&right) { | ||||
|                 Ok(ctx | ||||
|                     .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? | ||||
|                     .unwrap_or_default()) | ||||
|             } else if prefix { | ||||
|                 let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; | ||||
|                 all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) | ||||
|             if prefix { | ||||
|                 match ctx.word_prefix_pair_proximity_docids( | ||||
|                     left.as_str(), | ||||
|                     right.as_str(), | ||||
|                     proximity, | ||||
|                 )? { | ||||
|                     Some(docids) => Ok(docids), | ||||
|                     None => { | ||||
|                         let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; | ||||
|                         all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) | ||||
|                     } | ||||
|                 } | ||||
|             } else { | ||||
|                 Ok(ctx | ||||
|                     .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? | ||||
| @@ -477,22 +482,24 @@ fn query_pair_proximity_docids( | ||||
|         (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { | ||||
|             let l_words = | ||||
|                 word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); | ||||
|             if prefix && ctx.in_prefix_cache(&right) { | ||||
|             if prefix { | ||||
|                 let mut docids = RoaringBitmap::new(); | ||||
|                 for (left, _) in l_words { | ||||
|                     let current_docids = ctx | ||||
|                         .word_prefix_pair_proximity_docids( | ||||
|                             left.as_ref(), | ||||
|                             right.as_ref(), | ||||
|                             proximity, | ||||
|                         )? | ||||
|                         .unwrap_or_default(); | ||||
|                     let current_docids = match ctx.word_prefix_pair_proximity_docids( | ||||
|                         left.as_str(), | ||||
|                         right.as_str(), | ||||
|                         proximity, | ||||
|                     )? { | ||||
|                         Some(docids) => Ok(docids), | ||||
|                         None => { | ||||
|                             let r_words = | ||||
|                                 word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; | ||||
|                             all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) | ||||
|                         } | ||||
|                     }?; | ||||
|                     docids |= current_docids; | ||||
|                 } | ||||
|                 Ok(docids) | ||||
|             } else if prefix { | ||||
|                 let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; | ||||
|                 all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) | ||||
|             } else { | ||||
|                 all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) | ||||
|             } | ||||
|   | ||||
| @@ -269,11 +269,7 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> { | ||||
| /// | ||||
| /// It yields the facet string and the roaring bitmap associated with it. | ||||
| pub struct FacetStringLevelZeroRange<'t> { | ||||
|     iter: RoRange< | ||||
|         't, | ||||
|         FacetStringLevelZeroCodec, | ||||
|         FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>, | ||||
|     >, | ||||
|     iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
| } | ||||
|  | ||||
| impl<'t> FacetStringLevelZeroRange<'t> { | ||||
| @@ -316,10 +312,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { | ||||
|         let iter = db | ||||
|             .remap_key_type::<ByteSlice>() | ||||
|             .range(rtxn, &(left_bound, right_bound))? | ||||
|             .remap_types::< | ||||
|                 FacetStringLevelZeroCodec, | ||||
|                 FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec> | ||||
|             >(); | ||||
|             .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>(); | ||||
|  | ||||
|         Ok(FacetStringLevelZeroRange { iter }) | ||||
|     } | ||||
| @@ -340,11 +333,7 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { | ||||
| } | ||||
|  | ||||
| pub struct FacetStringLevelZeroRevRange<'t> { | ||||
|     iter: RoRevRange< | ||||
|         't, | ||||
|         FacetStringLevelZeroCodec, | ||||
|         FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>, | ||||
|     >, | ||||
|     iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
| } | ||||
|  | ||||
| impl<'t> FacetStringLevelZeroRevRange<'t> { | ||||
| @@ -387,10 +376,7 @@ impl<'t> FacetStringLevelZeroRevRange<'t> { | ||||
|         let iter = db | ||||
|             .remap_key_type::<ByteSlice>() | ||||
|             .rev_range(rtxn, &(left_bound, right_bound))? | ||||
|             .remap_types::< | ||||
|                 FacetStringLevelZeroCodec, | ||||
|                 FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec> | ||||
|             >(); | ||||
|             .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>(); | ||||
|  | ||||
|         Ok(FacetStringLevelZeroRevRange { iter }) | ||||
|     } | ||||
|   | ||||
| @@ -392,10 +392,7 @@ impl FilterCondition { | ||||
|         rtxn: &heed::RoTxn, | ||||
|         index: &Index, | ||||
|         numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||
|         strings_db: heed::Database< | ||||
|             FacetStringLevelZeroCodec, | ||||
|             FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>, | ||||
|         >, | ||||
|         strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
|         field_id: FieldId, | ||||
|         operator: &Operator, | ||||
|     ) -> Result<RoaringBitmap> { | ||||
|   | ||||
| @@ -490,7 +490,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( | ||||
|             None => { | ||||
|                 // The key corresponds to a level zero facet string. | ||||
|                 let (original_value, mut docids) = | ||||
|                     FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val) | ||||
|                     FacetStringLevelZeroValueCodec::bytes_decode(val) | ||||
|                         .ok_or_else(|| SerializationError::Decoding { db_name })?; | ||||
|  | ||||
|                 let previous_len = docids.len(); | ||||
| @@ -501,9 +501,8 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( | ||||
|                 } else if docids.len() != previous_len { | ||||
|                     let key = key.to_owned(); | ||||
|                     let val = &(original_value, docids); | ||||
|                     let value_bytes = | ||||
|                         FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val) | ||||
|                             .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|                     let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) | ||||
|                         .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|  | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.put_current(&key, &value_bytes)? }; | ||||
|   | ||||
| @@ -3,7 +3,7 @@ use std::num::{NonZeroU8, NonZeroUsize}; | ||||
| use std::{cmp, mem}; | ||||
|  | ||||
| use chrono::Utc; | ||||
| use grenad::{CompressionType, FileFuse, Reader, Writer}; | ||||
| use grenad::{CompressionType, Reader, Writer}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore}; | ||||
| use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| @@ -25,7 +25,6 @@ pub struct Facets<'t, 'u, 'i> { | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     min_level_size: NonZeroUsize, | ||||
|     _update_id: u64, | ||||
| @@ -42,7 +41,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             level_group_size: NonZeroUsize::new(4).unwrap(), | ||||
|             min_level_size: NonZeroUsize::new(5).unwrap(), | ||||
|             _update_id: update_id, | ||||
| @@ -59,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("Facets::{}")] | ||||
|     pub fn execute(self) -> Result<()> { | ||||
|         self.index.set_updated_at(self.wtxn, &Utc::now())?; | ||||
|         // We get the faceted fields to be able to create the facet levels. | ||||
| @@ -86,7 +85,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|                 self.index.facet_id_string_docids, | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
|                 self.chunk_fusing_shrink_size, | ||||
|                 self.level_group_size, | ||||
|                 self.min_level_size, | ||||
|                 field_id, | ||||
| @@ -107,7 +105,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|                 self.index.facet_id_f64_docids, | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
|                 self.chunk_fusing_shrink_size, | ||||
|                 self.level_group_size, | ||||
|                 self.min_level_size, | ||||
|                 field_id, | ||||
| @@ -128,7 +125,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|                 self.wtxn, | ||||
|                 *self.index.facet_id_f64_docids.as_polymorph(), | ||||
|                 facet_number_levels, | ||||
|                 |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), | ||||
|                 |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, | ||||
|                 WriteMethod::GetMergePut, | ||||
|             )?; | ||||
|  | ||||
| @@ -136,7 +133,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|                 self.wtxn, | ||||
|                 *self.index.facet_id_string_docids.as_polymorph(), | ||||
|                 facet_string_levels, | ||||
|                 |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), | ||||
|                 |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, | ||||
|                 WriteMethod::GetMergePut, | ||||
|             )?; | ||||
|         } | ||||
| @@ -161,11 +158,10 @@ fn compute_facet_number_levels<'t>( | ||||
|     db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     min_level_size: NonZeroUsize, | ||||
|     field_id: FieldId, | ||||
| ) -> Result<Reader<FileFuse>> { | ||||
| ) -> Result<Reader<File>> { | ||||
|     let first_level_size = db | ||||
|         .remap_key_type::<ByteSlice>() | ||||
|         .prefix_iter(rtxn, &field_id.to_be_bytes())? | ||||
| @@ -219,7 +215,7 @@ fn compute_facet_number_levels<'t>( | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer, shrink_size) | ||||
|     writer_into_reader(writer) | ||||
| } | ||||
|  | ||||
| fn write_number_entry( | ||||
| @@ -239,7 +235,7 @@ fn write_number_entry( | ||||
|  | ||||
| fn compute_faceted_strings_documents_ids( | ||||
|     rtxn: &heed::RoTxn, | ||||
|     db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>, | ||||
|     db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec>, | ||||
|     field_id: FieldId, | ||||
| ) -> Result<RoaringBitmap> { | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
| @@ -278,17 +274,13 @@ fn clear_field_string_levels<'t>( | ||||
|  | ||||
| fn compute_facet_string_levels<'t>( | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     db: heed::Database< | ||||
|         FacetStringLevelZeroCodec, | ||||
|         FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>, | ||||
|     >, | ||||
|     db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     min_level_size: NonZeroUsize, | ||||
|     field_id: FieldId, | ||||
| ) -> Result<Reader<FileFuse>> { | ||||
| ) -> Result<Reader<File>> { | ||||
|     let first_level_size = db | ||||
|         .remap_key_type::<ByteSlice>() | ||||
|         .prefix_iter(rtxn, &field_id.to_be_bytes())? | ||||
| @@ -340,7 +332,7 @@ fn compute_facet_string_levels<'t>( | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer, shrink_size) | ||||
|     writer_into_reader(writer) | ||||
| } | ||||
|  | ||||
| fn write_string_entry( | ||||
|   | ||||
| @@ -0,0 +1,167 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::{io, mem, str}; | ||||
|  | ||||
| use meilisearch_tokenizer::token::SeparatorKind; | ||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::{InternalError, SerializationError}; | ||||
| use crate::proximity::ONE_ATTRIBUTE; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extracts the word and the positions where this word appears and | ||||
| /// prefixes them with the document id. | ||||
| /// | ||||
| /// Returns the generated internal documents ids and a grenad reader | ||||
| /// with the list of extracted words from the given chunk of documents. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_docid_word_positions<R: io::Read>( | ||||
|     mut obkv_documents: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     stop_words: Option<&fst::Set<&[u8]>>, | ||||
| ) -> Result<(RoaringBitmap, grenad::Reader<File>)> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
|     let mut docid_word_positions_sorter = create_sorter( | ||||
|         concat_u32s_array, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut field_buffer = String::new(); | ||||
|     let mut config = AnalyzerConfig::default(); | ||||
|     if let Some(stop_words) = stop_words { | ||||
|         config.stop_words(stop_words); | ||||
|     } | ||||
|     let analyzer = Analyzer::<Vec<u8>>::new(config); | ||||
|  | ||||
|     while let Some((key, value)) = obkv_documents.next()? { | ||||
|         let document_id = key | ||||
|             .try_into() | ||||
|             .map(u32::from_be_bytes) | ||||
|             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|         let obkv = obkv::KvReader::<FieldId>::new(value); | ||||
|  | ||||
|         documents_ids.push(document_id); | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(&document_id.to_be_bytes()); | ||||
|  | ||||
|         for (field_id, field_bytes) in obkv.iter() { | ||||
|             if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { | ||||
|                 let value = | ||||
|                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|                 field_buffer.clear(); | ||||
|                 if let Some(field) = json_to_string(&value, &mut field_buffer) { | ||||
|                     let analyzed = analyzer.analyze(field); | ||||
|                     let tokens = process_tokens(analyzed.tokens()) | ||||
|                         .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); | ||||
|  | ||||
|                     for (index, token) in tokens { | ||||
|                         let token = token.text().trim(); | ||||
|                         key_buffer.truncate(mem::size_of::<u32>()); | ||||
|                         key_buffer.extend_from_slice(token.as_bytes()); | ||||
|  | ||||
|                         let position: u32 = index | ||||
|                             .try_into() | ||||
|                             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|                         let position = field_id as u32 * ONE_ATTRIBUTE + position; | ||||
|                         docid_word_positions_sorter.insert(&key_buffer, &position.to_ne_bytes())?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) | ||||
| } | ||||
|  | ||||
| /// Transform a JSON value into a string that can be indexed. | ||||
| fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> { | ||||
|     fn inner(value: &Value, output: &mut String) -> bool { | ||||
|         use std::fmt::Write; | ||||
|         match value { | ||||
|             Value::Null => false, | ||||
|             Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), | ||||
|             Value::Number(number) => write!(output, "{}", number).is_ok(), | ||||
|             Value::String(string) => write!(output, "{}", string).is_ok(), | ||||
|             Value::Array(array) => { | ||||
|                 let mut count = 0; | ||||
|                 for value in array { | ||||
|                     if inner(value, output) { | ||||
|                         output.push_str(". "); | ||||
|                         count += 1; | ||||
|                     } | ||||
|                 } | ||||
|                 // check that at least one value was written | ||||
|                 count != 0 | ||||
|             } | ||||
|             Value::Object(object) => { | ||||
|                 let mut buffer = String::new(); | ||||
|                 let mut count = 0; | ||||
|                 for (key, value) in object { | ||||
|                     buffer.clear(); | ||||
|                     let _ = write!(&mut buffer, "{}: ", key); | ||||
|                     if inner(value, &mut buffer) { | ||||
|                         buffer.push_str(". "); | ||||
|                         // We write the "key: value. " pair only when | ||||
|                         // we are sure that the value can be written. | ||||
|                         output.push_str(&buffer); | ||||
|                         count += 1; | ||||
|                     } | ||||
|                 } | ||||
|                 // check that at least one value was written | ||||
|                 count != 0 | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Value::String(string) = value { | ||||
|         Some(&string) | ||||
|     } else if inner(value, buffer) { | ||||
|         Some(buffer) | ||||
|     } else { | ||||
|         None | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Takes an iterator of tokens and computes their relative positions depending on separator kinds: | ||||
| /// if it's a `Hard` separator we add an additional relative proximity of 8 between words, | ||||
| /// else we keep the standard proximity of 1 between words. | ||||
| fn process_tokens<'a>( | ||||
|     tokens: impl Iterator<Item = Token<'a>>, | ||||
| ) -> impl Iterator<Item = (usize, Token<'a>)> { | ||||
|     tokens | ||||
|         .skip_while(|token| token.is_separator().is_some()) | ||||
|         .scan((0, None), |(offset, prev_kind), token| { | ||||
|             match token.kind { | ||||
|                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { | ||||
|                     *offset += match *prev_kind { | ||||
|                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, | ||||
|                         Some(_) => 1, | ||||
|                         None => 0, | ||||
|                     }; | ||||
|                     *prev_kind = Some(token.kind) | ||||
|                 } | ||||
|                 TokenKind::Separator(SeparatorKind::Hard) => { | ||||
|                     *prev_kind = Some(token.kind); | ||||
|                 } | ||||
|                 TokenKind::Separator(SeparatorKind::Soft) | ||||
|                     if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => | ||||
|                 { | ||||
|                     *prev_kind = Some(token.kind); | ||||
|                 } | ||||
|                 _ => (), | ||||
|             } | ||||
|             Some((*offset, token)) | ||||
|         }) | ||||
|         .filter(|(_, t)| t.is_word()) | ||||
| } | ||||
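As a companion to `process_tokens`, the toy sketch below (a hypothetical `Tok` enum instead of the real meilisearch_tokenizer `Token`) reproduces its offset rule: consecutive words are 1 position apart, and a hard separator widens the gap to 8, which, given the `prox < MAX_DISTANCE` filter used later in this diff, keeps pairs that straddle a sentence boundary out of the word pair proximity data.

```rust
// Toy re-implementation of the relative-position rule; Tok is a stand-in for the tokenizer's Token.
#[derive(Clone, Copy, PartialEq)]
enum Tok {
    Word,
    Soft, // e.g. a space or a comma
    Hard, // e.g. a full stop
}

fn relative_positions(tokens: &[Tok]) -> Vec<u32> {
    let mut positions = Vec::new();
    let mut offset = 0u32;
    let mut prev: Option<Tok> = None;
    for &token in tokens {
        match token {
            Tok::Word => {
                offset += match prev {
                    Some(Tok::Hard) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                prev = Some(Tok::Word);
                positions.push(offset);
            }
            Tok::Hard => prev = Some(Tok::Hard),
            // A soft separator never overrides a pending hard one.
            Tok::Soft if prev != Some(Tok::Hard) => prev = Some(Tok::Soft),
            Tok::Soft => (),
        }
    }
    positions
}

fn main() {
    // "new york. city"
    let tokens = [Tok::Word, Tok::Soft, Tok::Word, Tok::Hard, Tok::Soft, Tok::Word];
    assert_eq!(relative_positions(&tokens), vec![0, 1, 9]);
}
```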
| @@ -0,0 +1,42 @@ | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, | ||||
| }; | ||||
| use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Extracts the facet number and the documents ids where this facet number appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted facet numbers and | ||||
| /// documents ids from the given chunk of docid facet number positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_facet_number_docids<R: io::Read>( | ||||
|     mut docid_fid_facet_number: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut facet_number_docids_sorter = create_sorter( | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     while let Some((key_bytes, _)) = docid_fid_facet_number.next()? { | ||||
|         let (field_id, document_id, number) = | ||||
|             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); | ||||
|  | ||||
|         let key = (field_id, 0, number, number); | ||||
|         let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); | ||||
|  | ||||
|         facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_number_docids_sorter, indexer) | ||||
| } | ||||
| @@ -0,0 +1,58 @@ | ||||
| use std::fs::File; | ||||
| use std::iter::FromIterator; | ||||
| use std::{io, str}; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
| }; | ||||
| use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extracts the facet string and the documents ids where this facet string appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted facet strings and | ||||
| /// documents ids from the given chunk of docid facet string positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_facet_string_docids<R: io::Read>( | ||||
|     mut docid_fid_facet_string: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut facet_string_docids_sorter = create_sorter( | ||||
|         keep_first_prefix_value_merge_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut value_buffer = Vec::new(); | ||||
|     while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); | ||||
|         let field_id = FieldId::from_be_bytes(field_id_bytes); | ||||
|         let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|         let original_value = str::from_utf8(original_value_bytes)?; | ||||
|  | ||||
|         key_buffer.clear(); | ||||
|         FacetStringLevelZeroCodec::serialize_into( | ||||
|             field_id, | ||||
|             str::from_utf8(normalized_value_bytes)?, | ||||
|             &mut key_buffer, | ||||
|         ); | ||||
|  | ||||
|         value_buffer.clear(); | ||||
|         encode_prefix_string(original_value, &mut value_buffer)?; | ||||
|         let bitmap = RoaringBitmap::from_iter(Some(document_id)); | ||||
|         bitmap.serialize_into(&mut value_buffer)?; | ||||
|  | ||||
|         facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_string_docids_sorter, indexer) | ||||
| } | ||||
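The value written above is the original facet string behind a 2-byte big-endian length prefix, followed by a serialized `RoaringBitmap` holding a single document id. The sorter uses `keep_first_prefix_value_merge_roaring_bitmaps`, whose body is not part of this diff; the following is a hypothetical sketch of what such a merge has to do under that layout (keep the first entry's prefixed string, union every entry's bitmap), with a simplified signature and error handling.

```rust
// Hypothetical merge over length-prefixed-string + bitmap values (simplified).
use std::borrow::Cow;

use roaring::RoaringBitmap;

fn merge_prefix_string_bitmaps(values: &[Cow<'_, [u8]>]) -> Option<Vec<u8>> {
    // Keep the first entry's length-prefixed original string verbatim.
    let first = values.first()?;
    let first_len = 2 + u16::from_be_bytes([*first.get(0)?, *first.get(1)?]) as usize;
    let mut merged = first.get(..first_len)?.to_vec();

    // Union every entry's bitmap, skipping each entry's own string prefix.
    let mut docids = RoaringBitmap::new();
    for value in values {
        let len = 2 + u16::from_be_bytes([*value.get(0)?, *value.get(1)?]) as usize;
        docids |= RoaringBitmap::deserialize_from(value.get(len..)?).ok()?;
    }

    docids.serialize_into(&mut merged).ok()?;
    Some(merged)
}
```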
| @@ -0,0 +1,120 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
| use std::mem::size_of; | ||||
|  | ||||
| use heed::zerocopy::AsBytes; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::InternalError; | ||||
| use crate::facet::value_encoding::f64_into_bytes; | ||||
| use crate::{DocumentId, FieldId, Result}; | ||||
|  | ||||
| /// Extracts the facet values of each faceted field of each document. | ||||
| /// | ||||
| /// Returns the generated grenad readers containing the docid, the fid and the normalized value as key | ||||
| /// and the original value as value, extracted from the given chunk of documents. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_fid_docid_facet_values<R: io::Read>( | ||||
|     mut obkv_documents: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     faceted_fields: &HashSet<FieldId>, | ||||
| ) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut fid_docid_facet_numbers_sorter = create_sorter( | ||||
|         keep_first, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|m| m / 2), | ||||
|     ); | ||||
|  | ||||
|     let mut fid_docid_facet_strings_sorter = create_sorter( | ||||
|         keep_first, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|m| m / 2), | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     while let Some((docid_bytes, value)) = obkv_documents.next()? { | ||||
|         let obkv = obkv::KvReader::new(value); | ||||
|  | ||||
|         for (field_id, field_bytes) in obkv.iter() { | ||||
|             if faceted_fields.contains(&field_id) { | ||||
|                 let value = | ||||
|                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|                 let (numbers, strings) = extract_facet_values(&value); | ||||
|  | ||||
|                 key_buffer.clear(); | ||||
|  | ||||
|                 // prefix key with the field_id and the document_id | ||||
|                 key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||
|                 key_buffer.extend_from_slice(&docid_bytes); | ||||
|  | ||||
|                 // insert facet numbers in sorter | ||||
|                 for number in numbers { | ||||
|                     key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                     if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                         key_buffer.extend_from_slice(&value_bytes); | ||||
|                         key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|  | ||||
|                         fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // insert normalized and original facet strings in sorter | ||||
|                 for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { | ||||
|                     key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                     key_buffer.extend_from_slice(normalized.as_bytes()); | ||||
|                     fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(( | ||||
|         sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, | ||||
|         sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||
|     )) | ||||
| } | ||||
|  | ||||
| fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { | ||||
|     fn inner_extract_facet_values( | ||||
|         value: &Value, | ||||
|         can_recurse: bool, | ||||
|         output_numbers: &mut Vec<f64>, | ||||
|         output_strings: &mut Vec<(String, String)>, | ||||
|     ) { | ||||
|         match value { | ||||
|             Value::Null => (), | ||||
|             Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), | ||||
|             Value::Number(number) => { | ||||
|                 if let Some(float) = number.as_f64() { | ||||
|                     output_numbers.push(float); | ||||
|                 } | ||||
|             } | ||||
|             Value::String(original) => { | ||||
|                 let normalized = original.trim().to_lowercase(); | ||||
|                 output_strings.push((normalized, original.clone())); | ||||
|             } | ||||
|             Value::Array(values) => { | ||||
|                 if can_recurse { | ||||
|                     for value in values { | ||||
|                         inner_extract_facet_values(value, false, output_numbers, output_strings); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             Value::Object(_) => (), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut facet_number_values = Vec::new(); | ||||
|     let mut facet_string_values = Vec::new(); | ||||
|     inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); | ||||
|  | ||||
|     (facet_number_values, facet_string_values) | ||||
| } | ||||
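A usage sketch for `extract_facet_values` above (assuming the helper is in scope, since it is private to this module): arrays are flattened one level deep only, numbers are collected as `f64`, booleans become strings, and strings are paired with their trimmed, lowercased normalization.

```rust
// Illustrative call; extract_facet_values is the private helper defined above.
use serde_json::json;

fn main() {
    let value = json!(["  Bordeaux ", 7.5, true, [42.0, "nested"], { "ignored": 1 }, null]);
    let (numbers, strings) = extract_facet_values(&value);

    // The nested array is not recursed into, so 42.0 and "nested" are dropped.
    assert_eq!(numbers, vec![7.5]);
    assert_eq!(
        strings,
        vec![
            ("bordeaux".to_string(), "  Bordeaux ".to_string()),
            ("true".to_string(), "true".to_string()),
        ]
    );
}
```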
| @@ -0,0 +1,95 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::fs::File; | ||||
| use std::{cmp, io}; | ||||
|  | ||||
| use grenad::Sorter; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, MergeFn, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::proximity::extract_position; | ||||
| use crate::{DocumentId, FieldId, Result}; | ||||
|  | ||||
| /// Extracts the field id word count and the documents ids where | ||||
| /// this field id with this amount of words appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted field id word counts | ||||
| /// and documents ids from the given chunk of docid word positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_fid_word_count_docids<R: io::Read>( | ||||
|     mut docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut fid_word_count_docids_sorter = create_sorter( | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     // This map is assumed to not consume a lot of memory. | ||||
|     let mut document_fid_wordcount = HashMap::new(); | ||||
|     let mut current_document_id = None; | ||||
|  | ||||
|     while let Some((key, value)) = docid_word_positions.next()? { | ||||
|         let (document_id_bytes, _word_bytes) = try_split_array_at(key) | ||||
|             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); | ||||
|         if curr_document_id != document_id { | ||||
|             drain_document_fid_wordcount_into_sorter( | ||||
|                 &mut fid_word_count_docids_sorter, | ||||
|                 &mut document_fid_wordcount, | ||||
|                 curr_document_id, | ||||
|             )?; | ||||
|             current_document_id = Some(document_id); | ||||
|         } | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             let (field_id, position) = extract_position(position); | ||||
|             let word_count = position + 1; | ||||
|  | ||||
|             let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); | ||||
|             *value = cmp::max(*value, word_count); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Some(document_id) = current_document_id { | ||||
|         // We must make sure that we don't lose the current document's field id | ||||
|         // word count map when we reach the end of the chunk. | ||||
|         drain_document_fid_wordcount_into_sorter( | ||||
|             &mut fid_word_count_docids_sorter, | ||||
|             &mut document_fid_wordcount, | ||||
|             document_id, | ||||
|         )?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(fid_word_count_docids_sorter, indexer) | ||||
| } | ||||
|  | ||||
| fn drain_document_fid_wordcount_into_sorter( | ||||
|     fid_word_count_docids_sorter: &mut Sorter<MergeFn>, | ||||
|     document_fid_wordcount: &mut HashMap<FieldId, u32>, | ||||
|     document_id: DocumentId, | ||||
| ) -> Result<()> { | ||||
|     let mut key_buffer = Vec::new(); | ||||
|  | ||||
|     for (fid, count) in document_fid_wordcount.drain() { | ||||
|         if count <= 10 { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|             key_buffer.push(count as u8); | ||||
|  | ||||
|             fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
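A small sketch of the bucketing rule above, using a hypothetical `word_count_key` helper: a field's word count is taken as its highest relative word position plus one, and only counts of at most 10 produce a key, made of the two big-endian field-id bytes followed by a single count byte.

```rust
// Hypothetical helper mirroring the key layout used by the sorter above.
fn word_count_key(field_id: u16, relative_positions: &[u32]) -> Option<Vec<u8>> {
    let word_count = relative_positions.iter().max()? + 1;
    if word_count > 10 {
        return None; // fields with more than ten words get no entry
    }
    let mut key = field_id.to_be_bytes().to_vec();
    key.push(word_count as u8);
    Some(key)
}

fn main() {
    assert_eq!(word_count_key(3, &[0, 1, 2]), Some(vec![0, 3, 3]));
    assert_eq!(word_count_key(3, &[0, 25]), None); // too many words, not indexed
}
```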
| @@ -0,0 +1,46 @@ | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
| use std::iter::FromIterator; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Extracts the word and the documents ids where this word appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted words and | ||||
| /// documents ids from the given chunk of docid word positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_word_docids<R: io::Read>( | ||||
|     mut docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_docids_sorter = create_sorter( | ||||
|         merge_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut value_buffer = Vec::new(); | ||||
|     while let Some((key, _value)) = docid_word_positions.next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let bitmap = RoaringBitmap::from_iter(Some(document_id)); | ||||
|         serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; | ||||
|         word_docids_sorter.insert(word_bytes, &value_buffer)?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(word_docids_sorter, indexer) | ||||
| } | ||||
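The sorter above is created with `merge_roaring_bitmaps`, whose body is defined in the indexer's helpers module and is not shown in this diff. The sketch below is a hypothetical union-style merge consistent with the values written here (one serialized `RoaringBitmap` per document id); the signature and the error type are simplified and may differ from the real `MergeFn` alias.

```rust
// Hypothetical union merge over serialized RoaringBitmap values (simplified signature).
use std::borrow::Cow;

use roaring::RoaringBitmap;

fn merge_roaring_bitmaps<'a>(_word: &[u8], values: &[Cow<'a, [u8]>]) -> std::io::Result<Cow<'a, [u8]>> {
    let mut merged = RoaringBitmap::new();
    for value in values {
        merged |= RoaringBitmap::deserialize_from(value.as_ref())?;
    }
    let mut buffer = Vec::new();
    merged.serialize_into(&mut buffer)?;
    Ok(Cow::Owned(buffer))
}
```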
| @@ -0,0 +1,51 @@ | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::{DocumentId, Result}; | ||||
|  | ||||
| /// Extracts the word positions and the documents ids where this word appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted words at positions and | ||||
| /// documents ids from the given chunk of docid word positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_word_level_position_docids<R: io::Read>( | ||||
|     mut docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_level_position_docids_sorter = create_sorter( | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     while let Some((key, value)) = docid_word_positions.next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = DocumentId::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word_bytes); | ||||
|             key_buffer.push(0); // tree level | ||||
|  | ||||
|             // Levels are composed of left and right bounds. | ||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||
|  | ||||
|             word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(word_level_position_docids_sorter, indexer) | ||||
| } | ||||
| @@ -0,0 +1,174 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::collections::{BinaryHeap, HashMap}; | ||||
| use std::fs::File; | ||||
| use std::{cmp, io, mem, str, vec}; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, MergeFn, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::proximity::{positions_proximity, MAX_DISTANCE}; | ||||
| use crate::{DocumentId, Result}; | ||||
|  | ||||
| /// Extracts the best proximity between pairs of words and the documents ids where this pair appears. | ||||
| /// | ||||
| /// Returns a grenad reader with the list of extracted word pairs proximities and | ||||
| /// documents ids from the given chunk of docid word positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_word_pair_proximity_docids<R: io::Read>( | ||||
|     mut docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_pair_proximity_docids_sorter = create_sorter( | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|m| m / 2), | ||||
|     ); | ||||
|  | ||||
|     // This heap is assumed to not consume a lot of memory. | ||||
|     let mut document_word_positions_heap = BinaryHeap::new(); | ||||
|     let mut current_document_id = None; | ||||
|  | ||||
|     while let Some((key, value)) = docid_word_positions.next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|             .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|         let word = str::from_utf8(word_bytes)?; | ||||
|  | ||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); | ||||
|         if curr_document_id != document_id { | ||||
|             let document_word_positions_heap = mem::take(&mut document_word_positions_heap); | ||||
|             document_word_positions_into_sorter( | ||||
|                 curr_document_id, | ||||
|                 document_word_positions_heap, | ||||
|                 &mut word_pair_proximity_docids_sorter, | ||||
|             )?; | ||||
|             current_document_id = Some(document_id); | ||||
|         } | ||||
|  | ||||
|         let word = word.to_string(); | ||||
|         let mut iter = read_u32_ne_bytes(value).collect::<Vec<_>>().into_iter(); | ||||
|         if let Some(position) = iter.next() { | ||||
|             document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Some(document_id) = current_document_id { | ||||
|         // We must make sure that we don't lose the current document's word | ||||
|         // positions heap if we break because we reached the end of the chunk. | ||||
|         let document_word_positions_heap = mem::take(&mut document_word_positions_heap); | ||||
|         document_word_positions_into_sorter( | ||||
|             document_id, | ||||
|             document_word_positions_heap, | ||||
|             &mut word_pair_proximity_docids_sorter, | ||||
|         )?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(word_pair_proximity_docids_sorter, indexer) | ||||
| } | ||||
|  | ||||
| /// Fills the sorter with all pairs of words and the shortest proximity, between 1 and 7 inclusive, found between them. | ||||
| /// | ||||
| /// This list is used by the engine to find the documents containing words that are | ||||
| /// close to each other. | ||||
| fn document_word_positions_into_sorter<'b>( | ||||
|     document_id: DocumentId, | ||||
|     mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>, | ||||
|     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||
| ) -> Result<()> { | ||||
|     let mut word_pair_proximity = HashMap::new(); | ||||
|     let mut ordered_peeked_word_positions = Vec::new(); | ||||
|     while !word_positions_heap.is_empty() { | ||||
|         while let Some(peeked_word_position) = word_positions_heap.pop() { | ||||
|             ordered_peeked_word_positions.push(peeked_word_position); | ||||
|             if ordered_peeked_word_positions.len() == 7 { | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { | ||||
|             for PeekedWordPosition { word, position, .. } in tail { | ||||
|                 let prox = positions_proximity(head.position, *position); | ||||
|                 if prox > 0 && prox < MAX_DISTANCE { | ||||
|                     word_pair_proximity | ||||
|                         .entry((head.word.clone(), word.clone())) | ||||
|                         .and_modify(|p| { | ||||
|                             *p = cmp::min(*p, prox); | ||||
|                         }) | ||||
|                         .or_insert(prox); | ||||
|  | ||||
|                     // We also compute the inverse proximity. | ||||
|                     let prox = prox + 1; | ||||
|                     if prox < MAX_DISTANCE { | ||||
|                         word_pair_proximity | ||||
|                             .entry((word.clone(), head.word.clone())) | ||||
|                             .and_modify(|p| { | ||||
|                                 *p = cmp::min(*p, prox); | ||||
|                             }) | ||||
|                             .or_insert(prox); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Push the tail in the heap. | ||||
|             let tail_iter = ordered_peeked_word_positions.drain(1..); | ||||
|             word_positions_heap.extend(tail_iter); | ||||
|  | ||||
|             // Advance the head and push it in the heap. | ||||
|             if let Some(mut head) = ordered_peeked_word_positions.pop() { | ||||
|                 if let Some(next_position) = head.iter.next() { | ||||
|                     word_positions_heap.push(PeekedWordPosition { | ||||
|                         word: head.word, | ||||
|                         position: next_position, | ||||
|                         iter: head.iter, | ||||
|                     }); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     for ((w1, w2), prox) in word_pair_proximity { | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(w1.as_bytes()); | ||||
|         key_buffer.push(0); | ||||
|         key_buffer.extend_from_slice(w2.as_bytes()); | ||||
|         key_buffer.push(prox as u8); | ||||
|  | ||||
|         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| struct PeekedWordPosition<I> { | ||||
|     word: String, | ||||
|     position: u32, | ||||
|     iter: I, | ||||
| } | ||||
|  | ||||
| impl<I> Ord for PeekedWordPosition<I> { | ||||
|     fn cmp(&self, other: &Self) -> Ordering { | ||||
|         self.position.cmp(&other.position).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<I> PartialOrd for PeekedWordPosition<I> { | ||||
|     fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | ||||
|         Some(self.cmp(other)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<I> Eq for PeekedWordPosition<I> {} | ||||
|  | ||||
| impl<I> PartialEq for PeekedWordPosition<I> { | ||||
|     fn eq(&self, other: &Self) -> bool { | ||||
|         self.position == other.position | ||||
|     } | ||||
| } | ||||
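
The Ord implementation above is reversed on purpose: BinaryHeap is a max-heap, so reversing the comparison makes it pop the smallest word position first. A minimal, self-contained sketch of that trick, with a stripped-down Peeked type standing in for PeekedWordPosition:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Stand-in for PeekedWordPosition: only the position drives the ordering.
struct Peeked {
    position: u32,
}

impl Ord for Peeked {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed, so the max-heap BinaryHeap pops the *smallest* position first.
        self.position.cmp(&other.position).reverse()
    }
}

impl PartialOrd for Peeked {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Peeked {
    fn eq(&self, other: &Self) -> bool {
        self.position == other.position
    }
}

impl Eq for Peeked {}

fn main() {
    let mut heap: BinaryHeap<Peeked> =
        [7u32, 2, 5].iter().map(|&position| Peeked { position }).collect();
    assert_eq!(heap.pop().map(|p| p.position), Some(2)); // lowest position comes out first
    assert_eq!(heap.pop().map(|p| p.position), Some(5));
}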
							
								
								
									
230  milli/src/update/index_documents/extract/mod.rs  Normal file
									
								
							| @@ -0,0 +1,230 @@ | ||||
| mod extract_docid_word_positions; | ||||
| mod extract_facet_number_docids; | ||||
| mod extract_facet_string_docids; | ||||
| mod extract_fid_docid_facet_values; | ||||
| mod extract_fid_word_count_docids; | ||||
| mod extract_word_docids; | ||||
| mod extract_word_level_position_docids; | ||||
| mod extract_word_pair_proximity_docids; | ||||
|  | ||||
| use std::collections::HashSet; | ||||
| use std::fs::File; | ||||
|  | ||||
| use crossbeam_channel::Sender; | ||||
| use log::debug; | ||||
| use rayon::prelude::*; | ||||
|  | ||||
| use self::extract_docid_word_positions::extract_docid_word_positions; | ||||
| use self::extract_facet_number_docids::extract_facet_number_docids; | ||||
| use self::extract_facet_string_docids::extract_facet_string_docids; | ||||
| use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; | ||||
| use self::extract_fid_word_count_docids::extract_fid_word_count_docids; | ||||
| use self::extract_word_docids::extract_word_docids; | ||||
| use self::extract_word_level_position_docids::extract_word_level_position_docids; | ||||
| use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | ||||
| use super::helpers::{ | ||||
|     into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, | ||||
|     merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, | ||||
| }; | ||||
| use super::{helpers, TypedChunk}; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extracts data for each database from the obkv documents in parallel. | ||||
| /// Sends the data as grenad files over the provided Sender. | ||||
| pub(crate) fn data_from_obkv_documents( | ||||
|     obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     searchable_fields: Option<HashSet<FieldId>>, | ||||
|     faceted_fields: HashSet<FieldId>, | ||||
|     stop_words: Option<fst::Set<&[u8]>>, | ||||
| ) -> Result<()> { | ||||
|     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks | ||||
|         .par_bridge() | ||||
|         .map(|result| { | ||||
|             extract_documents_data( | ||||
|                 result, | ||||
|                 indexer, | ||||
|                 lmdb_writer_sx.clone(), | ||||
|                 &searchable_fields, | ||||
|                 &faceted_fields, | ||||
|                 &stop_words, | ||||
|             ) | ||||
|         }) | ||||
|         .collect(); | ||||
|  | ||||
|     let ( | ||||
|         docid_word_positions_chunks, | ||||
|         (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), | ||||
|     ) = result?; | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_pair_proximity_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordPairProximityDocids, | ||||
|         "word-pair-proximity-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_fid_word_count_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdWordcountDocids, | ||||
|         "field-id-wordcount-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_docids, | ||||
|         merge_roaring_bitmaps, | ||||
|         TypedChunk::WordDocids, | ||||
|         "word-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_level_position_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordLevelPositionDocids, | ||||
|         "word-level-position-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_fid_facet_strings_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_facet_string_docids, | ||||
|         keep_first_prefix_value_merge_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdFacetStringDocids, | ||||
|         "field-id-facet-string-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task( | ||||
|         docid_fid_facet_numbers_chunks.clone(), | ||||
|         indexer.clone(), | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_facet_number_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdFacetNumberDocids, | ||||
|         "field-id-facet-number-docids", | ||||
|     ); | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Spawns a new task to extract data for a specific DB using extract_fn. | ||||
| /// The generated grenad chunks are merged using merge_fn, and the merged result | ||||
| /// is serialized as a TypedChunk using serialize_fn before being sent | ||||
| /// through lmdb_writer_sx. | ||||
| fn spawn_extraction_task<FE, FS>( | ||||
|     chunks: Vec<grenad::Reader<CursorClonableMmap>>, | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     extract_fn: FE, | ||||
|     merge_fn: MergeFn, | ||||
|     serialize_fn: FS, | ||||
|     name: &'static str, | ||||
| ) where | ||||
|     FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>> | ||||
|         + Sync | ||||
|         + Send | ||||
|         + 'static, | ||||
|     FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static, | ||||
| { | ||||
|     rayon::spawn(move || { | ||||
|         let chunks: Result<Vec<_>> = | ||||
|             chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); | ||||
|         rayon::spawn(move || match chunks { | ||||
|             Ok(chunks) => { | ||||
|                 debug!("merge {} database", name); | ||||
|                 let reader = merge_readers(chunks, merge_fn, indexer); | ||||
|                 let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); | ||||
|             } | ||||
|             Err(e) => { | ||||
|                 let _ = lmdb_writer_sx.send(Err(e)); | ||||
|             } | ||||
|         }) | ||||
|     }); | ||||
| } | ||||
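
spawn_extraction_task fans the per-chunk extraction out on rayon and reports the merged result (or the first error) back over a crossbeam channel. The sketch below shows the same fan-out/report-back shape on toy data; the u32 values and the summing "merge" step are illustrative stand-ins, not milli's types:

use crossbeam_channel::unbounded;
use rayon::prelude::*;

fn main() {
    let (tx, rx) = unbounded::<Result<u32, String>>();
    let chunks: Vec<u32> = (0..4).collect();

    rayon::spawn(move || {
        // "Extract" each chunk in parallel, keeping the first error if any.
        let extracted: Result<Vec<u32>, String> =
            chunks.into_par_iter().map(|chunk| Ok(chunk * 2)).collect();
        match extracted {
            Ok(values) => {
                // The sum stands in for the merge_readers + serialize_fn step.
                let _ = tx.send(Ok(values.into_iter().sum()));
            }
            Err(e) => {
                let _ = tx.send(Err(e));
            }
        }
    });

    // This loop plays the role of the LMDB writer thread; it ends once all
    // senders have been dropped.
    for message in rx {
        println!("received: {:?}", message);
    }
}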
|  | ||||
| /// Extracts chunked data and sends it through the lmdb_writer_sx sender: | ||||
| /// - documents | ||||
| /// - documents_ids | ||||
| /// - docid_word_positions | ||||
| /// - docid_fid_facet_numbers | ||||
| /// - docid_fid_facet_strings | ||||
| fn extract_documents_data( | ||||
|     documents_chunk: Result<grenad::Reader<File>>, | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     faceted_fields: &HashSet<FieldId>, | ||||
|     stop_words: &Option<fst::Set<&[u8]>>, | ||||
| ) -> Result<( | ||||
|     grenad::Reader<CursorClonableMmap>, | ||||
|     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>), | ||||
| )> { | ||||
|     let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?; | ||||
|  | ||||
|     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); | ||||
|  | ||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||
|         rayon::join( | ||||
|             || { | ||||
|                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( | ||||
|                     documents_chunk.clone(), | ||||
|                     indexer.clone(), | ||||
|                     searchable_fields, | ||||
|                     stop_words.as_ref(), | ||||
|                 )?; | ||||
|  | ||||
|                 // send documents_ids to DB writer | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); | ||||
|  | ||||
|                 // send docid_word_positions_chunk to DB writer | ||||
|                 let docid_word_positions_chunk = | ||||
|                     unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; | ||||
|                 let _ = lmdb_writer_sx | ||||
|                     .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); | ||||
|  | ||||
|                 Ok(docid_word_positions_chunk) | ||||
|             }, | ||||
|             || { | ||||
|                 let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = | ||||
|                     extract_fid_docid_facet_values( | ||||
|                         documents_chunk.clone(), | ||||
|                         indexer.clone(), | ||||
|                         faceted_fields, | ||||
|                     )?; | ||||
|  | ||||
|                 // send docid_fid_facet_numbers_chunk to DB writer | ||||
|                 let docid_fid_facet_numbers_chunk = | ||||
|                     unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; | ||||
|  | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( | ||||
|                     docid_fid_facet_numbers_chunk.clone(), | ||||
|                 ))); | ||||
|  | ||||
|                 // send docid_fid_facet_strings_chunk to DB writer | ||||
|                 let docid_fid_facet_strings_chunk = | ||||
|                     unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? }; | ||||
|  | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( | ||||
|                     docid_fid_facet_strings_chunk.clone(), | ||||
|                 ))); | ||||
|  | ||||
|                 Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) | ||||
|             }, | ||||
|         ); | ||||
|  | ||||
|     Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) | ||||
| } | ||||
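
extract_documents_data runs the word-position extraction and the facet extraction side by side with rayon::join, then surfaces either side's error afterwards with `?`. A tiny sketch of that pattern, with word counts standing in for grenad readers:

fn main() -> Result<(), String> {
    // Both closures run in parallel; each returns its own Result.
    let (words, facets): (Result<usize, String>, Result<usize, String>) = rayon::join(
        || Ok("hello world".split_whitespace().count()),
        || Ok(3),
    );

    // Errors are only surfaced once both sides are done, as in the code above.
    let (words, facets) = (words?, facets?);
    assert_eq!((words, facets), (2, 3));
    Ok(())
}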
							
								
								
									
24  milli/src/update/index_documents/helpers/clonable_mmap.rs  Normal file
									
								
							| @@ -0,0 +1,24 @@ | ||||
| use std::sync::Arc; | ||||
|  | ||||
| use memmap::Mmap; | ||||
|  | ||||
| /// Wrapper around Mmap that allows virtually cloning grenad chunks | ||||
| /// in a parallel process like indexing. | ||||
| #[derive(Debug, Clone)] | ||||
| pub struct ClonableMmap { | ||||
|     inner: Arc<Mmap>, | ||||
| } | ||||
|  | ||||
| impl AsRef<[u8]> for ClonableMmap { | ||||
|     fn as_ref(&self) -> &[u8] { | ||||
|         self.inner.as_ref() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl From<Mmap> for ClonableMmap { | ||||
|     fn from(inner: Mmap) -> ClonableMmap { | ||||
|         ClonableMmap { inner: Arc::new(inner) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>; | ||||
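
Cloning a ClonableMmap only bumps an Arc reference count, so several extraction tasks can read the same grenad chunk without copying it. A minimal sketch of the same idea, with a hypothetical SharedBytes type and a Vec<u8> standing in for the memory map:

use std::sync::Arc;
use std::thread;

// Hypothetical stand-in for ClonableMmap, with a Vec<u8> in place of the Mmap.
#[derive(Debug, Clone)]
struct SharedBytes {
    inner: Arc<Vec<u8>>,
}

impl AsRef<[u8]> for SharedBytes {
    fn as_ref(&self) -> &[u8] {
        self.inner.as_ref()
    }
}

fn main() {
    let shared = SharedBytes { inner: Arc::new(b"grenad chunk".to_vec()) };

    let handles: Vec<_> = (0..2)
        .map(|_| {
            // Cloning is cheap: only the reference count moves, not the bytes.
            let chunk = shared.clone();
            thread::spawn(move || chunk.as_ref().len())
        })
        .collect();

    for handle in handles {
        assert_eq!(handle.join().unwrap(), 12);
    }
}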
							
								
								
									
262  milli/src/update/index_documents/helpers/grenad_helpers.rs  Normal file
									
								
							| @@ -0,0 +1,262 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::fs::File; | ||||
| use std::io::{self, Seek, SeekFrom}; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use grenad::{CompressionType, MergerIter, Reader, Sorter}; | ||||
| use heed::types::ByteSlice; | ||||
| use log::debug; | ||||
|  | ||||
| use super::{ClonableMmap, MergeFn}; | ||||
| use crate::error::InternalError; | ||||
| use crate::update::index_documents::WriteMethod; | ||||
| use crate::Result; | ||||
|  | ||||
| pub type CursorClonableMmap = io::Cursor<ClonableMmap>; | ||||
|  | ||||
| pub fn create_writer<R: io::Write>( | ||||
|     typ: grenad::CompressionType, | ||||
|     level: Option<u32>, | ||||
|     file: R, | ||||
| ) -> io::Result<grenad::Writer<R>> { | ||||
|     let mut builder = grenad::Writer::builder(); | ||||
|     builder.compression_type(typ); | ||||
|     if let Some(level) = level { | ||||
|         builder.compression_level(level); | ||||
|     } | ||||
|     builder.build(file) | ||||
| } | ||||
|  | ||||
| pub fn create_sorter( | ||||
|     merge: MergeFn, | ||||
|     chunk_compression_type: grenad::CompressionType, | ||||
|     chunk_compression_level: Option<u32>, | ||||
|     max_nb_chunks: Option<usize>, | ||||
|     max_memory: Option<usize>, | ||||
| ) -> grenad::Sorter<MergeFn> { | ||||
|     let mut builder = grenad::Sorter::builder(merge); | ||||
|     builder.chunk_compression_type(chunk_compression_type); | ||||
|     if let Some(level) = chunk_compression_level { | ||||
|         builder.chunk_compression_level(level); | ||||
|     } | ||||
|     if let Some(nb_chunks) = max_nb_chunks { | ||||
|         builder.max_nb_chunks(nb_chunks); | ||||
|     } | ||||
|     if let Some(memory) = max_memory { | ||||
|         builder.dump_threshold(memory); | ||||
|         builder.allow_realloc(false); | ||||
|     } | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| pub fn sorter_into_reader( | ||||
|     sorter: grenad::Sorter<MergeFn>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let mut writer = tempfile::tempfile().and_then(|file| { | ||||
|         create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) | ||||
|     })?; | ||||
|     sorter.write_into(&mut writer)?; | ||||
|     Ok(writer_into_reader(writer)?) | ||||
| } | ||||
|  | ||||
| pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> { | ||||
|     let mut file = writer.into_inner()?; | ||||
|     file.seek(SeekFrom::Start(0))?; | ||||
|     grenad::Reader::new(file).map_err(Into::into) | ||||
| } | ||||
|  | ||||
| pub unsafe fn into_clonable_grenad( | ||||
|     reader: grenad::Reader<File>, | ||||
| ) -> Result<grenad::Reader<CursorClonableMmap>> { | ||||
|     let file = reader.into_inner(); | ||||
|     let mmap = memmap::Mmap::map(&file)?; | ||||
|     let cursor = io::Cursor::new(ClonableMmap::from(mmap)); | ||||
|     let reader = grenad::Reader::new(cursor)?; | ||||
|     Ok(reader) | ||||
| } | ||||
|  | ||||
| pub fn merge_readers<R: io::Read>( | ||||
|     readers: Vec<grenad::Reader<R>>, | ||||
|     merge_fn: MergeFn, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let mut merger_builder = grenad::MergerBuilder::new(merge_fn); | ||||
|     merger_builder.extend(readers); | ||||
|     let merger = merger_builder.build(); | ||||
|     let mut writer = tempfile::tempfile().and_then(|file| { | ||||
|         create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) | ||||
|     })?; | ||||
|     merger.write_into(&mut writer)?; | ||||
|     let reader = writer_into_reader(writer)?; | ||||
|     Ok(reader) | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct GrenadParameters { | ||||
|     pub chunk_compression_type: CompressionType, | ||||
|     pub chunk_compression_level: Option<u32>, | ||||
|     pub max_memory: Option<usize>, | ||||
|     pub max_nb_chunks: Option<usize>, | ||||
| } | ||||
|  | ||||
| impl Default for GrenadParameters { | ||||
|     fn default() -> Self { | ||||
|         Self { | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             max_memory: None, | ||||
|             max_nb_chunks: None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl GrenadParameters { | ||||
|     /// This function uses the number of threads in the current thread pool to compute the value. | ||||
|     /// It should be called from inside a rayon thread pool; | ||||
|     /// otherwise, it falls back to the global number of threads. | ||||
|     pub fn max_memory_by_thread(&self) -> Option<usize> { | ||||
|         self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Returns an iterator that outputs grenad readers of obkv documents | ||||
| /// with a maximum size of approximately `documents_chunk_size`. | ||||
| /// | ||||
| /// The grenad obkv entries are composed of an incremental document id big-endian | ||||
| /// encoded as the key and an obkv object with a `u8` for the field as the key | ||||
| /// and a simple UTF-8 encoded string as the value. | ||||
| pub fn grenad_obkv_into_chunks<R: io::Read>( | ||||
|     mut reader: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     documents_chunk_size: usize, | ||||
| ) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> { | ||||
|     let mut continue_reading = true; | ||||
|  | ||||
|     let indexer_clone = indexer.clone(); | ||||
|     let mut transposer = move || { | ||||
|         if !continue_reading { | ||||
|             return Ok(None); | ||||
|         } | ||||
|  | ||||
|         let mut current_chunk_size = 0u64; | ||||
|         let mut obkv_documents = tempfile::tempfile().and_then(|file| { | ||||
|             create_writer( | ||||
|                 indexer_clone.chunk_compression_type, | ||||
|                 indexer_clone.chunk_compression_level, | ||||
|                 file, | ||||
|             ) | ||||
|         })?; | ||||
|  | ||||
|         while let Some((document_id, obkv)) = reader.next()? { | ||||
|             obkv_documents.insert(document_id, obkv)?; | ||||
|             current_chunk_size += document_id.len() as u64 + obkv.len() as u64; | ||||
|  | ||||
|             if current_chunk_size >= documents_chunk_size as u64 { | ||||
|                 return writer_into_reader(obkv_documents).map(Some); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         continue_reading = false; | ||||
|         writer_into_reader(obkv_documents).map(Some) | ||||
|     }; | ||||
|  | ||||
|     Ok(std::iter::from_fn(move || transposer().transpose())) | ||||
| } | ||||
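
The chunking iterator above is built from a stateful closure that returns Result<Option<Reader>> and std::iter::from_fn, with .transpose() flipping each step into the Option<Result<Reader>> shape that Iterator expects. A small self-contained sketch of that flip, with u32 values standing in for readers:

fn main() {
    let mut remaining = vec![3u32, 2, 1];

    // The closure naturally produces Result<Option<T>, E>: an error, a value,
    // or None once everything has been consumed.
    let mut next_chunk = move || -> Result<Option<u32>, String> { Ok(remaining.pop()) };

    // Iterator::next wants Option<Result<T, E>>, hence the .transpose() flip.
    let chunks: Result<Vec<u32>, String> =
        std::iter::from_fn(move || next_chunk().transpose()).collect();

    assert_eq!(chunks, Ok(vec![1, 2, 3]));
}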
|  | ||||
| pub fn write_into_lmdb_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     mut reader: Reader<File>, | ||||
|     merge: MergeFn, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> { | ||||
|     debug!("Writing MTBL stores..."); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     match method { | ||||
|         WriteMethod::Append => { | ||||
|             let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|             while let Some((k, v)) = reader.next()? { | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { out_iter.append(k, v)? }; | ||||
|             } | ||||
|         } | ||||
|         WriteMethod::GetMergePut => { | ||||
|             while let Some((k, v)) = reader.next()? { | ||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||
|                 match iter.next().transpose()? { | ||||
|                     Some((key, old_val)) if key == k => { | ||||
|                         let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; | ||||
|                         let val = merge(k, &vals)?; | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.put_current(k, &val)? }; | ||||
|                     } | ||||
|                     _ => { | ||||
|                         drop(iter); | ||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
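
WriteMethod::GetMergePut looks the key up first and merges the old and new values before writing, whereas Append assumes keys arrive in order and are new. A rough sketch of the get-merge-put idea, with a BTreeMap standing in for the LMDB database and byte concatenation standing in for the MergeFn:

use std::collections::BTreeMap;

// If the key already exists, merge the old and new values; otherwise insert.
fn get_merge_put(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], value: &[u8]) {
    match db.get(key) {
        Some(old) => {
            let mut merged = old.clone();
            merged.extend_from_slice(value); // stand-in for the MergeFn
            db.insert(key.to_vec(), merged);
        }
        None => {
            db.insert(key.to_vec(), value.to_vec());
        }
    }
}

fn main() {
    let mut db = BTreeMap::new();
    get_merge_put(&mut db, b"word", b"1");
    get_merge_put(&mut db, b"word", b"2");
    assert_eq!(db.get(&b"word"[..]), Some(&b"12".to_vec()));
}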
|  | ||||
| pub fn sorter_into_lmdb_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     sorter: Sorter<MergeFn>, | ||||
|     merge: MergeFn, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> { | ||||
|     debug!("Writing MTBL sorter..."); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge, method)?; | ||||
|  | ||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn merger_iter_into_lmdb_database<R: io::Read>( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     mut sorter: MergerIter<R, MergeFn>, | ||||
|     merge: MergeFn, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> { | ||||
|     match method { | ||||
|         WriteMethod::Append => { | ||||
|             let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|             while let Some((k, v)) = sorter.next()? { | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { out_iter.append(k, v)? }; | ||||
|             } | ||||
|         } | ||||
|         WriteMethod::GetMergePut => { | ||||
|             while let Some((k, v)) = sorter.next()? { | ||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||
|                 match iter.next().transpose()? { | ||||
|                     Some((key, old_val)) if key == k => { | ||||
|                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; | ||||
|                         let val = merge(k, &vals).map_err(|_| { | ||||
|                             // TODO just wrap this error? | ||||
|                             InternalError::IndexingMergingKeys { process: "get-put-merge" } | ||||
|                         })?; | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.put_current(k, &val)? }; | ||||
|                     } | ||||
|                     _ => { | ||||
|                         drop(iter); | ||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
							
								
								
									
130  milli/src/update/index_documents/helpers/merge_functions.rs  Normal file
									
								
							| @@ -0,0 +1,130 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::io; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::read_u32_ne_bytes; | ||||
| use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::Result; | ||||
|  | ||||
| pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>; | ||||
|  | ||||
| pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     if values.len() == 1 { | ||||
|         Ok(values[0].clone()) | ||||
|     } else { | ||||
|         let capacity = values.iter().map(|v| v.len()).sum::<usize>(); | ||||
|         let mut output = Vec::with_capacity(capacity); | ||||
|         values.iter().for_each(|integers| output.extend_from_slice(integers)); | ||||
|         Ok(Cow::Owned(output)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap { | ||||
|     read_u32_ne_bytes(slice).collect() | ||||
| } | ||||
|  | ||||
| pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> { | ||||
|     buffer.clear(); | ||||
|     buffer.reserve(bitmap.serialized_size()); | ||||
|     bitmap.serialize_into(buffer) | ||||
| } | ||||
|  | ||||
| pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     if values.len() == 1 { | ||||
|         Ok(values[0].clone()) | ||||
|     } else { | ||||
|         let merged = values | ||||
|             .iter() | ||||
|             .map(AsRef::as_ref) | ||||
|             .map(RoaringBitmap::deserialize_from) | ||||
|             .map(StdResult::unwrap) | ||||
|             .reduce(|a, b| a | b) | ||||
|             .unwrap(); | ||||
|         let mut buffer = Vec::new(); | ||||
|         serialize_roaring_bitmap(&merged, &mut buffer)?; | ||||
|         Ok(Cow::Owned(buffer)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     if values.len() == 1 { | ||||
|         Ok(values[0].clone()) | ||||
|     } else { | ||||
|         let original = decode_prefix_string(&values[0]).unwrap().0; | ||||
|         let merged_bitmaps = values | ||||
|             .iter() | ||||
|             .map(AsRef::as_ref) | ||||
|             .map(decode_prefix_string) | ||||
|             .map(Option::unwrap) | ||||
|             .map(|(_, bitmap_bytes)| bitmap_bytes) | ||||
|             .map(RoaringBitmap::deserialize_from) | ||||
|             .map(StdResult::unwrap) | ||||
|             .reduce(|a, b| a | b) | ||||
|             .unwrap(); | ||||
|  | ||||
|         let cap = std::mem::size_of::<u16>() + original.len() + merged_bitmaps.serialized_size(); | ||||
|         let mut buffer = Vec::with_capacity(cap); | ||||
|         encode_prefix_string(original, &mut buffer)?; | ||||
|         merged_bitmaps.serialize_into(&mut buffer)?; | ||||
|         Ok(Cow::Owned(buffer)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     Ok(values[0].clone()) | ||||
| } | ||||
|  | ||||
| /// Only the last value associated with an id is kept. | ||||
| pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     Ok(obkvs.last().unwrap().clone()) | ||||
| } | ||||
|  | ||||
| /// Merges all the obkvs in the order we see them. | ||||
| pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     Ok(obkvs | ||||
|         .into_iter() | ||||
|         .cloned() | ||||
|         .reduce(|acc, current| { | ||||
|             let first = obkv::KvReader::new(&acc); | ||||
|             let second = obkv::KvReader::new(¤t); | ||||
|             let mut buffer = Vec::new(); | ||||
|             merge_two_obkvs(first, second, &mut buffer); | ||||
|             Cow::from(buffer) | ||||
|         }) | ||||
|         .unwrap()) | ||||
| } | ||||
|  | ||||
| pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) { | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     buffer.clear(); | ||||
|  | ||||
|     let mut writer = obkv::KvWriter::new(buffer); | ||||
|     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||
|         match eob { | ||||
|             Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer.finish().unwrap(); | ||||
| } | ||||
|  | ||||
| pub fn merge_cbo_roaring_bitmaps<'a>( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     if values.len() == 1 { | ||||
|         Ok(values[0].clone()) | ||||
|     } else { | ||||
|         let mut vec = Vec::new(); | ||||
|         CboRoaringBitmapCodec::merge_into(values, &mut vec)?; | ||||
|         Ok(Cow::from(vec)) | ||||
|     } | ||||
| } | ||||
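
All of these merge functions share the MergeFn shape: given one key and every value grenad accumulated for it, produce a single merged value. A self-contained sketch of that contract with local stand-in aliases (a String error instead of milli's Result type) and a toy deduplicating merge:

use std::borrow::Cow;

// Local stand-ins for the real MergeFn/Result aliases, just to show the shape.
type MergeResult<T> = Result<T, String>;
type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> MergeResult<Cow<'a, [u8]>>;

// Toy merge: concatenate, sort and deduplicate the bytes of every value,
// standing in for the roaring-bitmap unions above.
fn merge_dedup<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> MergeResult<Cow<'a, [u8]>> {
    if values.len() == 1 {
        return Ok(values[0].clone());
    }
    let mut merged: Vec<u8> = values.iter().flat_map(|v| v.iter().copied()).collect();
    merged.sort_unstable();
    merged.dedup();
    Ok(Cow::Owned(merged))
}

fn main() {
    let merge: MergeFn = merge_dedup;
    let values = [Cow::Borrowed(&[1u8, 3][..]), Cow::Borrowed(&[2u8, 3][..])];
    assert_eq!(merge(b"word", &values).unwrap().as_ref(), &[1u8, 2, 3][..]);
}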
							
								
								
									
45  milli/src/update/index_documents/helpers/mod.rs  Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| mod clonable_mmap; | ||||
| mod grenad_helpers; | ||||
| mod merge_functions; | ||||
|  | ||||
| use std::convert::{TryFrom, TryInto}; | ||||
|  | ||||
| pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; | ||||
| pub use grenad_helpers::{ | ||||
|     create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers, | ||||
|     sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, | ||||
|     GrenadParameters, | ||||
| }; | ||||
| pub use merge_functions::{ | ||||
|     concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, | ||||
|     merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, | ||||
|     roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, | ||||
| }; | ||||
|  | ||||
| pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { | ||||
|     key.as_ref().len() <= 511 | ||||
| } | ||||
|  | ||||
| /// Divides one slice into two at an index, returns `None` if mid is out of bounds. | ||||
| pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { | ||||
|     if mid <= slice.len() { | ||||
|         Some(slice.split_at(mid)) | ||||
|     } else { | ||||
|         None | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Divides one slice into an array and the tail at an index, | ||||
| /// returns `None` if `N` is out of bounds. | ||||
| pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])> | ||||
| where | ||||
|     [T; N]: for<'a> TryFrom<&'a [T]>, | ||||
| { | ||||
|     let (head, tail) = try_split_at(slice, N)?; | ||||
|     let head = head.try_into().ok()?; | ||||
|     Some((head, tail)) | ||||
| } | ||||
|  | ||||
| pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ { | ||||
|     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) | ||||
| } | ||||
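
These helpers are used by the extractors to take DOCID_WORD_POSITIONS entries apart: the first four big-endian bytes of the key are the document id, the rest is the word, and the value is a run of native-endian u32 positions. A small sketch of that decoding using only the standard library (the literal key and positions are made up for illustration):

use std::convert::TryInto;

fn main() {
    // Four big-endian bytes of document id, then the UTF-8 word.
    let key = [0u8, 0, 0, 42, b'h', b'i'];
    let (docid_bytes, word_bytes) = key.split_at(4);
    let docid = u32::from_be_bytes(docid_bytes.try_into().unwrap());
    let word = std::str::from_utf8(word_bytes).unwrap();

    // The value is a flat run of native-endian u32 positions.
    let value: Vec<u8> = [1u32, 5, 9].iter().flat_map(|p| p.to_ne_bytes()).collect();
    let positions: Vec<u32> = value
        .chunks_exact(4)
        .map(|b| u32::from_ne_bytes(b.try_into().unwrap()))
        .collect();

    assert_eq!((docid, word, positions), (42, "hi", vec![1, 5, 9]));
}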
| @@ -1,106 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use fst::IntoStreamer; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::error::SerializationError; | ||||
| use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Only the last value associated with an id is kept. | ||||
| pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     Ok(obkvs.last().unwrap().clone().into_owned()) | ||||
| } | ||||
|  | ||||
| /// Merge all the obkvs in the order we see them. | ||||
| pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     let mut iter = obkvs.iter(); | ||||
|     let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); | ||||
|     Ok(iter.fold(first, |acc, current| { | ||||
|         let first = obkv::KvReader::new(&acc); | ||||
|         let second = obkv::KvReader::new(current); | ||||
|         let mut buffer = Vec::new(); | ||||
|         merge_two_obkvs(first, second, &mut buffer); | ||||
|         buffer | ||||
|     })) | ||||
| } | ||||
|  | ||||
| // Union of multiple FSTs | ||||
| pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     let fsts = values.iter().map(fst::Set::new).collect::<StdResult<Vec<_>, _>>()?; | ||||
|     let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); | ||||
|     let op = op_builder.r#union(); | ||||
|  | ||||
|     let mut build = fst::SetBuilder::memory(); | ||||
|     build.extend_stream(op.into_stream()).unwrap(); | ||||
|     Ok(build.into_inner().unwrap()) | ||||
| } | ||||
|  | ||||
| pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     Ok(values.first().unwrap().to_vec()) | ||||
| } | ||||
|  | ||||
| pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) { | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     buffer.clear(); | ||||
|  | ||||
|     let mut writer = obkv::KvWriter::new(buffer); | ||||
|     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||
|         match eob { | ||||
|             Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer.finish().unwrap(); | ||||
| } | ||||
|  | ||||
| pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     let (head, tail) = values.split_first().unwrap(); | ||||
|     let mut head = RoaringBitmap::deserialize_from(&head[..])?; | ||||
|  | ||||
|     for value in tail { | ||||
|         head |= RoaringBitmap::deserialize_from(&value[..])?; | ||||
|     } | ||||
|  | ||||
|     let mut vec = Vec::with_capacity(head.serialized_size()); | ||||
|     head.serialize_into(&mut vec)?; | ||||
|     Ok(vec) | ||||
| } | ||||
|  | ||||
| /// Uses the FacetStringLevelZeroValueCodec to merge the values. | ||||
| pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     let (head, tail) = values.split_first().unwrap(); | ||||
|     let (head_string, mut head_rb) = | ||||
|         FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&head[..]) | ||||
|             .ok_or(SerializationError::Decoding { db_name: None })?; | ||||
|  | ||||
|     for value in tail { | ||||
|         let (_string, rb) = | ||||
|             FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&value[..]) | ||||
|                 .ok_or(SerializationError::Decoding { db_name: None })?; | ||||
|         head_rb |= rb; | ||||
|     } | ||||
|  | ||||
|     FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&(head_string, head_rb)) | ||||
|         .map(|cow| cow.into_owned()) | ||||
|         .ok_or(SerializationError::Encoding { db_name: None }) | ||||
|         .map_err(Into::into) | ||||
| } | ||||
|  | ||||
| pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> { | ||||
|     let (head, tail) = values.split_first().unwrap(); | ||||
|     let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; | ||||
|  | ||||
|     for value in tail { | ||||
|         head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; | ||||
|     } | ||||
|  | ||||
|     let mut vec = Vec::new(); | ||||
|     CboRoaringBitmapCodec::serialize_into(&head, &mut vec); | ||||
|     Ok(vec) | ||||
| } | ||||
| @@ -1,240 +1,44 @@ | ||||
| use std::borrow::Cow; | ||||
| mod extract; | ||||
| mod helpers; | ||||
| mod transform; | ||||
| mod typed_chunk; | ||||
|  | ||||
| use std::collections::HashSet; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; | ||||
| use std::io::{self, BufRead, BufReader}; | ||||
| use std::iter::FromIterator; | ||||
| use std::num::{NonZeroU32, NonZeroUsize}; | ||||
| use std::result::Result as StdResult; | ||||
| use std::str; | ||||
| use std::sync::mpsc::sync_channel; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use bstr::ByteSlice as _; | ||||
| use chrono::Utc; | ||||
| use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; | ||||
| use heed::types::ByteSlice; | ||||
| use log::{debug, error, info}; | ||||
| use memmap::Mmap; | ||||
| use rayon::prelude::*; | ||||
| use crossbeam_channel::{Receiver, Sender}; | ||||
| use grenad::{self, CompressionType}; | ||||
| use log::{debug, info}; | ||||
| use rayon::ThreadPool; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | ||||
|  | ||||
| pub use self::merge_function::{ | ||||
|     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, | ||||
|     tuple_string_cbo_roaring_bitmap_merge, | ||||
| pub use self::helpers::{ | ||||
|     create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, | ||||
|     sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn, | ||||
| }; | ||||
| use self::store::{Readers, Store}; | ||||
| use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | ||||
| pub use self::transform::{Transform, TransformOutput}; | ||||
| use super::UpdateBuilder; | ||||
| use crate::error::{Error, InternalError}; | ||||
| use crate::update::{ | ||||
|     Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, | ||||
|     Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, | ||||
|     WordsLevelPositions, WordsPrefixesFst, | ||||
| }; | ||||
| use crate::{Index, MergeFn, Result}; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| mod merge_function; | ||||
| mod store; | ||||
| mod transform; | ||||
| static MERGED_DATABASE_COUNT: usize = 7; | ||||
| static PREFIX_DATABASE_COUNT: usize = 5; | ||||
| static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; | ||||
|  | ||||
| #[derive(Debug, Serialize, Deserialize, Clone)] | ||||
| pub struct DocumentAdditionResult { | ||||
|     pub nb_documents: usize, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Copy, Clone)] | ||||
| pub enum WriteMethod { | ||||
|     Append, | ||||
|     GetMergePut, | ||||
| } | ||||
|  | ||||
| pub fn create_writer( | ||||
|     typ: CompressionType, | ||||
|     level: Option<u32>, | ||||
|     file: File, | ||||
| ) -> io::Result<Writer<File>> { | ||||
|     let mut builder = Writer::builder(); | ||||
|     builder.compression_type(typ); | ||||
|     if let Some(level) = level { | ||||
|         builder.compression_level(level); | ||||
|     } | ||||
|     builder.build(file) | ||||
| } | ||||
|  | ||||
| pub fn create_sorter<E>( | ||||
|     merge: MergeFn<E>, | ||||
|     chunk_compression_type: CompressionType, | ||||
|     chunk_compression_level: Option<u32>, | ||||
|     chunk_fusing_shrink_size: Option<u64>, | ||||
|     max_nb_chunks: Option<usize>, | ||||
|     max_memory: Option<usize>, | ||||
| ) -> Sorter<MergeFn<E>> { | ||||
|     let mut builder = Sorter::builder(merge); | ||||
|     if let Some(shrink_size) = chunk_fusing_shrink_size { | ||||
|         builder.file_fusing_shrink_size(shrink_size); | ||||
|     } | ||||
|     builder.chunk_compression_type(chunk_compression_type); | ||||
|     if let Some(level) = chunk_compression_level { | ||||
|         builder.chunk_compression_level(level); | ||||
|     } | ||||
|     if let Some(nb_chunks) = max_nb_chunks { | ||||
|         builder.max_nb_chunks(nb_chunks); | ||||
|     } | ||||
|     if let Some(memory) = max_memory { | ||||
|         builder.max_memory(memory); | ||||
|     } | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| pub fn writer_into_reader( | ||||
|     writer: Writer<File>, | ||||
|     shrink_size: Option<u64>, | ||||
| ) -> Result<Reader<FileFuse>> { | ||||
|     let mut file = writer.into_inner()?; | ||||
|     file.seek(SeekFrom::Start(0))?; | ||||
|     let file = if let Some(shrink_size) = shrink_size { | ||||
|         FileFuse::builder().shrink_size(shrink_size).build(file) | ||||
|     } else { | ||||
|         FileFuse::new(file) | ||||
|     }; | ||||
|     Reader::new(file).map_err(Into::into) | ||||
| } | ||||
|  | ||||
| pub fn merge_readers<E>( | ||||
|     sources: Vec<Reader<FileFuse>>, | ||||
|     merge: MergeFn<E>, | ||||
| ) -> Merger<FileFuse, MergeFn<E>> { | ||||
|     let mut builder = Merger::builder(merge); | ||||
|     builder.extend(sources); | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| pub fn merge_into_lmdb_database<E>( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     sources: Vec<Reader<FileFuse>>, | ||||
|     merge: MergeFn<E>, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     Error: From<E>, | ||||
| { | ||||
|     debug!("Merging {} MTBL stores...", sources.len()); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     let merger = merge_readers(sources, merge); | ||||
|     merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; | ||||
|  | ||||
|     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| pub fn write_into_lmdb_database<E>( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     mut reader: Reader<FileFuse>, | ||||
|     merge: MergeFn<E>, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     Error: From<E>, | ||||
| { | ||||
|     debug!("Writing MTBL stores..."); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     match method { | ||||
|         WriteMethod::Append => { | ||||
|             let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|             while let Some((k, v)) = reader.next()? { | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { out_iter.append(k, v)? }; | ||||
|             } | ||||
|         } | ||||
|         WriteMethod::GetMergePut => { | ||||
|             while let Some((k, v)) = reader.next()? { | ||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||
|                 match iter.next().transpose()? { | ||||
|                     Some((key, old_val)) if key == k => { | ||||
|                         let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; | ||||
|                         let val = merge(k, &vals)?; | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.put_current(k, &val)? }; | ||||
|                     } | ||||
|                     _ => { | ||||
|                         drop(iter); | ||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     debug!("MTBL stores merged in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| pub fn sorter_into_lmdb_database<E>( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     sorter: Sorter<MergeFn<E>>, | ||||
|     merge: MergeFn<E>, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     Error: From<E>, | ||||
|     Error: From<grenad::Error<E>>, | ||||
| { | ||||
|     debug!("Writing MTBL sorter..."); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; | ||||
|  | ||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn merger_iter_into_lmdb_database<R: io::Read, E>( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     mut sorter: MergerIter<R, MergeFn<E>>, | ||||
|     merge: MergeFn<E>, | ||||
|     method: WriteMethod, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     Error: From<E>, | ||||
| { | ||||
|     match method { | ||||
|         WriteMethod::Append => { | ||||
|             let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|             while let Some((k, v)) = sorter.next()? { | ||||
|                 // safety: we don't keep references from inside the LMDB database. | ||||
|                 unsafe { out_iter.append(k, v)? }; | ||||
|             } | ||||
|         } | ||||
|         WriteMethod::GetMergePut => { | ||||
|             while let Some((k, v)) = sorter.next()? { | ||||
|                 let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||
|                 match iter.next().transpose()? { | ||||
|                     Some((key, old_val)) if key == k => { | ||||
|                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; | ||||
|                         let val = merge(k, &vals).map_err(|_| { | ||||
|                             // TODO just wrap this error? | ||||
|                             InternalError::IndexingMergingKeys { process: "get-put-merge" } | ||||
|                         })?; | ||||
|                         // safety: we don't keep references from inside the LMDB database. | ||||
|                         unsafe { iter.put_current(k, &val)? }; | ||||
|                     } | ||||
|                     _ => { | ||||
|                         drop(iter); | ||||
|                         database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] | ||||
| #[non_exhaustive] | ||||
| pub enum IndexDocumentsMethod { | ||||
| @@ -247,6 +51,12 @@ pub enum IndexDocumentsMethod { | ||||
|     UpdateDocuments, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Copy, Clone)] | ||||
| pub enum WriteMethod { | ||||
|     Append, | ||||
|     GetMergePut, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] | ||||
| #[non_exhaustive] | ||||
| pub enum UpdateFormat { | ||||
| @@ -262,16 +72,15 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) log_every_n: Option<usize>, | ||||
|     pub(crate) documents_chunk_size: Option<usize>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     pub(crate) linked_hash_map_size: Option<usize>, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) thread_pool: Option<&'a ThreadPool>, | ||||
|     facet_level_group_size: Option<NonZeroUsize>, | ||||
|     facet_min_level_size: Option<NonZeroUsize>, | ||||
|     words_prefix_threshold: Option<f64>, | ||||
|     words_prefix_threshold: Option<u32>, | ||||
|     max_prefix_length: Option<usize>, | ||||
|     words_positions_level_group_size: Option<NonZeroU32>, | ||||
|     words_positions_min_level_size: Option<NonZeroU32>, | ||||
| @@ -291,12 +100,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             wtxn, | ||||
|             index, | ||||
|             log_every_n: None, | ||||
|             documents_chunk_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             linked_hash_map_size: None, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             thread_pool: None, | ||||
|             facet_level_group_size: None, | ||||
|             facet_min_level_size: None, | ||||
| @@ -327,6 +135,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         self.autogenerate_docids = false; | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("IndexDocuments::{}")] | ||||
|     pub fn execute<R, F>(self, reader: R, progress_callback: F) -> Result<DocumentAdditionResult> | ||||
|     where | ||||
|         R: io::Read, | ||||
| @@ -344,14 +153,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         let before_transform = Instant::now(); | ||||
|         let update_id = self.update_id; | ||||
|         let progress_callback = |step| progress_callback(step, update_id); | ||||
|  | ||||
|         let transform = Transform { | ||||
|             rtxn: &self.wtxn, | ||||
|             index: self.index, | ||||
|             log_every_n: self.log_every_n, | ||||
|             chunk_compression_type: self.chunk_compression_type, | ||||
|             chunk_compression_level: self.chunk_compression_level, | ||||
|             chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, | ||||
|             max_nb_chunks: self.max_nb_chunks, | ||||
|             max_memory: self.max_memory, | ||||
|             index_documents_method: self.update_method, | ||||
| @@ -374,12 +181,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         Ok(DocumentAdditionResult { nb_documents }) | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("IndexDocuments::{}")] | ||||
|     pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()> | ||||
|     where | ||||
|         F: Fn(UpdateIndexingStep) + Sync, | ||||
|     { | ||||
|         let before_indexing = Instant::now(); | ||||
|  | ||||
|         let TransformOutput { | ||||
|             primary_key, | ||||
|             fields_ids_map, | ||||
| @@ -395,6 +201,78 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         // up to date field map. | ||||
|         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||
|  | ||||
|         let backup_pool; | ||||
|         let pool = match self.thread_pool { | ||||
|             Some(pool) => pool, | ||||
|             #[cfg(not(test))] | ||||
|             None => { | ||||
|                 // We initialize a backup pool with the default | ||||
|                 // settings if none have already been set. | ||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().build()?; | ||||
|                 &backup_pool | ||||
|             } | ||||
|             #[cfg(test)] | ||||
|             None => { | ||||
|                 // We initialize a backup pool with the default | ||||
|                 // settings if none have already been set. | ||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?; | ||||
|                 &backup_pool | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         let documents_file = grenad::Reader::new(documents_file)?; | ||||
|  | ||||
|         // create LMDB writer channel | ||||
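|         // The extraction threads send the typed chunks they produce through this | ||||
|         // channel while this thread, the only one holding the LMDB write transaction, | ||||
|         // drains it below and writes every chunk into the index. The channel is | ||||
|         // unbounded so the extractors never block on a full channel. | ||||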
|         let (lmdb_writer_sx, lmdb_writer_rx): ( | ||||
|             Sender<Result<TypedChunk>>, | ||||
|             Receiver<Result<TypedChunk>>, | ||||
|         ) = crossbeam_channel::unbounded(); | ||||
|  | ||||
|         // get searchable fields for word databases | ||||
|         let searchable_fields = | ||||
|             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); | ||||
|         // get filterable fields for facet databases | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|  | ||||
|         let stop_words = self.index.stop_words(self.wtxn)?; | ||||
|         // let stop_words = stop_words.as_ref(); | ||||
|  | ||||
|         // Run extraction pipeline in parallel. | ||||
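|         // The documents are first split into chunks, then every chunk is processed | ||||
|         // by the extractors on the rayon thread pool. | ||||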
|         pool.install(|| { | ||||
|             let params = GrenadParameters { | ||||
|                 chunk_compression_type: self.chunk_compression_type, | ||||
|                 chunk_compression_level: self.chunk_compression_level, | ||||
|                 max_memory: self.max_memory, | ||||
|                 max_nb_chunks: self.max_nb_chunks, // defaults to None, a value may be chosen by the caller. | ||||
|             }; | ||||
|  | ||||
|             // split the obkv file into several chunks | ||||
|             let chunk_iter = grenad_obkv_into_chunks( | ||||
|                 documents_file, | ||||
|                 params.clone(), | ||||
|                 self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB | ||||
|             ); | ||||
|  | ||||
|             let result = chunk_iter.map(|chunk_iter| { | ||||
|                 // extract all databases from the chunked obkv documents | ||||
|                 extract::data_from_obkv_documents( | ||||
|                     chunk_iter, | ||||
|                     params, | ||||
|                     lmdb_writer_sx.clone(), | ||||
|                     searchable_fields, | ||||
|                     faceted_fields, | ||||
|                     stop_words, | ||||
|                 ) | ||||
|             }); | ||||
|  | ||||
|             if let Err(e) = result { | ||||
|                 let _ = lmdb_writer_sx.send(Err(e)); | ||||
|             } | ||||
|  | ||||
|             // the sender needs to be dropped, otherwise the receiving loop below would wait forever. | ||||
|             drop(lmdb_writer_sx) | ||||
|         }); | ||||
|  | ||||
|         // We delete the documents that this document addition replaces. This way we are | ||||
|         // able to simply insert all the documents even if they already exist in the database. | ||||
|         if !replaced_documents_ids.is_empty() { | ||||
| @@ -402,10 +280,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                 log_every_n: self.log_every_n, | ||||
|                 max_nb_chunks: self.max_nb_chunks, | ||||
|                 max_memory: self.max_memory, | ||||
|                 linked_hash_map_size: self.linked_hash_map_size, | ||||
|                 documents_chunk_size: self.documents_chunk_size, | ||||
|                 chunk_compression_type: self.chunk_compression_type, | ||||
|                 chunk_compression_level: self.chunk_compression_level, | ||||
|                 chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, | ||||
|                 thread_pool: self.thread_pool, | ||||
|                 update_id: self.update_id, | ||||
|             }; | ||||
| @@ -416,189 +293,39 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             debug!("{} documents actually deleted", deleted_documents_count); | ||||
|         } | ||||
|  | ||||
|         if documents_count == 0 { | ||||
|             return Ok(()); | ||||
|         } | ||||
|         let index_documents_ids = self.index.documents_ids(self.wtxn)?; | ||||
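|         // Whether the typed chunks can simply be appended to the LMDB databases or | ||||
|         // must be merged with the entries that are already there. | ||||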
|         let index_is_empty = index_documents_ids.len() == 0; | ||||
|         let mut final_documents_ids = RoaringBitmap::new(); | ||||
|  | ||||
|         let bytes = unsafe { Mmap::map(&documents_file)? }; | ||||
|         let documents = grenad::Reader::new(bytes.as_bytes()).unwrap(); | ||||
|         let mut databases_seen = 0; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         // The enum which indicates the type of the reader | ||||
|         // merges that are potentially done on different threads. | ||||
|         enum DatabaseType { | ||||
|             Main, | ||||
|             WordDocids, | ||||
|             WordLevel0PositionDocids, | ||||
|             FieldIdWordCountDocids, | ||||
|             FacetLevel0NumbersDocids, | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|         let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? { | ||||
|             Some(fields) => fields.iter().copied().collect(), | ||||
|             None => fields_ids_map.iter().map(|(id, _name)| id).collect(), | ||||
|         }; | ||||
|  | ||||
|         let stop_words = self.index.stop_words(self.wtxn)?; | ||||
|         let stop_words = stop_words.as_ref(); | ||||
|         let linked_hash_map_size = self.linked_hash_map_size; | ||||
|         let max_nb_chunks = self.max_nb_chunks; | ||||
|         let max_memory = self.max_memory; | ||||
|         let chunk_compression_type = self.chunk_compression_type; | ||||
|         let chunk_compression_level = self.chunk_compression_level; | ||||
|         let log_every_n = self.log_every_n; | ||||
|         let chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|  | ||||
|         let backup_pool; | ||||
|         let pool = match self.thread_pool { | ||||
|             Some(pool) => pool, | ||||
|             None => { | ||||
|                 // We initialize a bakcup pool with the default | ||||
|                 // settings if none have already been set. | ||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().build()?; | ||||
|                 &backup_pool | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         let readers = pool.install(|| { | ||||
|             let num_threads = rayon::current_num_threads(); | ||||
|             let max_memory_by_job = max_memory.map(|mm| mm / num_threads); | ||||
|  | ||||
|             let readers = rayon::iter::repeatn(documents, num_threads) | ||||
|                 .enumerate() | ||||
|                 .map(|(i, documents)| { | ||||
|                     let store = Store::new( | ||||
|                         searchable_fields.clone(), | ||||
|                         faceted_fields.clone(), | ||||
|                         linked_hash_map_size, | ||||
|                         max_nb_chunks, | ||||
|                         max_memory_by_job, | ||||
|                         chunk_compression_type, | ||||
|                         chunk_compression_level, | ||||
|                         chunk_fusing_shrink_size, | ||||
|                         stop_words, | ||||
|                     )?; | ||||
|                     store.index( | ||||
|                         documents, | ||||
|                         documents_count, | ||||
|                         i, | ||||
|                         num_threads, | ||||
|                         log_every_n, | ||||
|                         &progress_callback, | ||||
|                     ) | ||||
|                 }) | ||||
|                 .collect::<StdResult<Vec<_>, _>>()?; | ||||
|  | ||||
|             let mut main_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut word_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut documents_readers = Vec::with_capacity(readers.len()); | ||||
|             readers.into_iter().for_each(|readers| { | ||||
|                 let Readers { | ||||
|                     main, | ||||
|                     word_docids, | ||||
|                     docid_word_positions, | ||||
|                     words_pairs_proximities_docids, | ||||
|                     word_level_position_docids, | ||||
|                     field_id_word_count_docids, | ||||
|                     facet_field_numbers_docids, | ||||
|                     facet_field_strings_docids, | ||||
|                     field_id_docid_facet_numbers, | ||||
|                     field_id_docid_facet_strings, | ||||
|                     documents, | ||||
|                 } = readers; | ||||
|                 main_readers.push(main); | ||||
|                 word_docids_readers.push(word_docids); | ||||
|                 docid_word_positions_readers.push(docid_word_positions); | ||||
|                 words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); | ||||
|                 word_level_position_docids_readers.push(word_level_position_docids); | ||||
|                 field_id_word_count_docids_readers.push(field_id_word_count_docids); | ||||
|                 facet_field_numbers_docids_readers.push(facet_field_numbers_docids); | ||||
|                 facet_field_strings_docids_readers.push(facet_field_strings_docids); | ||||
|                 field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); | ||||
|                 field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); | ||||
|                 documents_readers.push(documents); | ||||
|             }); | ||||
|  | ||||
|             // This is the function that merges the readers | ||||
|             // by using the given merge function. | ||||
|             let merge_readers = move |readers, merge| { | ||||
|                 let mut writer = tempfile::tempfile().and_then(|f| { | ||||
|                     create_writer(chunk_compression_type, chunk_compression_level, f) | ||||
|                 })?; | ||||
|                 let merger = merge_readers(readers, merge); | ||||
|                 merger.write_into(&mut writer)?; | ||||
|                 writer_into_reader(writer, chunk_fusing_shrink_size) | ||||
|             }; | ||||
|  | ||||
|             // The enum and the channel which are used to transfer | ||||
|             // the reader merges potentially done on another thread. | ||||
|             let (sender, receiver) = sync_channel(2); | ||||
|  | ||||
|             debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); | ||||
|             rayon::spawn(move || { | ||||
|                 vec![ | ||||
|                     (DatabaseType::Main, main_readers, fst_merge as MergeFn<_>), | ||||
|                     (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), | ||||
|                     ( | ||||
|                         DatabaseType::FacetLevel0NumbersDocids, | ||||
|                         facet_field_numbers_docids_readers, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                     ), | ||||
|                     ( | ||||
|                         DatabaseType::WordLevel0PositionDocids, | ||||
|                         word_level_position_docids_readers, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                     ), | ||||
|                     ( | ||||
|                         DatabaseType::FieldIdWordCountDocids, | ||||
|                         field_id_word_count_docids_readers, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                     ), | ||||
|                 ] | ||||
|                 .into_par_iter() | ||||
|                 .for_each(|(dbtype, readers, merge)| { | ||||
|                     let result = merge_readers(readers, merge); | ||||
|                     if let Err(e) = sender.send((dbtype, result)) { | ||||
|                         error!("sender error: {}", e); | ||||
|                     } | ||||
|         for typed_chunk in lmdb_writer_rx { | ||||
|             let (docids, is_merged_database) = | ||||
|                 write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?; | ||||
|             if !docids.is_empty() { | ||||
|                 final_documents_ids |= docids; | ||||
|                 let documents_seen_count = final_documents_ids.len(); | ||||
|                 progress_callback(UpdateIndexingStep::IndexDocuments { | ||||
|                     documents_seen: documents_seen_count as usize, | ||||
|                     total_documents: documents_count, | ||||
|                 }); | ||||
|             }); | ||||
|  | ||||
|             Ok(( | ||||
|                 receiver, | ||||
|                 docid_word_positions_readers, | ||||
|                 documents_readers, | ||||
|                 words_pairs_proximities_docids_readers, | ||||
|                 facet_field_strings_docids_readers, | ||||
|                 field_id_docid_facet_numbers_readers, | ||||
|                 field_id_docid_facet_strings_readers, | ||||
|             )) as Result<_> | ||||
|         })?; | ||||
|  | ||||
|         let ( | ||||
|             receiver, | ||||
|             docid_word_positions_readers, | ||||
|             documents_readers, | ||||
|             words_pairs_proximities_docids_readers, | ||||
|             facet_field_strings_docids_readers, | ||||
|             field_id_docid_facet_numbers_readers, | ||||
|             field_id_docid_facet_strings_readers, | ||||
|         ) = readers; | ||||
|  | ||||
|         let mut documents_ids = self.index.documents_ids(self.wtxn)?; | ||||
|         let contains_documents = !documents_ids.is_empty(); | ||||
|         let write_method = | ||||
|             if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; | ||||
|  | ||||
|         debug!("Writing using the write method: {:?}", write_method); | ||||
|                 debug!( | ||||
|                     "We have seen {} documents out of {} total documents so far", | ||||
|                     documents_seen_count, documents_count | ||||
|                 ); | ||||
|             } | ||||
|             if is_merged_database { | ||||
|                 databases_seen += 1; | ||||
|                 progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|                     databases_seen: databases_seen, | ||||
|                     total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|                 }); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // We write the field distribution into the main database | ||||
|         self.index.put_field_distribution(self.wtxn, &field_distribution)?; | ||||
| @@ -609,180 +336,24 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         // We write the external documents ids into the main database. | ||||
|         self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; | ||||
|  | ||||
|         // We merge the new documents ids with the existing ones. | ||||
|         documents_ids |= new_documents_ids; | ||||
|         documents_ids |= replaced_documents_ids; | ||||
|         self.index.put_documents_ids(self.wtxn, &documents_ids)?; | ||||
|         let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; | ||||
|         self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; | ||||
|  | ||||
|         let mut database_count = 0; | ||||
|         let total_databases = 11; | ||||
|         self.execute_prefix_databases(progress_callback) | ||||
|     } | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: 0, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Inserting the docid word positions into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.docid_word_positions.as_polymorph(), | ||||
|             docid_word_positions_readers, | ||||
|             keep_first, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Inserting the documents into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.documents.as_polymorph(), | ||||
|             documents_readers, | ||||
|             keep_first, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the facet id string docids into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.facet_id_string_docids.as_polymorph(), | ||||
|             facet_field_strings_docids_readers, | ||||
|             tuple_string_cbo_roaring_bitmap_merge, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the field id docid facet numbers into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.field_id_docid_facet_f64s.as_polymorph(), | ||||
|             field_id_docid_facet_numbers_readers, | ||||
|             keep_first, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the field id docid facet strings into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.field_id_docid_facet_strings.as_polymorph(), | ||||
|             field_id_docid_facet_strings_readers, | ||||
|             keep_first, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the words pairs proximities docids into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_pair_proximity_docids.as_polymorph(), | ||||
|             words_pairs_proximities_docids_readers, | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         for (db_type, result) in receiver { | ||||
|             let content = result?; | ||||
|             match db_type { | ||||
|                 DatabaseType::Main => { | ||||
|                     debug!("Writing the main elements into LMDB on disk..."); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         self.index.main, | ||||
|                         content, | ||||
|                         fst_merge, | ||||
|                         WriteMethod::GetMergePut, | ||||
|                     )?; | ||||
|                 } | ||||
|                 DatabaseType::WordDocids => { | ||||
|                     debug!("Writing the words docids into LMDB on disk..."); | ||||
|                     let db = *self.index.word_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         roaring_bitmap_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 } | ||||
|                 DatabaseType::FacetLevel0NumbersDocids => { | ||||
|                     debug!("Writing the facet numbers docids into LMDB on disk..."); | ||||
|                     let db = *self.index.facet_id_f64_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 } | ||||
|                 DatabaseType::FieldIdWordCountDocids => { | ||||
|                     debug!("Writing the field id word count docids into LMDB on disk..."); | ||||
|                     let db = *self.index.field_id_word_count_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 } | ||||
|                 DatabaseType::WordLevel0PositionDocids => { | ||||
|                     debug!("Writing the word level 0 positions docids into LMDB on disk..."); | ||||
|                     let db = *self.index.word_level_position_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         cbo_roaring_bitmap_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             database_count += 1; | ||||
|             progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|                 databases_seen: database_count, | ||||
|                 total_databases, | ||||
|             }); | ||||
|         } | ||||
|     #[logging_timer::time("IndexDocuments::{}")] | ||||
|     pub fn execute_prefix_databases<F>(self, progress_callback: F) -> Result<()> | ||||
|     where | ||||
|         F: Fn(UpdateIndexingStep) + Sync, | ||||
|     { | ||||
|         // Merged databases have already been indexed; we start from this count. | ||||
|         let mut databases_seen = MERGED_DATABASE_COUNT; | ||||
|  | ||||
|         // Run the facets update operation. | ||||
|         let mut builder = Facets::new(self.wtxn, self.index, self.update_id); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         if let Some(value) = self.facet_level_group_size { | ||||
|             builder.level_group_size(value); | ||||
|         } | ||||
| @@ -791,6 +362,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         databases_seen += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         // Run the words prefixes update operation. | ||||
|         let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); | ||||
|         if let Some(value) = self.words_prefix_threshold { | ||||
| @@ -801,29 +378,44 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         databases_seen += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         // Run the word prefix docids update operation. | ||||
|         let mut builder = WordPrefixDocids::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.execute()?; | ||||
|  | ||||
|         databases_seen += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         // Run the word prefix pair proximity docids update operation. | ||||
|         let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.execute()?; | ||||
|  | ||||
|         databases_seen += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         // Run the words level positions update operation. | ||||
|         let mut builder = WordsLevelPositions::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         if let Some(value) = self.words_positions_level_group_size { | ||||
|             builder.level_group_size(value); | ||||
|         } | ||||
| @@ -832,9 +424,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         debug_assert_eq!(database_count, total_databases); | ||||
|  | ||||
|         info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); | ||||
|         databases_seen += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: databases_seen, | ||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|         }); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|   | ||||
| @@ -1,985 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashMap, HashSet}; | ||||
| use std::convert::{TryFrom, TryInto}; | ||||
| use std::fs::File; | ||||
| use std::iter::FromIterator; | ||||
| use std::time::Instant; | ||||
| use std::{cmp, iter}; | ||||
|  | ||||
| use bstr::ByteSlice as _; | ||||
| use concat_arrays::concat_arrays; | ||||
| use fst::Set; | ||||
| use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; | ||||
| use heed::BytesEncode; | ||||
| use linked_hash_map::LinkedHashMap; | ||||
| use log::{debug, info, warn}; | ||||
| use meilisearch_tokenizer::token::SeparatorKind; | ||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; | ||||
| use ordered_float::OrderedFloat; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
| use tempfile::tempfile; | ||||
|  | ||||
| use super::merge_function::{ | ||||
|     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, | ||||
|     tuple_string_cbo_roaring_bitmap_merge, | ||||
| }; | ||||
| use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; | ||||
| use crate::error::{Error, InternalError, SerializationError}; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, | ||||
|     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, | ||||
| }; | ||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| use crate::update::UpdateIndexingStep; | ||||
| use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; | ||||
|  | ||||
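| // LMDB refuses keys longer than 511 bytes by default (MDB_MAXKEYSIZE), hence the | ||||
| // `lmdb_key_valid_size` checks before the insertions below. | ||||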
| const LMDB_MAX_KEY_LENGTH: usize = 511; | ||||
| const ONE_KILOBYTE: usize = 1024 * 1024; | ||||
|  | ||||
| const MAX_POSITION: usize = 1000; | ||||
| const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes(); | ||||
|  | ||||
| pub struct Readers { | ||||
|     pub main: Reader<FileFuse>, | ||||
|     pub word_docids: Reader<FileFuse>, | ||||
|     pub docid_word_positions: Reader<FileFuse>, | ||||
|     pub words_pairs_proximities_docids: Reader<FileFuse>, | ||||
|     pub word_level_position_docids: Reader<FileFuse>, | ||||
|     pub field_id_word_count_docids: Reader<FileFuse>, | ||||
|     pub facet_field_numbers_docids: Reader<FileFuse>, | ||||
|     pub facet_field_strings_docids: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_numbers: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_strings: Reader<FileFuse>, | ||||
|     pub documents: Reader<FileFuse>, | ||||
| } | ||||
|  | ||||
| pub struct Store<'s, A> { | ||||
|     // Indexing parameters | ||||
|     searchable_fields: HashSet<FieldId>, | ||||
|     filterable_fields: HashSet<FieldId>, | ||||
|     // Caches | ||||
|     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, | ||||
|     word_docids_limit: usize, | ||||
|     field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, | ||||
|     words_pairs_proximities_docids: | ||||
|         LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, | ||||
|     words_pairs_proximities_docids_limit: usize, | ||||
|     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, | ||||
|     facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, | ||||
|     facet_field_value_docids_limit: usize, | ||||
|     // MTBL parameters | ||||
|     chunk_compression_type: CompressionType, | ||||
|     chunk_compression_level: Option<u32>, | ||||
|     chunk_fusing_shrink_size: Option<u64>, | ||||
|     // MTBL sorters | ||||
|     main_sorter: Sorter<MergeFn<Error>>, | ||||
|     word_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     words_pairs_proximities_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     word_level_position_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     field_id_word_count_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     facet_field_numbers_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     facet_field_strings_docids_sorter: Sorter<MergeFn<Error>>, | ||||
|     field_id_docid_facet_numbers_sorter: Sorter<MergeFn<Error>>, | ||||
|     field_id_docid_facet_strings_sorter: Sorter<MergeFn<Error>>, | ||||
|     // MTBL writers | ||||
|     docid_word_positions_writer: Writer<File>, | ||||
|     documents_writer: Writer<File>, | ||||
|     // tokenizer | ||||
|     analyzer: Analyzer<'s, A>, | ||||
| } | ||||
|  | ||||
| impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|     pub fn new( | ||||
|         searchable_fields: HashSet<FieldId>, | ||||
|         filterable_fields: HashSet<FieldId>, | ||||
|         linked_hash_map_size: Option<usize>, | ||||
|         max_nb_chunks: Option<usize>, | ||||
|         max_memory: Option<usize>, | ||||
|         chunk_compression_type: CompressionType, | ||||
|         chunk_compression_level: Option<u32>, | ||||
|         chunk_fusing_shrink_size: Option<u64>, | ||||
|         stop_words: Option<&'s Set<A>>, | ||||
|     ) -> Result<Self> { | ||||
|         // We divide the max memory by the number of sorters the Store has. | ||||
|         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); | ||||
|         let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); | ||||
|  | ||||
|         let main_sorter = create_sorter( | ||||
|             fst_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let word_docids_sorter = create_sorter( | ||||
|             roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let words_pairs_proximities_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let word_level_position_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let field_id_word_count_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let facet_field_numbers_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let facet_field_strings_docids_sorter = create_sorter( | ||||
|             tuple_string_cbo_roaring_bitmap_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let field_id_docid_facet_numbers_sorter = create_sorter( | ||||
|             keep_first, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             Some(1024 * 1024 * 1024), // 1GiB | ||||
|         ); | ||||
|         let field_id_docid_facet_strings_sorter = create_sorter( | ||||
|             keep_first, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             Some(1024 * 1024 * 1024), // 1GiB | ||||
|         ); | ||||
|  | ||||
|         let documents_writer = tempfile() | ||||
|             .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; | ||||
|         let docid_word_positions_writer = tempfile() | ||||
|             .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; | ||||
|  | ||||
|         let mut config = AnalyzerConfig::default(); | ||||
|         if let Some(stop_words) = stop_words { | ||||
|             config.stop_words(stop_words); | ||||
|         } | ||||
|         let analyzer = Analyzer::new(config); | ||||
|  | ||||
|         Ok(Store { | ||||
|             // Indexing parameters. | ||||
|             searchable_fields, | ||||
|             filterable_fields, | ||||
|             // Caches | ||||
|             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             field_id_word_count_docids: HashMap::new(), | ||||
|             word_docids_limit: linked_hash_map_size, | ||||
|             words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             words_pairs_proximities_docids_limit: linked_hash_map_size, | ||||
|             facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             facet_field_value_docids_limit: linked_hash_map_size, | ||||
|             // MTBL parameters | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             // MTBL sorters | ||||
|             main_sorter, | ||||
|             word_docids_sorter, | ||||
|             words_pairs_proximities_docids_sorter, | ||||
|             word_level_position_docids_sorter, | ||||
|             field_id_word_count_docids_sorter, | ||||
|             facet_field_numbers_docids_sorter, | ||||
|             facet_field_strings_docids_sorter, | ||||
|             field_id_docid_facet_numbers_sorter, | ||||
|             field_id_docid_facet_strings_sorter, | ||||
|             // MTBL writers | ||||
|             docid_word_positions_writer, | ||||
|             documents_writer, | ||||
|             // tokenizer | ||||
|             analyzer, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     // Save the document ids under the words we have seen them in. | ||||
|     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { | ||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||
|         match self.word_docids.get_refresh(word.as_bytes()) { | ||||
|             Some(old) => { | ||||
|                 old.insert(id); | ||||
|             } | ||||
|             None => { | ||||
|                 let word_vec = SmallVec32::from(word.as_bytes()); | ||||
|                 // A newly inserted element is appended at the end of the linked hash map. | ||||
|                 self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id))); | ||||
|                 // If the word docids map just reached its capacity we must make sure to remove | ||||
|                 // one element, this way the next time we insert we don't grow the capacity. | ||||
|                 if self.word_docids.len() == self.word_docids_limit { | ||||
|                     // Removing the front element is equivalent to removing the LRU element. | ||||
|                     let lru = self.word_docids.pop_front(); | ||||
|                     Self::write_word_docids(&mut self.word_docids_sorter, lru)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn insert_facet_number_values_docid( | ||||
|         &mut self, | ||||
|         field_id: FieldId, | ||||
|         value: OrderedFloat<f64>, | ||||
|         id: DocumentId, | ||||
|     ) -> Result<()> { | ||||
|         let sorter = &mut self.field_id_docid_facet_numbers_sorter; | ||||
|         Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; | ||||
|  | ||||
|         let key = (field_id, value); | ||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||
|         match self.facet_field_number_docids.get_refresh(&key) { | ||||
|             Some(old) => { | ||||
|                 old.insert(id); | ||||
|             } | ||||
|             None => { | ||||
|                 // A newly inserted element is appended at the end of the linked hash map. | ||||
|                 self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||
|                 // If the facet number docids map just reached its capacity we must make sure to | ||||
|                 // remove one element, this way the next time we insert we don't grow the capacity. | ||||
|                 if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit { | ||||
|                     // Removing the front element is equivalent to removing the LRU element. | ||||
|                     Self::write_facet_field_number_docids( | ||||
|                         &mut self.facet_field_numbers_docids_sorter, | ||||
|                         self.facet_field_number_docids.pop_front(), | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // Save the document ids under the facet field id and value we have seen them in. | ||||
|     fn insert_facet_string_values_docid( | ||||
|         &mut self, | ||||
|         field_id: FieldId, | ||||
|         normalized_value: String, | ||||
|         original_value: String, | ||||
|         id: DocumentId, | ||||
|     ) -> Result<()> { | ||||
|         if normalized_value.is_empty() { | ||||
|             return Ok(()); | ||||
|         } | ||||
|  | ||||
|         let sorter = &mut self.field_id_docid_facet_strings_sorter; | ||||
|         Self::write_field_id_docid_facet_string_value( | ||||
|             sorter, | ||||
|             field_id, | ||||
|             id, | ||||
|             &normalized_value, | ||||
|             &original_value, | ||||
|         )?; | ||||
|  | ||||
|         let key = (field_id, normalized_value); | ||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||
|         match self.facet_field_string_docids.get_refresh(&key) { | ||||
|             Some((_original_value, old)) => { | ||||
|                 old.insert(id); | ||||
|             } | ||||
|             None => { | ||||
|                 // A newly inserted element is appended at the end of the linked hash map. | ||||
|                 self.facet_field_string_docids | ||||
|                     .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); | ||||
|                 // If the facet string docids map just reached its capacity we must make sure to | ||||
|                 // remove one element, this way the next time we insert we don't grow the capacity. | ||||
|                 if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { | ||||
|                     // Removing the front element is equivalent to removing the LRU element. | ||||
|                     Self::write_facet_field_string_docids( | ||||
|                         &mut self.facet_field_strings_docids_sorter, | ||||
|                         self.facet_field_string_docids.pop_front(), | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // Save the document id under the word pair proximities the document contains. | ||||
|     fn insert_words_pairs_proximities_docids<'a>( | ||||
|         &mut self, | ||||
|         words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>, | ||||
|         id: DocumentId, | ||||
|     ) -> Result<()> { | ||||
|         for ((w1, w2), prox) in words_pairs_proximities { | ||||
|             let w1 = SmallVec32::from(w1.as_bytes()); | ||||
|             let w2 = SmallVec32::from(w2.as_bytes()); | ||||
|             let key = (w1, w2, prox); | ||||
|             // if get_refresh finds the element it is assured | ||||
|             // to be at the end of the linked hash map. | ||||
|             match self.words_pairs_proximities_docids.get_refresh(&key) { | ||||
|                 Some(old) => { | ||||
|                     old.insert(id); | ||||
|                 } | ||||
|                 None => { | ||||
|                     // A newly inserted element is appended at the end of the linked hash map. | ||||
|                     let ids = RoaringBitmap::from_iter(Some(id)); | ||||
|                     self.words_pairs_proximities_docids.insert(key, ids); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // If the linked hashmap is over capacity we must remove the overflowing elements. | ||||
|         let len = self.words_pairs_proximities_docids.len(); | ||||
|         let overflow = len.checked_sub(self.words_pairs_proximities_docids_limit); | ||||
|         if let Some(overflow) = overflow { | ||||
|             let mut lrus = Vec::with_capacity(overflow); | ||||
|             // Removing front elements is equivalent to removing the LRUs. | ||||
|             let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); | ||||
|             iter.take(overflow).for_each(|x| lrus.push(x)); | ||||
|             Self::write_words_pairs_proximities( | ||||
|                 &mut self.words_pairs_proximities_docids_sorter, | ||||
|                 lrus, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_document( | ||||
|         &mut self, | ||||
|         document_id: DocumentId, | ||||
|         words_positions: &mut HashMap<String, SmallVec32<Position>>, | ||||
|         facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, | ||||
|         facet_strings_values: &mut HashMap<FieldId, Vec<(String, String)>>, | ||||
|         record: &[u8], | ||||
|     ) -> Result<()> { | ||||
|         // We compute the list of words pairs proximities (self-join) and write it directly to disk. | ||||
|         let words_pair_proximities = compute_words_pair_proximities(&words_positions); | ||||
|         self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; | ||||
|  | ||||
|         // We store document_id associated with all the words the record contains. | ||||
|         for (word, _) in words_positions.iter() { | ||||
|             self.insert_word_docid(word, document_id)?; | ||||
|         } | ||||
|  | ||||
|         self.documents_writer.insert(document_id.to_be_bytes(), record)?; | ||||
|         Self::write_docid_word_positions( | ||||
|             &mut self.docid_word_positions_writer, | ||||
|             document_id, | ||||
|             words_positions, | ||||
|         )?; | ||||
|         Self::write_word_position_docids( | ||||
|             &mut self.word_level_position_docids_sorter, | ||||
|             document_id, | ||||
|             words_positions, | ||||
|         )?; | ||||
|  | ||||
|         words_positions.clear(); | ||||
|  | ||||
|         // We store document_id associated with all the facet numbers fields ids and values. | ||||
|         for (field, values) in facet_numbers_values.drain() { | ||||
|             for value in values { | ||||
|                 let value = OrderedFloat::from(value); | ||||
|                 self.insert_facet_number_values_docid(field, value, document_id)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // We store document_id associated with all the facet strings fields ids and values. | ||||
|         for (field, values) in facet_strings_values.drain() { | ||||
|             for (normalized, original) in values { | ||||
|                 self.insert_facet_string_values_docid(field, normalized, original, document_id)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_words_pairs_proximities<E>( | ||||
|         sorter: &mut Sorter<MergeFn<E>>, | ||||
|         iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>, | ||||
|     ) -> Result<()> | ||||
|     where | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut key = Vec::new(); | ||||
|         let mut buffer = Vec::new(); | ||||
|  | ||||
|         for ((w1, w2, min_prox), docids) in iter { | ||||
|             key.clear(); | ||||
|             key.extend_from_slice(w1.as_bytes()); | ||||
|             key.push(0); | ||||
|             key.extend_from_slice(w2.as_bytes()); | ||||
|             // Storing the minimum proximity found between those words | ||||
|             key.push(min_prox); | ||||
|             // We serialize the document ids into a buffer | ||||
|             buffer.clear(); | ||||
|             buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); | ||||
|             CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer); | ||||
|             // that we write under the generated key into MTBL | ||||
|             if lmdb_key_valid_size(&key) { | ||||
|                 sorter.insert(&key, &buffer)?; | ||||
|             } else { | ||||
|                 warn!( | ||||
|                     "words pairs proximity ({:?} - {:?}, {:?}) is too large to be saved", | ||||
|                     w1, w2, min_prox | ||||
|                 ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_docid_word_positions( | ||||
|         writer: &mut Writer<File>, | ||||
|         id: DocumentId, | ||||
|         words_positions: &HashMap<String, SmallVec32<Position>>, | ||||
|     ) -> Result<()> { | ||||
|         // We prefix the words by the document id. | ||||
|         let mut key = id.to_be_bytes().to_vec(); | ||||
|         let mut buffer = Vec::new(); | ||||
|         let base_size = key.len(); | ||||
|  | ||||
|         // We order the words lexicographically, this way we avoid having to go through a sorter. | ||||
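|         // (the grenad writer below expects its keys in ascending order) | ||||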
|         let words_positions = BTreeMap::from_iter(words_positions); | ||||
|  | ||||
|         for (word, positions) in words_positions { | ||||
|             key.truncate(base_size); | ||||
|             key.extend_from_slice(word.as_bytes()); | ||||
|             buffer.clear(); | ||||
|  | ||||
|             // We serialize the positions into a buffer. | ||||
|             let positions = RoaringBitmap::from_iter(positions.iter().cloned()); | ||||
|             BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer); | ||||
|  | ||||
|             // that we write under the generated key into MTBL | ||||
|             if lmdb_key_valid_size(&key) { | ||||
|                 writer.insert(&key, &buffer)?; | ||||
|             } else { | ||||
|                 warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_word_position_docids<E>( | ||||
|         writer: &mut Sorter<MergeFn<E>>, | ||||
|         document_id: DocumentId, | ||||
|         words_positions: &HashMap<String, SmallVec32<Position>>, | ||||
|     ) -> Result<()> | ||||
|     where | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut key_buffer = Vec::new(); | ||||
|         let mut data_buffer = Vec::new(); | ||||
|  | ||||
|         for (word, positions) in words_positions { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word.as_bytes()); | ||||
|             key_buffer.push(0); // level 0 | ||||
|  | ||||
|             for position in positions { | ||||
|                 key_buffer.truncate(word.len() + 1); | ||||
|                 let position_bytes = position.to_be_bytes(); | ||||
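|                 // The key encodes a position range (left and right bounds); at level 0 | ||||
|                 // both bounds are the same position, hence it is written twice. | ||||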
|                 key_buffer.extend_from_slice(position_bytes.as_bytes()); | ||||
|                 key_buffer.extend_from_slice(position_bytes.as_bytes()); | ||||
|  | ||||
|                 data_buffer.clear(); | ||||
|                 let docids = RoaringBitmap::from_iter(Some(document_id)); | ||||
|                 // We serialize the document id as a bitmap into a buffer. | ||||
|                 CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); | ||||
|  | ||||
|                 // that we write under the generated key into MTBL | ||||
|                 if lmdb_key_valid_size(&key_buffer) { | ||||
|                     writer.insert(&key_buffer, &data_buffer)?; | ||||
|                 } else { | ||||
|                     warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||
|     where | ||||
|         I: IntoIterator<Item = ((FieldId, String), (String, RoaringBitmap))>, | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut key_buffer = Vec::new(); | ||||
|  | ||||
|         for ((field_id, normalized_value), (original_value, docids)) in iter { | ||||
|             key_buffer.clear(); | ||||
|  | ||||
|             FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); | ||||
|  | ||||
|             let data = (original_value.as_str(), docids); | ||||
|             let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data) | ||||
|                 .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; | ||||
|  | ||||
|             if lmdb_key_valid_size(&key_buffer) { | ||||
|                 sorter.insert(&key_buffer, &data)?; | ||||
|             } else { | ||||
|                 warn!( | ||||
|                     "facet value {:?} is too large to be saved", | ||||
|                     original_value.as_bytes().as_bstr() | ||||
|                 ); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||
|     where | ||||
|         I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>, | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut data_buffer = Vec::new(); | ||||
|  | ||||
|         for ((field_id, value), docids) in iter { | ||||
|             data_buffer.clear(); | ||||
|  | ||||
|             let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) | ||||
|                 .map(Cow::into_owned) | ||||
|                 .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; | ||||
|  | ||||
|             CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); | ||||
|  | ||||
|             if lmdb_key_valid_size(&key) { | ||||
|                 sorter.insert(&key, &data_buffer)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_field_id_docid_facet_number_value<E>( | ||||
|         sorter: &mut Sorter<MergeFn<E>>, | ||||
|         field_id: FieldId, | ||||
|         document_id: DocumentId, | ||||
|         value: OrderedFloat<f64>, | ||||
|     ) -> Result<()> | ||||
|     where | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) | ||||
|             .map(Cow::into_owned) | ||||
|             .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; | ||||
|  | ||||
|         if lmdb_key_valid_size(&key) { | ||||
|             sorter.insert(&key, &[])?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_field_id_docid_facet_string_value<E>( | ||||
|         sorter: &mut Sorter<MergeFn<E>>, | ||||
|         field_id: FieldId, | ||||
|         document_id: DocumentId, | ||||
|         normalized_value: &str, | ||||
|         original_value: &str, | ||||
|     ) -> Result<()> | ||||
|     where | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut buffer = Vec::new(); | ||||
|         FieldDocIdFacetStringCodec::serialize_into( | ||||
|             field_id, | ||||
|             document_id, | ||||
|             normalized_value, | ||||
|             &mut buffer, | ||||
|         ); | ||||
|  | ||||
|         if lmdb_key_valid_size(&buffer) { | ||||
|             sorter.insert(&buffer, original_value.as_bytes())?; | ||||
|         } else { | ||||
|             warn!("facet value {:?} is too large to be saved", original_value.as_bytes().as_bstr()); | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()> | ||||
|     where | ||||
|         I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>, | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let mut key = Vec::new(); | ||||
|         let mut buffer = Vec::new(); | ||||
|  | ||||
|         for (word, ids) in iter { | ||||
|             key.clear(); | ||||
|             key.extend_from_slice(&word); | ||||
|             // We serialize the document ids into a buffer | ||||
|             buffer.clear(); | ||||
|             let ids = RoaringBitmap::from_iter(ids); | ||||
|             buffer.reserve(ids.serialized_size()); | ||||
|             ids.serialize_into(&mut buffer)?; | ||||
|             // that we write under the generated key into the sorter | ||||
|             if lmdb_key_valid_size(&key) { | ||||
|                 sorter.insert(&key, &buffer)?; | ||||
|             } else { | ||||
|                 warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     pub fn index<F>( | ||||
|         mut self, | ||||
|         mut documents: grenad::Reader<&[u8]>, | ||||
|         documents_count: usize, | ||||
|         thread_index: usize, | ||||
|         num_threads: usize, | ||||
|         log_every_n: Option<usize>, | ||||
|         mut progress_callback: F, | ||||
|     ) -> Result<Readers> | ||||
|     where | ||||
|         F: FnMut(UpdateIndexingStep), | ||||
|     { | ||||
|         debug!("{:?}: Indexing in a Store...", thread_index); | ||||
|  | ||||
|         let mut before = Instant::now(); | ||||
|         let mut words_positions = HashMap::new(); | ||||
|         let mut facet_numbers_values = HashMap::new(); | ||||
|         let mut facet_strings_values = HashMap::new(); | ||||
|  | ||||
|         let mut count: usize = 0; | ||||
|         while let Some((key, value)) = documents.next()? { | ||||
|             let document_id = key.try_into().map(u32::from_be_bytes).unwrap(); | ||||
|             let document = obkv::KvReader::new(value); | ||||
|  | ||||
|             // We only index the documents assigned to this thread and skip the others. | ||||
|             if count % num_threads == thread_index { | ||||
|                 // This is a log routine that we do every `log_every_n` documents. | ||||
|                 if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { | ||||
|                     info!( | ||||
|                         "We have seen {} documents so far ({:.02?}).", | ||||
|                         format_count(count), | ||||
|                         before.elapsed() | ||||
|                     ); | ||||
|                     progress_callback(UpdateIndexingStep::IndexDocuments { | ||||
|                         documents_seen: count, | ||||
|                         total_documents: documents_count, | ||||
|                     }); | ||||
|                     before = Instant::now(); | ||||
|                 } | ||||
|  | ||||
|                 for (attr, content) in document.iter() { | ||||
|                     if self.filterable_fields.contains(&attr) | ||||
|                         || self.searchable_fields.contains(&attr) | ||||
|                     { | ||||
|                         let value = | ||||
|                             serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; | ||||
|  | ||||
|                         if self.filterable_fields.contains(&attr) { | ||||
|                             let (facet_numbers, facet_strings) = extract_facet_values(&value); | ||||
|                             facet_numbers_values | ||||
|                                 .entry(attr) | ||||
|                                 .or_insert_with(Vec::new) | ||||
|                                 .extend(facet_numbers); | ||||
|                             facet_strings_values | ||||
|                                 .entry(attr) | ||||
|                                 .or_insert_with(Vec::new) | ||||
|                                 .extend(facet_strings); | ||||
|                         } | ||||
|  | ||||
|                         if self.searchable_fields.contains(&attr) { | ||||
|                             let content = match json_to_string(&value) { | ||||
|                                 Some(content) => content, | ||||
|                                 None => continue, | ||||
|                             }; | ||||
|  | ||||
|                             let analyzed = self.analyzer.analyze(&content); | ||||
|                             let tokens = process_tokens(analyzed.tokens()); | ||||
|  | ||||
|                             let mut last_pos = None; | ||||
|                             for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { | ||||
|                                 last_pos = Some(pos); | ||||
|                                 let position = (attr as usize * MAX_POSITION + pos) as u32; | ||||
|                                 words_positions | ||||
|                                     .entry(token.text().to_string()) | ||||
|                                     .or_insert_with(SmallVec32::new) | ||||
|                                     .push(position); | ||||
|                             } | ||||
|  | ||||
|                             if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { | ||||
|                                 let key = (attr, last_pos as u8 + 1); | ||||
|                                 self.field_id_word_count_docids | ||||
|                                     .entry(key) | ||||
|                                     .or_insert_with(RoaringBitmap::new) | ||||
|                                     .insert(document_id); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // We write the document in the documents store. | ||||
|                 self.write_document( | ||||
|                     document_id, | ||||
|                     &mut words_positions, | ||||
|                     &mut facet_numbers_values, | ||||
|                     &mut facet_strings_values, | ||||
|                     value, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             // Count this document and move on to the next one. | ||||
|             count += 1; | ||||
|         } | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::IndexDocuments { | ||||
|             documents_seen: count, | ||||
|             total_documents: documents_count, | ||||
|         }); | ||||
|  | ||||
|         let readers = self.finish()?; | ||||
|         debug!("{:?}: Store created!", thread_index); | ||||
|         Ok(readers) | ||||
|     } | ||||
|  | ||||
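The `count % num_threads == thread_index` check above is a plain round-robin split: every store reads the whole document stream but only indexes its own share. A small sketch of that partitioning rule in isolation (names are illustrative):

```rust
// Sketch: round-robin assignment of documents to indexing threads,
// mirroring the `count % num_threads == thread_index` test above.
fn documents_for_thread(total: usize, thread_index: usize, num_threads: usize) -> Vec<usize> {
    (0..total).filter(|count| count % num_threads == thread_index).collect()
}

fn main() {
    let num_threads = 4;
    // Every document index is claimed by exactly one thread.
    let mut seen = vec![0u32; 10];
    for thread_index in 0..num_threads {
        for doc in documents_for_thread(10, thread_index, num_threads) {
            seen[doc] += 1;
        }
    }
    assert!(seen.iter().all(|&n| n == 1));
}
```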
|     fn finish(mut self) -> Result<Readers> { | ||||
|         let comp_type = self.chunk_compression_type; | ||||
|         let comp_level = self.chunk_compression_level; | ||||
|         let shrink_size = self.chunk_fusing_shrink_size; | ||||
|  | ||||
|         Self::write_word_docids(&mut self.word_docids_sorter, self.word_docids)?; | ||||
|         Self::write_words_pairs_proximities( | ||||
|             &mut self.words_pairs_proximities_docids_sorter, | ||||
|             self.words_pairs_proximities_docids, | ||||
|         )?; | ||||
|         Self::write_facet_field_number_docids( | ||||
|             &mut self.facet_field_numbers_docids_sorter, | ||||
|             self.facet_field_number_docids, | ||||
|         )?; | ||||
|  | ||||
|         Self::write_facet_field_string_docids( | ||||
|             &mut self.facet_field_strings_docids_sorter, | ||||
|             self.facet_field_string_docids, | ||||
|         )?; | ||||
|  | ||||
|         let mut word_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|  | ||||
|         let mut iter = self.word_docids_sorter.into_iter()?; | ||||
|         while let Some((word, val)) = iter.next()? { | ||||
|             // The sorter yields the words in lexicographic order, | ||||
|             // so we can use the keys to construct the words fst. | ||||
|             builder.insert(word)?; | ||||
|             word_docids_wtr.insert(word, val)?; | ||||
|         } | ||||
|  | ||||
|         let mut docids_buffer = Vec::new(); | ||||
|         for ((fid, count), docids) in self.field_id_word_count_docids { | ||||
|             docids_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); | ||||
|             let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]); | ||||
|             self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?; | ||||
|         } | ||||
|  | ||||
|         let fst = builder.into_set(); | ||||
|         self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?; | ||||
|  | ||||
|         let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.main_sorter.write_into(&mut main_wtr)?; | ||||
|  | ||||
|         let mut words_pairs_proximities_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.words_pairs_proximities_docids_sorter | ||||
|             .write_into(&mut words_pairs_proximities_docids_wtr)?; | ||||
|  | ||||
|         let mut word_level_position_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; | ||||
|  | ||||
|         let mut field_id_word_count_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; | ||||
|  | ||||
|         let mut facet_field_numbers_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; | ||||
|  | ||||
|         let mut facet_field_strings_docids_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; | ||||
|  | ||||
|         let mut field_id_docid_facet_numbers_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_docid_facet_numbers_sorter | ||||
|             .write_into(&mut field_id_docid_facet_numbers_wtr)?; | ||||
|  | ||||
|         let mut field_id_docid_facet_strings_wtr = | ||||
|             tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_docid_facet_strings_sorter | ||||
|             .write_into(&mut field_id_docid_facet_strings_wtr)?; | ||||
|  | ||||
|         let main = writer_into_reader(main_wtr, shrink_size)?; | ||||
|         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; | ||||
|         let words_pairs_proximities_docids = | ||||
|             writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; | ||||
|         let word_level_position_docids = | ||||
|             writer_into_reader(word_level_position_docids_wtr, shrink_size)?; | ||||
|         let field_id_word_count_docids = | ||||
|             writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; | ||||
|         let facet_field_numbers_docids = | ||||
|             writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; | ||||
|         let facet_field_strings_docids = | ||||
|             writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_numbers = | ||||
|             writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_strings = | ||||
|             writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; | ||||
|         let docid_word_positions = | ||||
|             writer_into_reader(self.docid_word_positions_writer, shrink_size)?; | ||||
|         let documents = writer_into_reader(self.documents_writer, shrink_size)?; | ||||
|  | ||||
|         Ok(Readers { | ||||
|             main, | ||||
|             word_docids, | ||||
|             docid_word_positions, | ||||
|             words_pairs_proximities_docids, | ||||
|             word_level_position_docids, | ||||
|             field_id_word_count_docids, | ||||
|             facet_field_numbers_docids, | ||||
|             facet_field_strings_docids, | ||||
|             field_id_docid_facet_numbers, | ||||
|             field_id_docid_facet_strings, | ||||
|             documents, | ||||
|         }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Outputs, for every pair of words, the shortest proximity between them when it lies | ||||
| /// between 1 and 7 inclusive. | ||||
| /// | ||||
| /// This list is used by the engine to find the documents that contain words which are | ||||
| /// close to each other. | ||||
| fn compute_words_pair_proximities( | ||||
|     word_positions: &HashMap<String, SmallVec32<Position>>, | ||||
| ) -> HashMap<(&str, &str), u8> { | ||||
|     use itertools::Itertools; | ||||
|  | ||||
|     let mut words_pair_proximities = HashMap::new(); | ||||
|     for ((w1, ps1), (w2, ps2)) in word_positions.iter().cartesian_product(word_positions) { | ||||
|         let mut min_prox = None; | ||||
|         for (ps1, ps2) in ps1.iter().cartesian_product(ps2) { | ||||
|             let prox = crate::proximity::positions_proximity(*ps1, *ps2); | ||||
|             let prox = u8::try_from(prox).unwrap(); | ||||
|             // We don't care about words that appear at the | ||||
|             // same position or that are too far from each other. | ||||
|             if prox >= 1 && prox <= 7 && min_prox.map_or(true, |mp| prox < mp) { | ||||
|                 min_prox = Some(prox) | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if let Some(min_prox) = min_prox { | ||||
|             words_pair_proximities.insert((w1.as_str(), w2.as_str()), min_prox); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     words_pair_proximities | ||||
| } | ||||
|  | ||||
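To make the doc comment above concrete, here is a standalone sketch of the same pairing rule with a simplified proximity function. The real `positions_proximity` also takes the attribute encoded in the position into account, so the helper below is only an assumption for illustration:

```rust
use std::collections::HashMap;

// Illustrative stand-in for crate::proximity::positions_proximity:
// here it is simply the absolute distance between two positions.
fn positions_proximity(a: u32, b: u32) -> u32 {
    if a > b { a - b } else { b - a }
}

// Keep, for every ordered pair of words, the smallest proximity in 1..=7.
fn words_pair_proximities(
    word_positions: &HashMap<String, Vec<u32>>,
) -> HashMap<(String, String), u8> {
    let mut pairs = HashMap::new();
    for (w1, ps1) in word_positions {
        for (w2, ps2) in word_positions {
            let min_prox = ps1
                .iter()
                .flat_map(|p1| ps2.iter().map(move |p2| positions_proximity(*p1, *p2)))
                .filter(|prox| (1..=7).contains(prox))
                .min();
            if let Some(prox) = min_prox {
                pairs.insert((w1.clone(), w2.clone()), prox as u8);
            }
        }
    }
    pairs
}

fn main() {
    let mut positions = HashMap::new();
    positions.insert("hello".to_string(), vec![0]);
    positions.insert("world".to_string(), vec![1]);
    let pairs = words_pair_proximities(&positions);
    assert_eq!(pairs.get(&("hello".to_string(), "world".to_string())), Some(&1));
}
```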
| fn format_count(n: usize) -> String { | ||||
|     human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64) | ||||
| } | ||||
|  | ||||
| fn lmdb_key_valid_size(key: &[u8]) -> bool { | ||||
|     !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH | ||||
| } | ||||
|  | ||||
| /// Takes an iterator over tokens and computes their relative positions depending on the separator kinds: | ||||
| /// if it's a `Hard` separator we add a relative proximity of 8 between words, | ||||
| /// otherwise we keep the standard proximity of 1 between words. | ||||
| fn process_tokens<'a>( | ||||
|     tokens: impl Iterator<Item = Token<'a>>, | ||||
| ) -> impl Iterator<Item = (usize, Token<'a>)> { | ||||
|     tokens | ||||
|         .skip_while(|token| token.is_separator().is_some()) | ||||
|         .scan((0, None), |(offset, prev_kind), token| { | ||||
|             match token.kind { | ||||
|                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { | ||||
|                     *offset += match *prev_kind { | ||||
|                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, | ||||
|                         Some(_) => 1, | ||||
|                         None => 0, | ||||
|                     }; | ||||
|                     *prev_kind = Some(token.kind) | ||||
|                 } | ||||
|                 TokenKind::Separator(SeparatorKind::Hard) => { | ||||
|                     *prev_kind = Some(token.kind); | ||||
|                 } | ||||
|                 TokenKind::Separator(SeparatorKind::Soft) | ||||
|                     if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => | ||||
|                 { | ||||
|                     *prev_kind = Some(token.kind); | ||||
|                 } | ||||
|                 _ => (), | ||||
|             } | ||||
|             Some((*offset, token)) | ||||
|         }) | ||||
|         .filter(|(_, t)| t.is_word()) | ||||
| } | ||||
|  | ||||
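A standalone sketch of that positioning rule, using a stripped-down token model instead of the analyzer's `Token` type; the `Tok` enum below is an assumption made for the example, only the +8 / +1 offset rule comes from the code above:

```rust
// Simplified token kinds standing in for the analyzer's TokenKind/SeparatorKind.
enum Tok<'a> {
    Word(&'a str),
    SoftSep, // e.g. a space
    HardSep, // e.g. a period
}

// Assign relative positions: +1 after a word or soft separator, +8 after a hard one.
fn positions<'a>(tokens: &[Tok<'a>]) -> Vec<(usize, &'a str)> {
    let mut out = Vec::new();
    let mut offset = 0;
    let mut hard_pending = false;
    let mut first = true;
    for token in tokens {
        match token {
            Tok::Word(w) => {
                if !first {
                    offset += if hard_pending { 8 } else { 1 };
                }
                first = false;
                hard_pending = false;
                out.push((offset, *w));
            }
            Tok::HardSep => hard_pending = true,
            Tok::SoftSep => {}
        }
    }
    out
}

fn main() {
    use Tok::*;
    let tokens = [Word("new"), SoftSep, Word("york"), HardSep, Word("subway")];
    // "york" is 1 position after "new", "subway" is 8 positions after "york".
    assert_eq!(positions(&tokens), vec![(0, "new"), (1, "york"), (9, "subway")]);
}
```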
| fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { | ||||
|     fn inner_extract_facet_values( | ||||
|         value: &Value, | ||||
|         can_recurse: bool, | ||||
|         output_numbers: &mut Vec<f64>, | ||||
|         output_strings: &mut Vec<(String, String)>, | ||||
|     ) { | ||||
|         match value { | ||||
|             Value::Null => (), | ||||
|             Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), | ||||
|             Value::Number(number) => { | ||||
|                 if let Some(float) = number.as_f64() { | ||||
|                     output_numbers.push(float); | ||||
|                 } | ||||
|             } | ||||
|             Value::String(original) => { | ||||
|                 let normalized = original.trim().to_lowercase(); | ||||
|                 output_strings.push((normalized, original.clone())); | ||||
|             } | ||||
|             Value::Array(values) => { | ||||
|                 if can_recurse { | ||||
|                     for value in values { | ||||
|                         inner_extract_facet_values(value, false, output_numbers, output_strings); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             Value::Object(_) => (), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut facet_number_values = Vec::new(); | ||||
|     let mut facet_string_values = Vec::new(); | ||||
|     inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); | ||||
|  | ||||
|     (facet_number_values, facet_string_values) | ||||
| } | ||||
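As a usage-level illustration of the facet extraction above: the string rule keeps the trimmed, lowercased form next to the original, and numbers are kept only when they fit in an `f64`. A small sketch of those two rules in isolation (using `serde_json`, which the crate already depends on):

```rust
use serde_json::{json, Value};

// Mirror of the string-facet rule above: the normalized form is the
// trimmed, lowercased original, and both are kept together.
fn normalize_facet_string(original: &str) -> (String, String) {
    (original.trim().to_lowercase(), original.to_string())
}

fn main() {
    assert_eq!(
        normalize_facet_string("  Blue Jeans "),
        ("blue jeans".to_string(), "  Blue Jeans ".to_string())
    );

    // Numbers are kept only when they can be represented as an f64.
    let n = json!(7u64);
    assert_eq!(n.as_f64(), Some(7.0));

    // An array nested inside another array is ignored by the extraction above,
    // since it only recurses one level deep.
    let _too_deep: Value = json!([["ignored"]]);
}
```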
|   | ||||
| @@ -11,15 +11,14 @@ use log::info; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::{Map, Value}; | ||||
|  | ||||
| use super::merge_function::merge_two_obkvs; | ||||
| use super::{create_sorter, create_writer, IndexDocumentsMethod}; | ||||
| use crate::error::{Error, InternalError, UserError}; | ||||
| use crate::index::db_name; | ||||
| use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; | ||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||
| use crate::{ | ||||
|     ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, MergeFn, Result, BEU32, | ||||
| use super::helpers::{ | ||||
|     create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, | ||||
| }; | ||||
| use super::IndexDocumentsMethod; | ||||
| use crate::error::{InternalError, UserError}; | ||||
| use crate::index::db_name; | ||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||
| use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; | ||||
|  | ||||
| const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | ||||
|  | ||||
| @@ -46,7 +45,6 @@ pub struct Transform<'t, 'i> { | ||||
|     pub log_every_n: Option<usize>, | ||||
|     pub chunk_compression_type: CompressionType, | ||||
|     pub chunk_compression_level: Option<u32>, | ||||
|     pub chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub max_nb_chunks: Option<usize>, | ||||
|     pub max_memory: Option<usize>, | ||||
|     pub index_documents_method: IndexDocumentsMethod, | ||||
| @@ -149,7 +147,6 @@ impl Transform<'_, '_> { | ||||
|             merge_function, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
| @@ -169,7 +166,7 @@ impl Transform<'_, '_> { | ||||
|             } | ||||
|  | ||||
|             obkv_buffer.clear(); | ||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||
|             let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||
|  | ||||
|             // We prepare the fields ids map with the documents keys. | ||||
|             for (key, _value) in &document { | ||||
| @@ -209,7 +206,6 @@ impl Transform<'_, '_> { | ||||
|                         .map_err(InternalError::SerdeJson)?; | ||||
|                     writer.insert(field_id, &json_buffer)?; | ||||
|                 } | ||||
|  | ||||
|                 // We validate the document id [a-zA-Z0-9\-_]. | ||||
|                 if field_id == primary_key_id && validate_document_id(&external_id).is_none() { | ||||
|                     return Err(UserError::InvalidDocumentId { | ||||
| @@ -291,7 +287,6 @@ impl Transform<'_, '_> { | ||||
|             keep_latest_obkv, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
| @@ -306,7 +301,7 @@ impl Transform<'_, '_> { | ||||
|         let mut record = csv::StringRecord::new(); | ||||
|         while csv.read_record(&mut record).map_err(UserError::Csv)? { | ||||
|             obkv_buffer.clear(); | ||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||
|             let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||
|  | ||||
|             if self.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||
|                 progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { | ||||
| @@ -372,9 +367,9 @@ impl Transform<'_, '_> { | ||||
|     /// Generate the `TransformOutput` based on the given sorter that can be generated from any | ||||
|     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document | ||||
|     /// id for the user side and the value must be an obkv where keys are valid fields ids. | ||||
|     fn output_from_sorter<F, E>( | ||||
|     fn output_from_sorter<F>( | ||||
|         self, | ||||
|         sorter: grenad::Sorter<MergeFn<E>>, | ||||
|         sorter: grenad::Sorter<MergeFn>, | ||||
|         primary_key: String, | ||||
|         fields_ids_map: FieldsIdsMap, | ||||
|         approximate_number_of_documents: usize, | ||||
| @@ -383,7 +378,6 @@ impl Transform<'_, '_> { | ||||
|     ) -> Result<TransformOutput> | ||||
|     where | ||||
|         F: Fn(UpdateIndexingStep) + Sync, | ||||
|         Error: From<E>, | ||||
|     { | ||||
|         let documents_ids = self.index.documents_ids(self.rtxn)?; | ||||
|         let mut field_distribution = self.index.field_distribution(self.rtxn)?; | ||||
| @@ -391,10 +385,15 @@ impl Transform<'_, '_> { | ||||
|  | ||||
|         // Once we have sorted and deduplicated the documents, we write them into a final file. | ||||
|         let mut final_sorter = create_sorter( | ||||
|             |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), | ||||
|             |_id, obkvs| { | ||||
|                 if obkvs.len() == 1 { | ||||
|                     Ok(obkvs[0].clone()) | ||||
|                 } else { | ||||
|                     Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) | ||||
|                 } | ||||
|             }, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
| @@ -405,7 +404,7 @@ impl Transform<'_, '_> { | ||||
|  | ||||
|         // While we write into the final file we get or generate the internal documents ids. | ||||
|         let mut documents_count = 0; | ||||
|         let mut iter = sorter.into_iter()?; | ||||
|         let mut iter = sorter.into_merger_iter()?; | ||||
|         while let Some((external_id, update_obkv)) = iter.next()? { | ||||
|             if self.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||
|                 progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { | ||||
| @@ -534,7 +533,7 @@ impl Transform<'_, '_> { | ||||
|             let docid = docid.get(); | ||||
|  | ||||
|             obkv_buffer.clear(); | ||||
|             let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||
|             let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||
|  | ||||
|             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. | ||||
|             for (id, name) in new_fields_ids_map.iter() { | ||||
|   | ||||
milli/src/update/index_documents/typed_chunk.rs (new file, 282 lines)
							| @@ -0,0 +1,282 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::fs::File; | ||||
|  | ||||
| use heed::types::ByteSlice; | ||||
| use heed::{BytesDecode, RwTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, | ||||
| }; | ||||
| use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| use crate::update::index_documents::helpers::into_clonable_grenad; | ||||
| use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result}; | ||||
|  | ||||
| pub(crate) enum TypedChunk { | ||||
|     DocidWordPositions(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>), | ||||
|     Documents(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdWordcountDocids(grenad::Reader<File>), | ||||
|     NewDocumentsIds(RoaringBitmap), | ||||
|     WordDocids(grenad::Reader<File>), | ||||
|     WordLevelPositionDocids(grenad::Reader<File>), | ||||
|     WordPairProximityDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetStringDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetNumberDocids(grenad::Reader<File>), | ||||
| } | ||||
|  | ||||
| /// Write a typed chunk into the corresponding LMDB database of the provided index. | ||||
| /// Return the new documents ids seen. | ||||
| pub(crate) fn write_typed_chunk_into_index( | ||||
|     typed_chunk: TypedChunk, | ||||
|     index: &Index, | ||||
|     wtxn: &mut RwTxn, | ||||
|     index_is_empty: bool, | ||||
| ) -> Result<(RoaringBitmap, bool)> { | ||||
|     let mut is_merged_database = false; | ||||
|     match typed_chunk { | ||||
|         TypedChunk::DocidWordPositions(docid_word_positions_iter) => { | ||||
|             write_entries_into_database( | ||||
|                 docid_word_positions_iter, | ||||
|                 &index.docid_word_positions, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, buffer| { | ||||
|                     // ensure that values are unique and ordered | ||||
|                     let positions = roaring_bitmap_from_u32s_array(value); | ||||
|                     BoRoaringBitmapCodec::serialize_into(&positions, buffer); | ||||
|                     Ok(buffer) | ||||
|                 }, | ||||
|                 |new_values, db_values, buffer| { | ||||
|                     let new_values = roaring_bitmap_from_u32s_array(new_values); | ||||
|                     let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) { | ||||
|                         Some(db_values) => new_values | db_values, | ||||
|                         None => new_values, // should not happen | ||||
|                     }; | ||||
|                     BoRoaringBitmapCodec::serialize_into(&positions, buffer); | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|             )?; | ||||
|         } | ||||
|         TypedChunk::Documents(mut obkv_documents_iter) => { | ||||
|             while let Some((key, value)) = obkv_documents_iter.next()? { | ||||
|                 index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?; | ||||
|             } | ||||
|         } | ||||
|         TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 fid_word_count_docids_iter, | ||||
|                 &index.field_id_word_count_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::NewDocumentsIds(documents_ids) => { | ||||
|             return Ok((documents_ids, is_merged_database)) | ||||
|         } | ||||
|         TypedChunk::WordDocids(word_docids_iter) => { | ||||
|             let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; | ||||
|             append_entries_into_database( | ||||
|                 word_docids_iter.clone(), | ||||
|                 &index.word_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_roaring_bitmaps, | ||||
|             )?; | ||||
|  | ||||
|             // create fst from word docids | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|             while let Some((word, _value)) = word_docids_iter.next()? { | ||||
|                 // The reader yields the words in lexicographic order, | ||||
|                 // so we can use the keys to construct the words fst. | ||||
|                 builder.insert(word)?; | ||||
|             } | ||||
|             let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; | ||||
|             let db_fst = index.words_fst(wtxn)?; | ||||
|  | ||||
|             // merge new fst with database fst | ||||
|             let union_stream = fst.op().add(db_fst.stream()).union(); | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|             builder.extend_stream(union_stream)?; | ||||
|             let fst = builder.into_set(); | ||||
|             index.put_words_fst(wtxn, &fst)?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 word_level_position_docids_iter, | ||||
|                 &index.word_level_position_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 facet_id_f64_docids_iter, | ||||
|                 &index.facet_id_f64_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 word_pair_proximity_docids_iter, | ||||
|                 &index.word_pair_proximity_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { | ||||
|             let index_fid_docid_facet_numbers = | ||||
|                 index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>(); | ||||
|             while let Some((key, value)) = fid_docid_facet_number.next()? { | ||||
|                 if valid_lmdb_key(key) { | ||||
|                     index_fid_docid_facet_numbers.put(wtxn, key, &value)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         TypedChunk::FieldIdDocidFacetStrings(mut fid_docid_facet_string) => { | ||||
|             let index_fid_docid_facet_strings = | ||||
|                 index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>(); | ||||
|             while let Some((key, value)) = fid_docid_facet_string.next()? { | ||||
|                 if valid_lmdb_key(key) { | ||||
|                     index_fid_docid_facet_strings.put(wtxn, key, &value)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { | ||||
|             append_entries_into_database( | ||||
|                 facet_id_string_docids, | ||||
|                 &index.facet_id_string_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 |new_values, db_values, buffer| { | ||||
|                     let (_, new_values) = decode_prefix_string(new_values).unwrap(); | ||||
|                     let new_values = RoaringBitmap::deserialize_from(new_values)?; | ||||
|                     let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); | ||||
|                     let db_values = RoaringBitmap::deserialize_from(db_values)?; | ||||
|                     let values = new_values | db_values; | ||||
|                     encode_prefix_string(db_original, buffer)?; | ||||
|                     Ok(values.serialize_into(buffer)?) | ||||
|                 }, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok((RoaringBitmap::new(), is_merged_database)) | ||||
| } | ||||
|  | ||||
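The `WordDocids` arm above rebuilds the words FST by unioning the freshly built set with the one already stored in the index. The same union pattern can be shown in isolation with the `fst` crate that the project already depends on; the two sets below are toy stand-ins for the new and stored FSTs:

```rust
use fst::{IntoStreamer, Set, SetBuilder, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A set standing in for the words FST already stored in the index...
    let db_fst = Set::from_iter(vec!["hello", "world"])?;
    // ...and one built from the keys of the freshly indexed word docids.
    let new_fst = Set::from_iter(vec!["hella", "hello", "zebra"])?;

    // Union the two sets into a new FST, like the chunk writer does above.
    let mut builder = SetBuilder::memory();
    builder.extend_stream(new_fst.op().add(db_fst.stream()).union())?;
    let merged = builder.into_set();

    // The merged set streams its words in lexicographic order.
    let mut words = Vec::new();
    let mut stream = merged.into_stream();
    while let Some(word) = stream.next() {
        words.push(String::from_utf8(word.to_vec())?);
    }
    assert_eq!(words, vec!["hella", "hello", "world", "zebra"]);
    Ok(())
}
```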
| fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { | ||||
|     let new_value = RoaringBitmap::deserialize_from(new_value)?; | ||||
|     let db_value = RoaringBitmap::deserialize_from(db_value)?; | ||||
|     let value = new_value | db_value; | ||||
|     Ok(serialize_roaring_bitmap(&value, buffer)?) | ||||
| } | ||||
|  | ||||
| fn merge_cbo_roaring_bitmaps( | ||||
|     new_value: &[u8], | ||||
|     db_value: &[u8], | ||||
|     buffer: &mut Vec<u8>, | ||||
| ) -> Result<()> { | ||||
|     Ok(CboRoaringBitmapCodec::merge_into( | ||||
|         &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], | ||||
|         buffer, | ||||
|     )?) | ||||
| } | ||||
|  | ||||
| /// Write the provided entries into the database using the `serialize_value` function. | ||||
| /// The `merge_values` function is used whenever an entry already exists in the database. | ||||
| fn write_entries_into_database<R, K, V, FS, FM>( | ||||
|     mut data: grenad::Reader<R>, | ||||
|     database: &heed::Database<K, V>, | ||||
|     wtxn: &mut RwTxn, | ||||
|     index_is_empty: bool, | ||||
|     serialize_value: FS, | ||||
|     merge_values: FM, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     R: std::io::Read, | ||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, | ||||
| { | ||||
|     let mut buffer = Vec::new(); | ||||
|     let database = database.remap_types::<ByteSlice, ByteSlice>(); | ||||
|  | ||||
|     while let Some((key, value)) = data.next()? { | ||||
|         if valid_lmdb_key(key) { | ||||
|             buffer.clear(); | ||||
|             let value = if index_is_empty { | ||||
|                 serialize_value(value, &mut buffer)? | ||||
|             } else { | ||||
|                 match database.get(wtxn, key)? { | ||||
|                     Some(prev_value) => { | ||||
|                         merge_values(value, prev_value, &mut buffer)?; | ||||
|                         &buffer[..] | ||||
|                     } | ||||
|                     None => serialize_value(value, &mut buffer)?, | ||||
|                 } | ||||
|             }; | ||||
|             database.put(wtxn, key, value)?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Write the provided entries into the database using the `serialize_value` function. | ||||
| /// The `merge_values` function is used whenever an entry already exists in the database. | ||||
| /// All provided entries must be ordered. | ||||
| /// If the index is not empty, `write_entries_into_database` is called instead. | ||||
| fn append_entries_into_database<R, K, V, FS, FM>( | ||||
|     mut data: grenad::Reader<R>, | ||||
|     database: &heed::Database<K, V>, | ||||
|     wtxn: &mut RwTxn, | ||||
|     index_is_empty: bool, | ||||
|     serialize_value: FS, | ||||
|     merge_values: FM, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     R: std::io::Read, | ||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, | ||||
| { | ||||
|     if !index_is_empty { | ||||
|         return write_entries_into_database( | ||||
|             data, | ||||
|             database, | ||||
|             wtxn, | ||||
|             false, | ||||
|             serialize_value, | ||||
|             merge_values, | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); | ||||
|  | ||||
|     while let Some((key, value)) = data.next()? { | ||||
|         if valid_lmdb_key(key) { | ||||
|             buffer.clear(); | ||||
|             let value = serialize_value(value, &mut buffer)?; | ||||
|             unsafe { database.append(key, value)? }; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
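The two helpers above implement a serialize-or-merge pattern: a value is stored as-is the first time its key is seen and merged with the stored value afterwards, or blindly appended when the index started empty and the keys arrive sorted. A minimal sketch of that decision logic, with an in-memory map standing in for LMDB:

```rust
use std::collections::BTreeMap;

// In-memory stand-in for the LMDB database, to illustrate the
// serialize-or-merge pattern used by write_entries_into_database.
fn write_entries(
    db: &mut BTreeMap<Vec<u8>, Vec<u8>>,
    entries: impl IntoIterator<Item = (Vec<u8>, Vec<u8>)>,
    merge: impl Fn(&[u8], &[u8]) -> Vec<u8>,
) {
    for (key, value) in entries {
        let value = match db.get(&key) {
            // The entry already exists: merge the new value with the stored one.
            Some(prev) => merge(&value, prev),
            // First time we see this key: store the value as-is.
            None => value,
        };
        db.insert(key, value);
    }
}

fn main() {
    let mut db = BTreeMap::new();
    let concat = |new: &[u8], old: &[u8]| [old, new].concat();
    write_entries(&mut db, vec![(b"k".to_vec(), b"a".to_vec())], &concat);
    write_entries(&mut db, vec![(b"k".to_vec(), b"b".to_vec())], &concat);
    assert_eq!(db.get(&b"k".to_vec()), Some(&b"ab".to_vec()));
}
```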
| @@ -65,10 +65,9 @@ pub struct Settings<'a, 't, 'u, 'i> { | ||||
|     pub(crate) log_every_n: Option<usize>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     pub(crate) linked_hash_map_size: Option<usize>, | ||||
|     pub(crate) documents_chunk_size: Option<usize>, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) thread_pool: Option<&'a ThreadPool>, | ||||
|     update_id: u64, | ||||
|  | ||||
| @@ -95,10 +94,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|             log_every_n: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             linked_hash_map_size: None, | ||||
|             documents_chunk_size: None, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             thread_pool: None, | ||||
|             searchable_fields: Setting::NotSet, | ||||
|             displayed_fields: Setting::NotSet, | ||||
| @@ -205,7 +203,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|             log_every_n: self.log_every_n, | ||||
|             chunk_compression_type: self.chunk_compression_type, | ||||
|             chunk_compression_level: self.chunk_compression_level, | ||||
|             chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, | ||||
|             max_nb_chunks: self.max_nb_chunks, | ||||
|             max_memory: self.max_memory, | ||||
|             index_documents_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
| @@ -232,10 +229,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         indexing_builder.log_every_n = self.log_every_n; | ||||
|         indexing_builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         indexing_builder.max_memory = self.max_memory; | ||||
|         indexing_builder.linked_hash_map_size = self.linked_hash_map_size; | ||||
|         indexing_builder.documents_chunk_size = self.documents_chunk_size; | ||||
|         indexing_builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         indexing_builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         indexing_builder.thread_pool = self.thread_pool; | ||||
|         indexing_builder.execute_raw(output, &cb)?; | ||||
|  | ||||
|   | ||||
| @@ -7,11 +7,10 @@ use crate::{Index, Result}; | ||||
| pub struct UpdateBuilder<'a> { | ||||
|     pub(crate) log_every_n: Option<usize>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) documents_chunk_size: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     pub(crate) linked_hash_map_size: Option<usize>, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) thread_pool: Option<&'a ThreadPool>, | ||||
|     pub(crate) update_id: u64, | ||||
| } | ||||
| @@ -21,11 +20,10 @@ impl<'a> UpdateBuilder<'a> { | ||||
|         UpdateBuilder { | ||||
|             log_every_n: None, | ||||
|             max_nb_chunks: None, | ||||
|             documents_chunk_size: None, | ||||
|             max_memory: None, | ||||
|             linked_hash_map_size: None, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             thread_pool: None, | ||||
|             update_id, | ||||
|         } | ||||
| @@ -43,8 +41,8 @@ impl<'a> UpdateBuilder<'a> { | ||||
|         self.max_memory = Some(max_memory); | ||||
|     } | ||||
|  | ||||
|     pub fn linked_hash_map_size(&mut self, linked_hash_map_size: usize) { | ||||
|         self.linked_hash_map_size = Some(linked_hash_map_size); | ||||
|     pub fn documents_chunk_size(&mut self, documents_chunk_size: usize) { | ||||
|         self.documents_chunk_size = Some(documents_chunk_size); | ||||
|     } | ||||
|  | ||||
|     pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { | ||||
| @@ -55,10 +53,6 @@ impl<'a> UpdateBuilder<'a> { | ||||
|         self.chunk_compression_level = Some(chunk_compression_level); | ||||
|     } | ||||
|  | ||||
|     pub fn chunk_fusing_shrink_size(&mut self, chunk_fusing_shrink_size: u64) { | ||||
|         self.chunk_fusing_shrink_size = Some(chunk_fusing_shrink_size); | ||||
|     } | ||||
|  | ||||
|     pub fn thread_pool(&mut self, thread_pool: &'a ThreadPool) { | ||||
|         self.thread_pool = Some(thread_pool); | ||||
|     } | ||||
| @@ -89,10 +83,9 @@ impl<'a> UpdateBuilder<'a> { | ||||
|         builder.log_every_n = self.log_every_n; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.linked_hash_map_size = self.linked_hash_map_size; | ||||
|         builder.documents_chunk_size = self.documents_chunk_size; | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.thread_pool = self.thread_pool; | ||||
|  | ||||
|         builder | ||||
| @@ -108,10 +101,9 @@ impl<'a> UpdateBuilder<'a> { | ||||
|         builder.log_every_n = self.log_every_n; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.linked_hash_map_size = self.linked_hash_map_size; | ||||
|         builder.documents_chunk_size = self.documents_chunk_size; | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.thread_pool = self.thread_pool; | ||||
|  | ||||
|         builder | ||||
| @@ -126,7 +118,6 @@ impl<'a> UpdateBuilder<'a> { | ||||
|  | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|  | ||||
|         builder | ||||
|     } | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
|  | ||||
| use crate::update::index_documents::{ | ||||
|     create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, | ||||
|     create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, | ||||
| }; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| @@ -14,7 +14,6 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
| } | ||||
| @@ -29,12 +28,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("WordPrefixDocids::{}")] | ||||
|     pub fn execute(self) -> Result<()> { | ||||
|         // Clear the word prefix docids database. | ||||
|         self.index.word_prefix_docids.clear(self.wtxn)?; | ||||
| @@ -44,10 +43,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|         // It is forbidden to keep a mutable reference into the database | ||||
|         // and write into it at the same time, therefore we write into another file. | ||||
|         let mut prefix_docids_sorter = create_sorter( | ||||
|             roaring_bitmap_merge, | ||||
|             merge_roaring_bitmaps, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
| @@ -70,7 +68,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_docids.as_polymorph(), | ||||
|             prefix_docids_sorter, | ||||
|             roaring_bitmap_merge, | ||||
|             merge_roaring_bitmaps, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|   | ||||
| @@ -1,15 +1,13 @@ | ||||
| use std::str; | ||||
| use std::collections::HashMap; | ||||
|  | ||||
| use fst::automaton::{Automaton, Str}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use fst::IntoStreamer; | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::BytesEncode; | ||||
| use log::debug; | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::heed_codec::StrStrU8Codec; | ||||
| use crate::update::index_documents::{ | ||||
|     cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, MergeFn, WriteMethod, | ||||
| }; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| @@ -18,9 +16,9 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     threshold: u32, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
| @@ -33,55 +31,123 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             threshold: 100, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Set the number of words required for a prefix to be part of the words prefixes | ||||
|     /// database. If a word prefix matches more than this number of words in the | ||||
|     /// dictionary, the prefix is added to the words prefixes data structures. | ||||
|     /// | ||||
|     /// The default value is 100. This value must be at least 50 and will be clamped | ||||
|     /// to that bound otherwise. | ||||
|     pub fn threshold(&mut self, value: u32) -> &mut Self { | ||||
|         self.threshold = value.max(50); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("WordPrefixPairProximityDocids::{}")] | ||||
|     pub fn execute(self) -> Result<()> { | ||||
|         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); | ||||
|  | ||||
|         self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|  | ||||
|         // Here we create a sorter akin to the previous one. | ||||
|         let mut word_prefix_pair_proximity_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         // We insert all the word pairs corresponding to the word-prefix pairs | ||||
|         // where the prefixes appear in the prefix FST previously constructed. | ||||
|         let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(); | ||||
|         for result in db.iter(self.wtxn)? { | ||||
|             let ((word1, word2, prox), data) = result?; | ||||
|             let automaton = Str::new(word2).starts_with(); | ||||
|             let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); | ||||
|             while let Some(prefix) = matching_prefixes.next() { | ||||
|                 let prefix = str::from_utf8(prefix)?; | ||||
|                 let pair = (word1, prefix, prox); | ||||
|                 let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); | ||||
|                 word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|         let prefix_fst_keys = prefix_fst.into_stream().into_bytes(); | ||||
|         let prefix_fst_keys: Vec<_> = prefix_fst_keys | ||||
|             .as_slice() | ||||
|             .linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap()) | ||||
|             .collect(); | ||||
|  | ||||
|         let mut db = | ||||
|             self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?; | ||||
|  | ||||
|         let mut buffer = Vec::new(); | ||||
|         let mut current_prefixes: Option<&&[Vec<u8>]> = None; | ||||
|         let mut prefixes_cache = HashMap::new(); | ||||
|         while let Some(((w1, w2, prox), data)) = db.next().transpose()? { | ||||
|             current_prefixes = match current_prefixes.take() { | ||||
|                 Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes), | ||||
|                 _otherwise => { | ||||
|                     write_prefixes_in_sorter( | ||||
|                         &mut prefixes_cache, | ||||
|                         &mut word_prefix_pair_proximity_docids_sorter, | ||||
|                         self.threshold, | ||||
|                     )?; | ||||
|                     prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0])) | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
|             if let Some(prefixes) = current_prefixes { | ||||
|                 buffer.clear(); | ||||
|                 buffer.extend_from_slice(w1.as_bytes()); | ||||
|                 buffer.push(0); | ||||
|                 for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) { | ||||
|                     buffer.truncate(w1.len() + 1); | ||||
|                     buffer.extend_from_slice(prefix); | ||||
|                     buffer.push(prox); | ||||
|  | ||||
|                     match prefixes_cache.get_mut(&buffer) { | ||||
|                         Some(value) => value.push(data), | ||||
|                         None => { | ||||
|                             prefixes_cache.insert(buffer.clone(), vec![data]); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         write_prefixes_in_sorter( | ||||
|             &mut prefixes_cache, | ||||
|             &mut word_prefix_pair_proximity_docids_sorter, | ||||
|             self.threshold, | ||||
|         )?; | ||||
|  | ||||
|         drop(prefix_fst); | ||||
|         drop(db); | ||||
|  | ||||
|         // We finally write the word prefix pair proximity docids into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             word_prefix_pair_proximity_docids_sorter, | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn write_prefixes_in_sorter( | ||||
|     prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>, | ||||
|     sorter: &mut grenad::Sorter<MergeFn>, | ||||
|     min_word_per_prefix: u32, | ||||
| ) -> Result<()> { | ||||
|     for (i, (key, data_slices)) in prefixes.drain().enumerate() { | ||||
|         // if the number of words prefixed by the prefix is higher than the threshold, | ||||
|         // we insert it in the sorter. | ||||
|         if data_slices.len() > min_word_per_prefix as usize { | ||||
|             for data in data_slices { | ||||
|                 sorter.insert(&key, data)?; | ||||
|             } | ||||
|         // if the first prefix isn't eligible for insertion, | ||||
|         // then the other prefixes can't be eligible either. | ||||
|         } else if i == 0 { | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
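The rewritten loop above replaces the per-pair FST automaton search with a single sorted scan: the prefix FST keys are materialized once, grouped by their first character with the slice_group_by crate, and each (w1, w2, prox) entry is only tested against the group that shares w2's first byte. A minimal standalone sketch of that grouping step, assuming hypothetical prefix data rather than milli's real index contents:

use slice_group_by::GroupBy;

fn main() {
    // Hypothetical, lexicographically sorted prefixes, as they would come out of the prefix FST.
    let prefixes: Vec<Vec<u8>> =
        ["a", "ab", "abc", "b", "ba", "c"].iter().map(|s| s.as_bytes().to_vec()).collect();

    // Group the prefixes by their first byte, like the indexer groups the FST keys
    // by first character before scanning the word pair proximity database.
    let groups: Vec<&[Vec<u8>]> = prefixes.as_slice().linear_group_by_key(|p| p[0]).collect();

    // A word only has to be checked against the single group sharing its first byte.
    let w2: &[u8] = b"abacus";
    if let Some(group) = groups.iter().find(|group| w2.starts_with(&group[0])) {
        for prefix in group.iter().filter(|prefix| w2.starts_with(prefix)) {
            println!("abacus starts with {}", std::str::from_utf8(prefix).unwrap());
        }
    }
}

Because both the word pair keys and the prefix groups are sorted, the matching group can be carried along the scan instead of being recomputed for every pair, which is what the current_prefixes/prefixes_cache pair above does before flushing into the sorter.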
| @@ -4,7 +4,7 @@ use std::num::NonZeroU32; | ||||
| use std::{cmp, str}; | ||||
|  | ||||
| use fst::Streamer; | ||||
| use grenad::{CompressionType, FileFuse, Reader, Writer}; | ||||
| use grenad::{CompressionType, Reader, Writer}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore, Str}; | ||||
| use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| @@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError}; | ||||
| use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; | ||||
| use crate::index::main_key::WORDS_PREFIXES_FST_KEY; | ||||
| use crate::update::index_documents::{ | ||||
|     cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, | ||||
|     create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, | ||||
|     write_into_lmdb_database, writer_into_reader, WriteMethod, | ||||
| }; | ||||
| use crate::{Index, Result, TreeLevel}; | ||||
| @@ -24,7 +24,6 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     level_group_size: NonZeroU32, | ||||
| @@ -41,7 +40,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             level_group_size: NonZeroU32::new(4).unwrap(), | ||||
| @@ -59,6 +57,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("WordsLevelPositions::{}")] | ||||
|     pub fn execute(self) -> Result<()> { | ||||
|         debug!("Computing and writing the word levels positions docids into LMDB on disk..."); | ||||
|  | ||||
| @@ -68,7 +67,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             self.index.word_level_position_docids, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.level_group_size, | ||||
|             self.min_level_size, | ||||
|         )?; | ||||
| @@ -81,7 +79,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             self.wtxn, | ||||
|             *self.index.word_level_position_docids.as_polymorph(), | ||||
|             entries, | ||||
|             |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }), | ||||
|             |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
| @@ -89,10 +87,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|         self.index.word_prefix_level_position_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let mut word_prefix_level_positions_docids_sorter = create_sorter( | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
| @@ -131,7 +128,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_level_position_docids.as_polymorph(), | ||||
|             word_prefix_level_positions_docids_sorter, | ||||
|             cbo_roaring_bitmap_merge, | ||||
|             merge_cbo_roaring_bitmaps, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
| @@ -141,7 +138,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             self.index.word_prefix_level_position_docids, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.level_group_size, | ||||
|             self.min_level_size, | ||||
|         )?; | ||||
| @@ -155,7 +151,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|             *self.index.word_prefix_level_position_docids.as_polymorph(), | ||||
|             entries, | ||||
|             |_, _| { | ||||
|                 Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) | ||||
|                 Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })? | ||||
|             }, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
| @@ -185,10 +181,9 @@ fn compute_positions_levels( | ||||
|     words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroU32, | ||||
|     min_level_size: NonZeroU32, | ||||
| ) -> Result<Reader<FileFuse>> { | ||||
| ) -> Result<Reader<File>> { | ||||
|     // It is forbidden to keep a cursor open and write to a database at the same time with LMDB, | ||||
|     // therefore we write the level entries into a grenad file before transferring them. | ||||
|     let mut writer = tempfile::tempfile() | ||||
| @@ -254,7 +249,7 @@ fn compute_positions_levels( | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer, shrink_size) | ||||
|     writer_into_reader(writer) | ||||
| } | ||||
|  | ||||
| fn write_level_entry( | ||||
|   | ||||
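The sorters and databases in this file now merge through merge_cbo_roaring_bitmaps, and the merge callbacks are allowed to fail (hence the `?` after the IndexingMergingKeys branches, which are used for the append-only level databases where duplicate keys are never expected). A rough sketch of what such a fallible merge function can look like; the signature and the plain (non-Cbo) roaring serialization are assumptions of this sketch, not milli's exact MergeFn:

use std::borrow::Cow;
use std::io;

use roaring::RoaringBitmap;

// Union every serialized bitmap seen for the same key and write the result back out.
fn merge_roaring_bitmaps(_key: &[u8], values: &[Cow<'_, [u8]>]) -> io::Result<Vec<u8>> {
    let mut merged = RoaringBitmap::new();
    for value in values {
        // Each value is one serialized bitmap of document ids.
        merged |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    let mut bytes = Vec::new();
    merged.serialize_into(&mut bytes)?;
    Ok(bytes)
}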
| @@ -8,7 +8,7 @@ use crate::{Index, Result, SmallString32}; | ||||
| pub struct WordsPrefixesFst<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     threshold: f64, | ||||
|     threshold: u32, | ||||
|     max_prefix_length: usize, | ||||
|     _update_id: u64, | ||||
| } | ||||
| @@ -22,20 +22,20 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | ||||
|         WordsPrefixesFst { | ||||
|             wtxn, | ||||
|             index, | ||||
|             threshold: 0.1 / 100.0, // .01% | ||||
|             threshold: 100, | ||||
|             max_prefix_length: 4, | ||||
|             _update_id: update_id, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Set the ratio of concerned words required to make a prefix be part of the words prefixes | ||||
|     /// Set the number of words required to make a prefix be part of the words prefixes | ||||
|     /// database. If a word prefix is expected to match more than this number of words in the | ||||
|     /// dictionary, then this prefix is added to the words prefixes data structures. | ||||
|     /// | ||||
|     /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped | ||||
|     /// to these bounds otherwise. | ||||
|     pub fn threshold(&mut self, value: f64) -> &mut Self { | ||||
|         self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] | ||||
|     /// Default value is 100. This value must be at least 50 and will be clamped | ||||
|     /// to this bound otherwise. | ||||
|     pub fn threshold(&mut self, value: u32) -> &mut Self { | ||||
|         self.threshold = value.max(50); | ||||
|         self | ||||
|     } | ||||
|  | ||||
| @@ -48,10 +48,9 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("WordsPrefixesFst::{}")] | ||||
|     pub fn execute(self) -> Result<()> { | ||||
|         let words_fst = self.index.words_fst(&self.wtxn)?; | ||||
|         let number_of_words = words_fst.len(); | ||||
|         let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; | ||||
|  | ||||
|         let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); | ||||
|         for n in 1..=self.max_prefix_length { | ||||
| @@ -80,7 +79,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | ||||
|                 current_prefix_count += 1; | ||||
|  | ||||
|                 // There are enough words corresponding to this prefix to add it to the cache. | ||||
|                 if current_prefix_count == min_number_of_words { | ||||
|                 if current_prefix_count >= self.threshold { | ||||
|                     builder.insert(prefix)?; | ||||
|                 } | ||||
|             } | ||||
|   | ||||
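With the threshold now an absolute word count, a prefix is kept as soon as at least `threshold` consecutive words of the sorted words FST start with it, instead of a ratio of the dictionary size. A simplified sketch of that selection for a single prefix length n; the function name and the counting loop are illustrative and skip milli's handling of multiple prefix lengths:

use fst::{Set, SetBuilder, Streamer};

fn prefixes_over_threshold(
    words: &Set<Vec<u8>>,
    n: usize,
    threshold: u32,
) -> fst::Result<Set<Vec<u8>>> {
    let mut builder = SetBuilder::memory();
    let mut current_prefix: Vec<u8> = Vec::new();
    let mut current_count = 0u32;

    let mut stream = words.stream();
    while let Some(word) = stream.next() {
        if word.len() < n {
            continue;
        }
        if &word[..n] != current_prefix.as_slice() {
            current_prefix = word[..n].to_vec();
            current_count = 0;
        }
        current_count += 1;
        // Words are sorted, so equal prefixes are consecutive; insert each prefix exactly
        // once, the moment it reaches the threshold, keeping the builder's input sorted.
        if current_count == threshold {
            builder.insert(&current_prefix)?;
        }
    }

    Ok(builder.into_set())
}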
| @@ -5,7 +5,7 @@ use big_s::S; | ||||
| use either::{Either, Left, Right}; | ||||
| use heed::EnvOpenOptions; | ||||
| use maplit::{hashmap, hashset}; | ||||
| use milli::update::{IndexDocuments, Settings, UpdateFormat}; | ||||
| use milli::update::{Settings, UpdateBuilder, UpdateFormat}; | ||||
| use milli::{AscDesc, Criterion, DocumentId, Index}; | ||||
| use serde::Deserialize; | ||||
| use slice_group_by::GroupBy; | ||||
| @@ -50,7 +50,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|     builder.execute(|_, _| ()).unwrap(); | ||||
|  | ||||
|     // index documents | ||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); | ||||
|     let mut builder = UpdateBuilder::new(0); | ||||
|     builder.max_memory(10 * 1024 * 1024); // 10MiB | ||||
|     let mut builder = builder.index_documents(&mut wtxn, &index); | ||||
|     builder.update_format(UpdateFormat::JsonStream); | ||||
|     builder.enable_autogenerate_docids(); | ||||
|     builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); | ||||
|   | ||||
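The test setup now routes document indexing through UpdateBuilder so the new indexer's memory budget can be capped (10MiB above). A hedged sketch of the same calls factored into a reusable helper; the helper name, the explicit lifetimes (copied from the builder structs in this diff), and the max_memory_bytes parameter are assumptions of this sketch, not milli's documented API:

use milli::update::{UpdateBuilder, UpdateFormat};
use milli::Index;

// Index a JSON-stream payload while capping the indexer's memory usage.
fn index_json_stream<'t, 'u, 'i>(
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
    index: &'i Index,
    payload: &[u8],
    max_memory_bytes: usize,
) {
    let mut update = UpdateBuilder::new(0);
    update.max_memory(max_memory_bytes);

    let mut builder = update.index_documents(wtxn, index);
    builder.update_format(UpdateFormat::JsonStream);
    builder.enable_autogenerate_docids();
    builder.execute(payload, |_, _| ()).unwrap();
}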