mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Introduce a lot of facet string helper iterators
This commit is contained in:
		
							
								
								
									
										52
									
								
								milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,52 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::num::NonZeroU8; | ||||
|  | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// A codec that stores the field id, level 1 and higher and the groups ids. | ||||
| /// | ||||
| /// It can only be used to encode the facet string of the level 1 or higher. | ||||
| pub struct FacetLevelValueU32Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { | ||||
|     type DItem = (FieldId, NonZeroU8, u32, u32); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, bytes) = bytes.split_first()?; | ||||
|         let (level, bytes) = bytes.split_first()?; | ||||
|         let level = NonZeroU8::new(*level)?; | ||||
|         let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?; | ||||
|         let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?; | ||||
|         Some((*field_id, level, left, right)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { | ||||
|     type EItem = (FieldId, NonZeroU8, u32, u32); | ||||
|  | ||||
|     fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut buffer = [0u8; 16]; | ||||
|  | ||||
|         // Write the big-endian integers. | ||||
|         let bytes = left.to_be_bytes(); | ||||
|         buffer[..4].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         let bytes = right.to_be_bytes(); | ||||
|         buffer[4..8].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         // Then the u32 values just to be able to read them back. | ||||
|         let bytes = left.to_be_bytes(); | ||||
|         buffer[8..12].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         let bytes = right.to_be_bytes(); | ||||
|         buffer[12..].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(buffer.len() + 2); | ||||
|         bytes.push(*field_id); | ||||
|         bytes.push(level.get()); | ||||
|         bytes.extend_from_slice(&buffer); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										49
									
								
								milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										49
									
								
								milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,49 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::str; | ||||
|  | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// A codec that stores the field id, level 0, and facet string. | ||||
| /// | ||||
| /// It can only be used to encode the facet string of the level 0, | ||||
| /// as it hardcodes the level. | ||||
| /// | ||||
| /// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, | ||||
| /// and make sure that the levels are not mixed-up. The level 0 is special, the key | ||||
| /// are strings, other levels represent groups and keys are simply two integers. | ||||
| pub struct FacetStringLevelZeroCodec; | ||||
|  | ||||
| impl FacetStringLevelZeroCodec { | ||||
|     pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) { | ||||
|         out.reserve(value.len() + 2); | ||||
|         out.push(field_id); | ||||
|         out.push(0); // the level zero (for LMDB ordering only) | ||||
|         out.extend_from_slice(value.as_bytes()); | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { | ||||
|     type DItem = (FieldId, &'a str); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id, bytes) = bytes.split_first()?; | ||||
|         let (level, bytes) = bytes.split_first()?; | ||||
|  | ||||
|         if *level != 0 { | ||||
|             return None; | ||||
|         } | ||||
|  | ||||
|         let value = str::from_utf8(bytes).ok()?; | ||||
|         Some((*field_id, value)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { | ||||
|     type EItem = (FieldId, &'a str); | ||||
|  | ||||
|     fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut bytes = Vec::new(); | ||||
|         FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,80 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::{marker, str}; | ||||
|  | ||||
| /// A codec that encodes two strings in front of the value. | ||||
| /// | ||||
| /// The usecase is for the facet string levels algorithm where we must | ||||
| /// know the origin of a group, the group left and right bounds are stored | ||||
| /// in the value to not break the lexicographical ordering of the LMDB keys. | ||||
| pub struct FacetStringZeroBoundsValueCodec<C>(marker::PhantomData<C>); | ||||
|  | ||||
| impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec<C> | ||||
| where | ||||
|     C: heed::BytesDecode<'a>, | ||||
| { | ||||
|     type DItem = (Option<(&'a str, &'a str)>, C::DItem); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (contains_bounds, tail_bytes) = bytes.split_first()?; | ||||
|  | ||||
|         if *contains_bounds != 0 { | ||||
|             let (left_len, bytes) = try_split_at(bytes, 2)?; | ||||
|             let (right_len, bytes) = try_split_at(bytes, 2)?; | ||||
|  | ||||
|             let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; | ||||
|             let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; | ||||
|  | ||||
|             let (left, bytes) = try_split_at(bytes, left_len as usize)?; | ||||
|             let (right, bytes) = try_split_at(bytes, right_len as usize)?; | ||||
|  | ||||
|             let left = str::from_utf8(left).ok()?; | ||||
|             let right = str::from_utf8(right).ok()?; | ||||
|  | ||||
|             C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) | ||||
|         } else { | ||||
|             C::bytes_decode(tail_bytes).map(|item| (None, item)) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec<C> | ||||
| where | ||||
|     C: heed::BytesEncode<'a>, | ||||
| { | ||||
|     type EItem = (Option<(&'a str, &'a str)>, C::EItem); | ||||
|  | ||||
|     fn bytes_encode((bounds, value): &'a Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut bytes = Vec::new(); | ||||
|  | ||||
|         match bounds { | ||||
|             Some((left, right)) => { | ||||
|                 let left_len: u16 = left.len().try_into().ok()?; | ||||
|                 let right_len: u16 = right.len().try_into().ok()?; | ||||
|                 bytes.extend_from_slice(&left_len.to_be_bytes()); | ||||
|                 bytes.extend_from_slice(&right_len.to_be_bytes()); | ||||
|  | ||||
|                 let value_bytes = C::bytes_encode(&value)?; | ||||
|                 bytes.extend_from_slice(&value_bytes[..]); | ||||
|  | ||||
|                 Some(Cow::Owned(bytes)) | ||||
|             } | ||||
|             None => { | ||||
|                 bytes.push(0); | ||||
|                 let value_bytes = C::bytes_encode(&value)?; | ||||
|                 bytes.extend_from_slice(&value_bytes[..]); | ||||
|                 Some(Cow::Owned(bytes)) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Tries to split a slice in half at the given middle point, | ||||
| /// `None` if the slice is too short. | ||||
| fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { | ||||
|     if slice.len() >= mid { | ||||
|         Some(slice.split_at(mid)) | ||||
|     } else { | ||||
|         None | ||||
|     } | ||||
| } | ||||
| @@ -1,9 +1,15 @@ | ||||
| mod facet_level_value_f64_codec; | ||||
| mod facet_level_value_u32_codec; | ||||
| mod facet_string_level_zero_codec; | ||||
| mod facet_string_zero_bounds_value_codec; | ||||
| mod facet_value_string_codec; | ||||
| mod field_doc_id_facet_f64_codec; | ||||
| mod field_doc_id_facet_string_codec; | ||||
|  | ||||
| pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; | ||||
| pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; | ||||
| pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; | ||||
| pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; | ||||
| pub use self::facet_value_string_codec::FacetValueStringCodec; | ||||
| pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; | ||||
| pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; | ||||
|   | ||||
| @@ -31,7 +31,7 @@ | ||||
| //! | ||||
| //! ### Example of what a facet number LMDB database contain | ||||
| //! | ||||
| //! | level | left-bound | right-bound | docs             | | ||||
| //! | level | left-bound | right-bound | documents ids    | | ||||
| //! |-------|------------|-------------|------------------| | ||||
| //! | 0     | 0          | _skipped_   | 1, 2             | | ||||
| //! | 0     | 1          | _skipped_   | 6, 7             | | ||||
| @@ -48,7 +48,7 @@ | ||||
| //! The next levels have two different bounds and the associated documents ids are simply the result | ||||
| //! of a union of all the documents ids associated with the aggregated groups above. | ||||
| //! | ||||
| //! ## The complexity of defining groups of facet strings | ||||
| //! ## The complexity of defining groups for facet strings | ||||
| //! | ||||
| //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in | ||||
| //! lexicographical order, it means that whatever the key represent the bytes are read in their raw | ||||
| @@ -77,22 +77,25 @@ | ||||
| //! | ||||
| //! #### Example of facet strings with numbered groups | ||||
| //! | ||||
| //! | level | left-bound | right-bound | left-string | right-string | docs             | | ||||
| //! | level | left-bound | right-bound | left-string | right-string | documents ids    | | ||||
| //! |-------|------------|-------------|-------------|--------------|------------------| | ||||
| //! | 0     | alpha      | _skipped_   | _skipped_   | _skipped_    | 1, 2             | | ||||
| //! | 0     | beta       | _skipped_   | _skipped_   | _skipped_    | 6, 7             | | ||||
| //! | 0     | gamma      | _skipped_   | _skipped_   | _skipped_    | 4, 7             | | ||||
| //! | 0     | omega      | _skipped_   | _skipped_   | _skipped_    | 2, 3, 4          | | ||||
| //! | 1     | 0          | 1           | alpha       | beta         | 1, 2, 6, 7       | | ||||
| //! | 1     | 3          | 5           | gamma       | omega        | 2, 3, 4, 7       | | ||||
| //! | 2     | 0          | 5           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 | | ||||
| //! | 1     | 2          | 3           | gamma       | omega        | 2, 3, 4, 7       | | ||||
| //! | 2     | 0          | 3           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 | | ||||
| //! | ||||
| //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not | ||||
| //! need to store the facet string value two times. | ||||
| //! | ||||
| //! In the value, not in the key, you can see that we added two new values: | ||||
| //! the left-string and the right-string, which defines the original facet strings associated with | ||||
| //! the given group. | ||||
| //! The numbers in the left-bound and right-bound columns are incremental numbers representing the | ||||
| //! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering | ||||
| //! of the LMDB keys. | ||||
| //! | ||||
| //! In the value, not in the key, you can see that we added two new values: the left-string and the | ||||
| //! right-string, which define the original facet strings associated with the given group. | ||||
| //! | ||||
| //! We put those two strings inside of the value, this way we do not limit the maximum size of the | ||||
| //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big | ||||
| @@ -121,3 +124,124 @@ | ||||
| //! If the group doesn't contain one of our documents ids, we continue to the next group at this | ||||
| //! same level. | ||||
| //! | ||||
|  | ||||
| use std::num::NonZeroU8; | ||||
| use std::ops::Bound; | ||||
| use std::ops::Bound::{Excluded, Included}; | ||||
|  | ||||
| use heed::types::{ByteSlice, Str}; | ||||
| use heed::{Database, LazyDecode, RoRange}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// An iterator that is used to explore the facets level strings | ||||
| /// from the level 1 to infinity. | ||||
| /// | ||||
| /// It yields the level, group id that an entry covers, the optional group strings | ||||
| /// that it covers of the level 0 only if it is an entry from the level 1 and | ||||
| /// the roaring bitmap associated. | ||||
| pub struct FacetStringGroupRange<'t> { | ||||
|     iter: RoRange< | ||||
|         't, | ||||
|         FacetLevelValueU32Codec, | ||||
|         LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>, | ||||
|     >, | ||||
|     end: Bound<u32>, | ||||
| } | ||||
|  | ||||
| impl<'t> FacetStringGroupRange<'t> { | ||||
|     pub fn new( | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         db: Database< | ||||
|             FacetLevelValueU32Codec, | ||||
|             FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>, | ||||
|         >, | ||||
|         field_id: FieldId, | ||||
|         level: NonZeroU8, | ||||
|         left: Bound<u32>, | ||||
|         right: Bound<u32>, | ||||
|     ) -> heed::Result<FacetStringGroupRange<'t>> { | ||||
|         let left_bound = match left { | ||||
|             Included(left) => Included((field_id, level, left, u32::MIN)), | ||||
|             Excluded(left) => Excluded((field_id, level, left, u32::MIN)), | ||||
|             Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), | ||||
|         }; | ||||
|         let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); | ||||
|         let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; | ||||
|         Ok(FacetStringGroupRange { iter, end: right }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> Iterator for FacetStringGroupRange<'t> { | ||||
|     type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         match self.iter.next() { | ||||
|             Some(Ok(((_fid, level, left, right), docids))) => { | ||||
|                 let must_be_returned = match self.end { | ||||
|                     Included(end) => right <= end, | ||||
|                     Excluded(end) => right < end, | ||||
|                     Unbounded => true, | ||||
|                 }; | ||||
|                 if must_be_returned { | ||||
|                     match docids.decode() { | ||||
|                         Ok(docids) => Some(Ok(((level, left, right), docids))), | ||||
|                         Err(e) => Some(Err(e)), | ||||
|                     } | ||||
|                 } else { | ||||
|                     None | ||||
|                 } | ||||
|             } | ||||
|             Some(Err(e)) => Some(Err(e)), | ||||
|             None => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// An iterator that is used to explore the level 0 of the facets string database. | ||||
| /// | ||||
| /// It yields the facet string and the roaring bitmap associated with it. | ||||
| pub struct FacetStringLevelZeroRange<'t> { | ||||
|     iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, | ||||
| } | ||||
|  | ||||
| impl<'t> FacetStringLevelZeroRange<'t> { | ||||
|     pub fn new( | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         db: Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, | ||||
|         field_id: FieldId, | ||||
|         left: Bound<&str>, | ||||
|         right: Bound<&str>, | ||||
|     ) -> heed::Result<FacetStringLevelZeroRange<'t>> { | ||||
|         let left_bound = match left { | ||||
|             Included(left) => Included((field_id, left)), | ||||
|             Excluded(left) => Excluded((field_id, left)), | ||||
|             Unbounded => Included((field_id, "")), | ||||
|         }; | ||||
|  | ||||
|         let right_bound = match right { | ||||
|             Included(right) => Included((field_id, right)), | ||||
|             Excluded(right) => Excluded((field_id, right)), | ||||
|             Unbounded => Excluded((field_id + 1, "")), | ||||
|         }; | ||||
|  | ||||
|         db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> Iterator for FacetStringLevelZeroRange<'t> { | ||||
|     type Item = heed::Result<(&'t str, RoaringBitmap)>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         match self.iter.next() { | ||||
|             Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), | ||||
|             Some(Err(e)) => Some(Err(e)), | ||||
|             None => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user