mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-10-30 23:46:28 +00:00

Prepare refactor of facets database

committed by Loïc Lecrenier
parent 004c09a8e2
commit c3f49f766d

			| @@ -10,9 +10,7 @@ use time::OffsetDateTime; | ||||
|  | ||||
| use super::ClearDocuments; | ||||
| use crate::error::{InternalError, SerializationError, UserError}; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::index::{db_name, main_key}; | ||||
| use crate::{ | ||||
| @@ -442,11 +440,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|         } | ||||
|  | ||||
|         // We delete the documents ids that are under the facet field id values. | ||||
|         remove_docids_from_facet_field_id_docids( | ||||
|             self.wtxn, | ||||
|             facet_id_f64_docids, | ||||
|             &self.to_delete_docids, | ||||
|         )?; | ||||
|         // TODO: remove_docids_from_facet_field_id_docids( | ||||
|         //     self.wtxn, | ||||
|         //     facet_id_f64_docids, | ||||
|         //     &self.to_delete_docids, | ||||
|         // )?; | ||||
|         // We delete the documents ids that are under the facet field id values. | ||||
|         remove_docids_from_facet_field_id_docids( | ||||
|             self.wtxn, | ||||
| @@ -587,57 +585,57 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( | ||||
|     db: &heed::Database<C, D>, | ||||
|     to_remove: &RoaringBitmap, | ||||
| ) -> crate::Result<()> { | ||||
|     let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); | ||||
|     let mut iter = db.remap_types::<ByteSlice, ByteSlice>().iter_mut(wtxn)?; | ||||
|     while let Some(result) = iter.next() { | ||||
|         let (key, val) = result?; | ||||
|         match FacetLevelValueU32Codec::bytes_decode(key) { | ||||
|             Some(_) => { | ||||
|                 // If we are able to parse this key it means it is a facet string group | ||||
|                 // level key. We must then parse the value using the appropriate codec. | ||||
|                 let (group, mut docids) = | ||||
|                     FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val) | ||||
|                         .ok_or_else(|| SerializationError::Decoding { db_name })?; | ||||
|     // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); | ||||
|     // let mut iter = db.remap_types::<ByteSlice, ByteSlice>().iter_mut(wtxn)?; | ||||
|     // while let Some(result) = iter.next() { | ||||
|     //     let (key, val) = result?; | ||||
|     //     match FacetLevelValueU32Codec::bytes_decode(key) { | ||||
|     //         Some(_) => { | ||||
|     //             // If we are able to parse this key it means it is a facet string group | ||||
|     //             // level key. We must then parse the value using the appropriate codec. | ||||
|     //             let (group, mut docids) = | ||||
|     //                 FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_decode(val) | ||||
|     //                     .ok_or_else(|| SerializationError::Decoding { db_name })?; | ||||
|  | ||||
|                 let previous_len = docids.len(); | ||||
|                 docids -= to_remove; | ||||
|                 if docids.is_empty() { | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.del_current()? }; | ||||
|                 } else if docids.len() != previous_len { | ||||
|                     let key = key.to_owned(); | ||||
|                     let val = &(group, docids); | ||||
|                     let value_bytes = | ||||
|                         FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val) | ||||
|                             .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|     //             let previous_len = docids.len(); | ||||
|     //             docids -= to_remove; | ||||
|     //             if docids.is_empty() { | ||||
|     //                 // safety: we don't keep references from inside the LMDB database. | ||||
|     //                 unsafe { iter.del_current()? }; | ||||
|     //             } else if docids.len() != previous_len { | ||||
|     //                 let key = key.to_owned(); | ||||
|     //                 let val = &(group, docids); | ||||
|     //                 let value_bytes = | ||||
|     //                     FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_encode(val) | ||||
|     //                         .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|  | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.put_current(&key, &value_bytes)? }; | ||||
|                 } | ||||
|             } | ||||
|             None => { | ||||
|                 // The key corresponds to a level zero facet string. | ||||
|                 let (original_value, mut docids) = | ||||
|                     FacetStringLevelZeroValueCodec::bytes_decode(val) | ||||
|                         .ok_or_else(|| SerializationError::Decoding { db_name })?; | ||||
|     //                 // safety: we don't keep references from inside the LMDB database. | ||||
|     //                 unsafe { iter.put_current(&key, &value_bytes)? }; | ||||
|     //             } | ||||
|     //         } | ||||
|     //         None => { | ||||
|     //             // The key corresponds to a level zero facet string. | ||||
|     //             let (original_value, mut docids) = | ||||
|     //                 FacetStringLevelZeroValueCodec::bytes_decode(val) | ||||
|     //                     .ok_or_else(|| SerializationError::Decoding { db_name })?; | ||||
|  | ||||
|                 let previous_len = docids.len(); | ||||
|                 docids -= to_remove; | ||||
|                 if docids.is_empty() { | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.del_current()? }; | ||||
|                 } else if docids.len() != previous_len { | ||||
|                     let key = key.to_owned(); | ||||
|                     let val = &(original_value, docids); | ||||
|                     let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) | ||||
|                         .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|     //             let previous_len = docids.len(); | ||||
|     //             docids -= to_remove; | ||||
|     //             if docids.is_empty() { | ||||
|     //                 // safety: we don't keep references from inside the LMDB database. | ||||
|     //                 unsafe { iter.del_current()? }; | ||||
|     //             } else if docids.len() != previous_len { | ||||
|     //                 let key = key.to_owned(); | ||||
|     //                 let val = &(original_value, docids); | ||||
|     //                 let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) | ||||
|     //                     .ok_or_else(|| SerializationError::Encoding { db_name })?; | ||||
|  | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.put_current(&key, &value_bytes)? }; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     //                 // safety: we don't keep references from inside the LMDB database. | ||||
|     //                 unsafe { iter.put_current(&key, &value_bytes)? }; | ||||
|     //             } | ||||
|     //         } | ||||
|     //     } | ||||
|     // } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
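
Note on the change above: the string-facet branch of the deletion code is commented out because it relies on the codecs this refactor removes (FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec). The operation itself does not change: walk the facet entries, subtract the deleted document ids from each stored bitmap, drop entries that become empty, and rewrite the ones that only shrank. Below is a minimal, self-contained sketch of that pattern, using a BTreeMap as a stand-in for the LMDB database and a simplified version of the FacetKey type named elsewhere in this commit; the real code does the same thing through an LMDB write cursor (iter_mut / del_current / put_current), as the commented-out block shows.

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

/// Simplified stand-in for the new `FacetKey` introduced by this refactor.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct FacetKey {
    field_id: u16,
    level: u8,
    left_bound: String,
}

/// Remove `to_delete` from every bitmap; drop entries that become empty.
fn remove_docids(db: &mut BTreeMap<FacetKey, RoaringBitmap>, to_delete: &RoaringBitmap) {
    db.retain(|_key, docids| {
        *docids -= to_delete;
        !docids.is_empty()
    });
}

fn main() {
    let mut db = BTreeMap::new();
    db.insert(
        FacetKey { field_id: 0, level: 0, left_bound: "blue".into() },
        RoaringBitmap::from_iter([1u32, 2, 3]),
    );
    db.insert(
        FacetKey { field_id: 0, level: 0, left_bound: "red".into() },
        RoaringBitmap::from_iter([2u32]),
    );

    let to_delete = RoaringBitmap::from_iter([2u32, 3]);
    remove_docids(&mut db, &to_delete);

    assert_eq!(db.len(), 1); // "red" became empty and was dropped
    assert!(db.values().next().unwrap().contains(1)); // "blue" kept document 1
}
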
| @@ -136,11 +136,12 @@ use roaring::RoaringBitmap; | ||||
| use time::OffsetDateTime; | ||||
|  | ||||
| use crate::error::InternalError; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, | ||||
|     FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, | ||||
| use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; | ||||
| use crate::heed_codec::facet::new::str_ref::StrRefCodec; | ||||
| use crate::heed_codec::facet::new::{ | ||||
|     FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, | ||||
| }; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| // use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; | ||||
| use crate::{FieldId, Index, Result}; | ||||
|  | ||||
| @@ -187,16 +188,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|  | ||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||
|  | ||||
|         let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); | ||||
|  | ||||
|         for field_id in faceted_fields { | ||||
|             // Clear the facet string levels. | ||||
|             clear_field_string_levels( | ||||
|                 self.wtxn, | ||||
|                 self.index.facet_id_string_docids.remap_types::<ByteSlice, DecodeIgnore>(), | ||||
|                 field_id, | ||||
|             )?; | ||||
|             // clear_field_string_levels( | ||||
|             //     &mut nested_wtxn, | ||||
|             //     self.index.facet_id_string_docids.remap_types::<ByteSlice, DecodeIgnore>(), | ||||
|             //     field_id, | ||||
|             // )?; | ||||
|  | ||||
|             let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( | ||||
|                 self.wtxn, | ||||
|                 &mut nested_wtxn, | ||||
|                 self.index.facet_id_string_docids, | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
| @@ -206,13 +209,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|             )?; | ||||
|  | ||||
|             self.index.put_string_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 &mut nested_wtxn, | ||||
|                 field_id, | ||||
|                 &string_documents_ids, | ||||
|             )?; | ||||
|             for facet_strings_level in facet_string_levels { | ||||
|                 write_into_lmdb_database( | ||||
|                     self.wtxn, | ||||
|                     &mut nested_wtxn, | ||||
|                     *self.index.facet_id_string_docids.as_polymorph(), | ||||
|                     facet_strings_level, | ||||
|                     |_, _| { | ||||
| @@ -221,11 +224,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             // Clear the facet number levels. | ||||
|             clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; | ||||
|             // // Clear the facet number levels. | ||||
|             // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; | ||||
|  | ||||
|             let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( | ||||
|                 self.wtxn, | ||||
|                 &mut nested_wtxn, | ||||
|                 self.index.facet_id_f64_docids, | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
| @@ -235,14 +238,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|             )?; | ||||
|  | ||||
|             self.index.put_number_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 &mut nested_wtxn, | ||||
|                 field_id, | ||||
|                 &number_documents_ids, | ||||
|             )?; | ||||
|  | ||||
|             for facet_number_level in facet_number_levels { | ||||
|                 write_into_lmdb_database( | ||||
|                     self.wtxn, | ||||
|                     &mut nested_wtxn, | ||||
|                     *self.index.facet_id_f64_docids.as_polymorph(), | ||||
|                     facet_number_level, | ||||
|                     |_, _| { | ||||
| @@ -263,8 +266,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
| /// that must be inserted into the database. | ||||
| /// 2. a roaring bitmap of all the document ids present in the database | ||||
| fn compute_facet_number_levels<'t>( | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||
|     rtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     level_group_size: NonZeroUsize, | ||||
| @@ -277,7 +280,7 @@ fn compute_facet_number_levels<'t>( | ||||
|         .remap_types::<DecodeIgnore, DecodeIgnore>() | ||||
|         .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; | ||||
|  | ||||
|     let level_0_start = (field_id, 0, f64::MIN, f64::MIN); | ||||
|     let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; | ||||
|  | ||||
|     // Groups sizes are always a power of the original level_group_size and therefore a group | ||||
|     // always maps groups of the previous level and never splits previous levels groups in half. | ||||
| @@ -289,37 +292,31 @@ fn compute_facet_number_levels<'t>( | ||||
|     let mut number_document_ids = RoaringBitmap::new(); | ||||
|  | ||||
|     if let Some((top_level, _)) = group_size_iter.last() { | ||||
|         let subwriters = | ||||
|             recursive_compute_levels::<FacetLevelValueF64Codec, CboRoaringBitmapCodec, f64>( | ||||
|                 rtxn, | ||||
|                 db, | ||||
|                 compression_type, | ||||
|                 compression_level, | ||||
|                 *top_level, | ||||
|                 level_0_start, | ||||
|                 &(level_0_start..), | ||||
|                 first_level_size, | ||||
|                 level_group_size, | ||||
|                 &mut |bitmaps, _, _| { | ||||
|                     for bitmap in bitmaps { | ||||
|                         number_document_ids |= bitmap; | ||||
|                     } | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|                 &|_i, (_field_id, _level, left, _right)| *left, | ||||
|                 &|bitmap| bitmap, | ||||
|                 &|writer, level, left, right, docids| { | ||||
|                     write_number_entry(writer, field_id, level.get(), left, right, &docids)?; | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|             )?; | ||||
|         let subwriters = recursive_compute_levels::<OrderedF64Codec>( | ||||
|             rtxn, | ||||
|             db, | ||||
|             compression_type, | ||||
|             compression_level, | ||||
|             field_id, | ||||
|             *top_level, | ||||
|             level_0_start, | ||||
|             &(level_0_start..), | ||||
|             first_level_size, | ||||
|             level_group_size, | ||||
|             &mut |bitmaps, _| { | ||||
|                 for bitmap in bitmaps { | ||||
|                     number_document_ids |= bitmap; | ||||
|                 } | ||||
|                 Ok(()) | ||||
|             }, | ||||
|         )?; | ||||
|  | ||||
|         Ok((subwriters, number_document_ids)) | ||||
|     } else { | ||||
|         let mut documents_ids = RoaringBitmap::new(); | ||||
|         for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { | ||||
|             let (_key, docids) = result?; | ||||
|             documents_ids |= docids; | ||||
|             let (_key, group_value) = result?; | ||||
|             documents_ids |= group_value.bitmap; | ||||
|         } | ||||
|  | ||||
|         Ok((vec![], documents_ids)) | ||||
| @@ -333,8 +330,8 @@ fn compute_facet_number_levels<'t>( | ||||
| /// that must be inserted into the database. | ||||
| /// 2. a roaring bitmap of all the document ids present in the database | ||||
| fn compute_facet_strings_levels<'t>( | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, | ||||
|     rtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<FacetKeyCodec<StrRefCodec>, FacetGroupValueCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     level_group_size: NonZeroUsize, | ||||
| @@ -347,7 +344,7 @@ fn compute_facet_strings_levels<'t>( | ||||
|         .remap_types::<DecodeIgnore, DecodeIgnore>() | ||||
|         .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; | ||||
|  | ||||
|     let level_0_start = (field_id, ""); | ||||
|     let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; | ||||
|  | ||||
|     // Groups sizes are always a power of the original level_group_size and therefore a group | ||||
|     // always maps groups of the previous level and never splits previous levels groups in half. | ||||
| @@ -359,40 +356,31 @@ fn compute_facet_strings_levels<'t>( | ||||
|     let mut strings_document_ids = RoaringBitmap::new(); | ||||
|  | ||||
|     if let Some((top_level, _)) = group_size_iter.last() { | ||||
|         let subwriters = recursive_compute_levels::< | ||||
|             FacetStringLevelZeroCodec, | ||||
|             FacetStringLevelZeroValueCodec, | ||||
|             (u32, &str), | ||||
|         >( | ||||
|         let subwriters = recursive_compute_levels::<StrRefCodec>( | ||||
|             rtxn, | ||||
|             db, | ||||
|             compression_type, | ||||
|             compression_level, | ||||
|             field_id, | ||||
|             *top_level, | ||||
|             level_0_start, | ||||
|             &(level_0_start..), | ||||
|             first_level_size, | ||||
|             level_group_size, | ||||
|             &mut |bitmaps, _, _| { | ||||
|             &mut |bitmaps, _| { | ||||
|                 for bitmap in bitmaps { | ||||
|                     strings_document_ids |= bitmap; | ||||
|                 } | ||||
|                 Ok(()) | ||||
|             }, | ||||
|             &|i, (_field_id, value)| (i as u32, *value), | ||||
|             &|value| value.1, | ||||
|             &|writer, level, start_bound, end_bound, docids| { | ||||
|                 write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; | ||||
|                 Ok(()) | ||||
|             }, | ||||
|         )?; | ||||
|  | ||||
|         Ok((subwriters, strings_document_ids)) | ||||
|     } else { | ||||
|         let mut documents_ids = RoaringBitmap::new(); | ||||
|         for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { | ||||
|             let (_key, (_original_value, docids)) = result?; | ||||
|             documents_ids |= docids; | ||||
|             let (_key, group_value) = result?; | ||||
|             documents_ids |= group_value.bitmap; | ||||
|         } | ||||
|  | ||||
|         Ok((vec![], documents_ids)) | ||||
| @@ -436,29 +424,26 @@ from the level below were read/created. Its arguments are: | ||||
| A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` | ||||
| that must be inserted into the database. | ||||
| */ | ||||
| fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     db: heed::Database<KeyCodec, ValueCodec>, | ||||
| fn recursive_compute_levels<'t, BoundCodec>( | ||||
|     rtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<FacetKeyCodec<BoundCodec>, FacetGroupValueCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     field_id: FieldId, | ||||
|     level: u8, | ||||
|     level_0_start: <KeyCodec as BytesDecode<'t>>::DItem, | ||||
|     level_0_range: &'t RangeFrom<<KeyCodec as BytesDecode<'t>>::DItem>, | ||||
|     level_0_start: FacetKey<<BoundCodec as BytesEncode<'t>>::EItem>, | ||||
|     level_0_range: &'t RangeFrom<FacetKey<<BoundCodec as BytesEncode<'t>>::EItem>>, | ||||
|     level_0_size: usize, | ||||
|     level_group_size: NonZeroUsize, | ||||
|     computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, | ||||
|     bound_from_db_key: &dyn for<'a> Fn(usize, &'a <KeyCodec as BytesDecode<'t>>::DItem) -> Bound, | ||||
|     bitmap_from_db_value: &dyn Fn(<ValueCodec as BytesDecode<'t>>::DItem) -> RoaringBitmap, | ||||
|     write_entry: &dyn Fn(&mut Writer<File>, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, | ||||
|     computed_group_bitmap: &mut dyn FnMut( | ||||
|         &[RoaringBitmap], | ||||
|         <BoundCodec as BytesEncode<'t>>::EItem, | ||||
|     ) -> Result<()>, | ||||
| ) -> Result<Vec<Reader<File>>> | ||||
| where | ||||
|     KeyCodec: for<'a> BytesEncode<'a> | ||||
|         + for<'a> BytesDecode<'a, DItem = <KeyCodec as BytesEncode<'a>>::EItem>, | ||||
|     for<'a> <KeyCodec as BytesEncode<'a>>::EItem: Sized, | ||||
|     ValueCodec: for<'a> BytesEncode<'a> | ||||
|         + for<'a> BytesDecode<'a, DItem = <ValueCodec as BytesEncode<'a>>::EItem>, | ||||
|     for<'a> <ValueCodec as BytesEncode<'a>>::EItem: Sized, | ||||
|     Bound: Copy, | ||||
|     for<'a> BoundCodec: | ||||
|         BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>, | ||||
|     for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Copy + Sized, | ||||
| { | ||||
|     if level == 0 { | ||||
|         // base case for the recursion | ||||
| @@ -468,31 +453,32 @@ where | ||||
|         // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read | ||||
|         let mut bitmaps = vec![]; | ||||
|  | ||||
|         let mut start_bound = bound_from_db_key(0, &level_0_start); | ||||
|         let mut end_bound = bound_from_db_key(0, &level_0_start); | ||||
|         let mut start_bound = level_0_start.left_bound; | ||||
|         // let mut end_bound = level_0_start.bound; | ||||
|  | ||||
|         let mut first_iteration_for_new_group = true; | ||||
|         for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { | ||||
|             let (key, value) = db_result_item?; | ||||
|  | ||||
|             let bound = bound_from_db_key(i, &key); | ||||
|             let docids = bitmap_from_db_value(value); | ||||
|             let bound = key.left_bound; | ||||
|             let docids = value.bitmap; | ||||
|  | ||||
|             if first_iteration_for_new_group { | ||||
|                 start_bound = bound; | ||||
|                 first_iteration_for_new_group = false; | ||||
|             } | ||||
|             end_bound = bound; | ||||
|             // end_bound = bound; | ||||
|             bitmaps.push(docids); | ||||
|  | ||||
|             if bitmaps.len() == level_group_size.get() { | ||||
|                 computed_group_bitmap(&bitmaps, start_bound, end_bound)?; | ||||
|                 computed_group_bitmap(&bitmaps, start_bound)?; | ||||
|                 first_iteration_for_new_group = true; | ||||
|                 bitmaps.clear(); | ||||
|             } | ||||
|         } | ||||
|         // don't forget to give the leftover bitmaps as well | ||||
|         if !bitmaps.is_empty() { | ||||
|             computed_group_bitmap(&bitmaps, start_bound, end_bound)?; | ||||
|             computed_group_bitmap(&bitmaps, start_bound)?; | ||||
|             bitmaps.clear(); | ||||
|         } | ||||
|         // level 0 is already stored in the DB | ||||
| @@ -516,48 +502,52 @@ where | ||||
|             db, | ||||
|             compression_type, | ||||
|             compression_level, | ||||
|             field_id, | ||||
|             level - 1, | ||||
|             level_0_start, | ||||
|             level_0_range, | ||||
|             level_0_size, | ||||
|             level_group_size, | ||||
|             &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { | ||||
|             &mut |sub_bitmaps: &[RoaringBitmap], | ||||
|                   start_range: <BoundCodec as BytesEncode<'t>>::EItem| { | ||||
|                 let mut combined_bitmap = RoaringBitmap::default(); | ||||
|                 for bitmap in sub_bitmaps { | ||||
|                     combined_bitmap |= bitmap; | ||||
|                 } | ||||
|                 range_for_bitmaps.push((start_range, end_range)); | ||||
|                 range_for_bitmaps.push(start_range); | ||||
|  | ||||
|                 bitmaps.push(combined_bitmap); | ||||
|                 if bitmaps.len() == level_group_size.get() { | ||||
|                     let start_bound = range_for_bitmaps.first().unwrap().0; | ||||
|                     let end_bound = range_for_bitmaps.last().unwrap().1; | ||||
|                     computed_group_bitmap(&bitmaps, start_bound, end_bound)?; | ||||
|                     for (bitmap, (start_bound, end_bound)) in | ||||
|                         bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) | ||||
|                     let start_bound = range_for_bitmaps.first().unwrap(); | ||||
|                     computed_group_bitmap(&bitmaps, *start_bound)?; | ||||
|                     for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) | ||||
|                     { | ||||
|                         write_entry( | ||||
|                         write_entry::<BoundCodec>( | ||||
|                             &mut cur_writer, | ||||
|                             field_id, | ||||
|                             NonZeroU8::new(level).unwrap(), | ||||
|                             start_bound, | ||||
|                             end_bound, | ||||
|                             bitmap, | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|                 Ok(()) | ||||
|             }, | ||||
|             bound_from_db_key, | ||||
|             bitmap_from_db_value, | ||||
|             write_entry, | ||||
|         )?; | ||||
|  | ||||
|         // don't forget to insert the leftover elements into the writer as well | ||||
|         if !bitmaps.is_empty() { | ||||
|             let start_range = range_for_bitmaps.first().unwrap().0; | ||||
|             let end_range = range_for_bitmaps.last().unwrap().1; | ||||
|             computed_group_bitmap(&bitmaps, start_range, end_range)?; | ||||
|             for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { | ||||
|                 write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; | ||||
|             let start_range = range_for_bitmaps.first().unwrap(); | ||||
|             let end_range = range_for_bitmaps.last().unwrap(); | ||||
|             computed_group_bitmap(&bitmaps, *start_range)?; | ||||
|             for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { | ||||
|                 write_entry( | ||||
|                     &mut cur_writer, | ||||
|                     field_id, | ||||
|                     NonZeroU8::new(level).unwrap(), | ||||
|                     bound, | ||||
|                     bitmap, | ||||
|                 )?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
| @@ -566,60 +556,25 @@ where | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn clear_field_number_levels<'t>( | ||||
|     wtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, | ||||
|     field_id: FieldId, | ||||
| ) -> heed::Result<()> { | ||||
|     let left = (field_id, 1, f64::MIN, f64::MIN); | ||||
|     let right = (field_id, u8::MAX, f64::MAX, f64::MAX); | ||||
|     let range = left..=right; | ||||
|     db.delete_range(wtxn, &range).map(drop) | ||||
| } | ||||
|  | ||||
| fn clear_field_string_levels<'t>( | ||||
|     wtxn: &'t mut heed::RwTxn, | ||||
|     db: heed::Database<ByteSlice, DecodeIgnore>, | ||||
|     field_id: FieldId, | ||||
| ) -> heed::Result<()> { | ||||
|     let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); | ||||
|     let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); | ||||
|     let range = left..=right; | ||||
|     db.remap_key_type::<FacetLevelValueU32Codec>().delete_range(wtxn, &range).map(drop) | ||||
| } | ||||
|  | ||||
| fn write_number_entry( | ||||
|     writer: &mut Writer<File>, | ||||
|     field_id: FieldId, | ||||
|     level: u8, | ||||
|     left: f64, | ||||
|     right: f64, | ||||
|     ids: &RoaringBitmap, | ||||
| ) -> Result<()> { | ||||
|     let key = (field_id, level, left, right); | ||||
|     let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||
|     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; | ||||
|     writer.insert(&key, &data)?; | ||||
|     Ok(()) | ||||
| } | ||||
| fn write_string_entry( | ||||
| fn write_entry<BoundCodec>( | ||||
|     writer: &mut Writer<File>, | ||||
|     field_id: FieldId, | ||||
|     level: NonZeroU8, | ||||
|     (left_id, left_value): (u32, &str), | ||||
|     (right_id, right_value): (u32, &str), | ||||
|     bound: <BoundCodec as BytesEncode<'_>>::EItem, | ||||
|     docids: RoaringBitmap, | ||||
| ) -> Result<()> { | ||||
|     let key = (field_id, level, left_id, right_id); | ||||
|     let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||
|     let data = match level.get() { | ||||
|         1 => (Some((left_value, right_value)), docids), | ||||
|         _ => (None, docids), | ||||
|     }; | ||||
|     let data = FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data) | ||||
|         .ok_or(Error::Encoding)?; | ||||
|     writer.insert(&key, &data)?; | ||||
|     Ok(()) | ||||
| ) -> Result<()> | ||||
| where | ||||
|     for<'a> BoundCodec: BytesEncode<'a>, | ||||
|     for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Copy + Sized, | ||||
| { | ||||
|     todo!() | ||||
|     // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; | ||||
|     // let key_bytes = FacetKeyCodec::<BoundCodec>::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||
|     // let value_bytes = | ||||
|     //     FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) | ||||
|     //         .ok_or(Error::Encoding)?; | ||||
|     // writer.insert(&key_bytes, &value_bytes)?; | ||||
|     // Ok(()) | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
|   | ||||
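
The bulk of this file's changes replace the per-type key tuples with the single FacetKey { field_id, level, left_bound } key and FacetGroupValue { size, bitmap } value, so group entries now carry only their left bound instead of a (left, right) pair, and recursive_compute_levels becomes generic over one BoundCodec. The level-building idea itself is unchanged: level n + 1 is obtained by chunking level n into groups of level_group_size, unioning the bitmaps of each group, and keeping the left bound of the group's first element. Presumably the right bound could be dropped because groups on a level are contiguous, so a group's right edge is implied by the left bound of the next group. A runnable sketch of that step, with plain vectors standing in for the LMDB database and the grenad writers (the structs mirror the names in the diff, but only with the fields the diff shows):

use roaring::RoaringBitmap;

/// Simplified mirrors of the types named in this diff.
#[derive(Clone, Debug)]
struct FacetKey<Bound> {
    field_id: u16,
    level: u8,
    left_bound: Bound,
}

#[derive(Clone, Debug)]
struct FacetGroupValue {
    size: u8,
    bitmap: RoaringBitmap,
}

/// Build level `n + 1` from the entries of level `n`: every `group_size`
/// consecutive entries become one parent entry that keeps the left bound of
/// its first child and the union of the children's bitmaps.
fn build_next_level<Bound: Clone>(
    entries: &[(FacetKey<Bound>, FacetGroupValue)],
    group_size: usize,
) -> Vec<(FacetKey<Bound>, FacetGroupValue)> {
    entries
        .chunks(group_size)
        .map(|group| {
            let first = &group[0].0;
            let mut bitmap = RoaringBitmap::new();
            for (_, value) in group {
                bitmap |= &value.bitmap;
            }
            let key = FacetKey {
                field_id: first.field_id,
                level: first.level + 1,
                left_bound: first.left_bound.clone(),
            };
            (key, FacetGroupValue { size: group.len() as u8, bitmap })
        })
        .collect()
}

fn main() {
    // Level 0: one entry per distinct facet value of field 3, here f64 bounds.
    let level_0: Vec<_> = (0..10u32)
        .map(|i| {
            let key = FacetKey { field_id: 3, level: 0, left_bound: i as f64 };
            let value = FacetGroupValue { size: 1, bitmap: RoaringBitmap::from_iter([i]) };
            (key, value)
        })
        .collect();

    let level_1 = build_next_level(&level_0, 4);
    assert_eq!(level_1.len(), 3); // groups of 4, 4 and 2 entries
    assert_eq!(level_1[0].1.size, 4);
    assert_eq!(level_1[0].1.bitmap.len(), 4);
}
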
| @@ -6,7 +6,7 @@ use heed::{BytesDecode, BytesEncode}; | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, | ||||
| }; | ||||
| use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; | ||||
| use crate::heed_codec::facet::FieldDocIdFacetF64Codec; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Extracts the facet number and the documents ids where this facet number appear. | ||||
| @@ -31,13 +31,14 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut cursor = docid_fid_facet_number.into_cursor()?; | ||||
|     while let Some((key_bytes, _)) = cursor.move_on_next()? { | ||||
|         let (field_id, document_id, number) = | ||||
|             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); | ||||
|         todo!() | ||||
|         // let (field_id, document_id, number) = | ||||
|         //     FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); | ||||
|  | ||||
|         let key = (field_id, 0, number, number); | ||||
|         let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); | ||||
|         // let key = (field_id, 0, number, number); | ||||
|         // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); | ||||
|  | ||||
|         facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; | ||||
|         // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_number_docids_sorter, indexer) | ||||
|   | ||||
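
The number extraction now has a todo!() where the level-0 key used to be built as (field_id, 0, number, number); with the new layout it will presumably become a FacetKey { field_id, level: 0, left_bound: number } encoded with FacetKeyCodec<OrderedF64Codec>. For the range scans in facets.rs to work, the f64 bound has to serialize to bytes whose lexicographic order matches numeric order, which is presumably what OrderedF64Codec provides, with the field id and level prefixed so each (field, level) slice of the keyspace stays contiguous. A small, runnable illustration of one standard way to get the order-preserving property (not necessarily how the actual codec is implemented):

/// One common order-preserving encoding for f64 keys: flip the sign bit for
/// non-negative numbers and all bits for negative ones, then store big-endian.
/// This only illustrates the idea behind an `OrderedF64Codec`-style codec.
fn f64_to_sortable_bytes(x: f64) -> [u8; 8] {
    let bits = x.to_bits();
    let sortable = if bits & (1u64 << 63) == 0 {
        bits ^ (1u64 << 63) // non-negative: flip the sign bit
    } else {
        !bits // negative: flip everything
    };
    sortable.to_be_bytes()
}

fn main() {
    let mut values = vec![3.5_f64, -10.0, 0.0, -0.5, 42.0];
    let mut by_bytes = values.clone();
    by_bytes.sort_by_key(|v| f64_to_sortable_bytes(*v));
    values.sort_by(|a, b| a.partial_cmp(b).unwrap());
    assert_eq!(by_bytes, values); // byte order agrees with numeric order
}
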
| @@ -4,11 +4,9 @@ use std::{io, str}; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
| }; | ||||
| use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; | ||||
| use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | ||||
| use crate::update::index_documents::merge_cbo_roaring_bitmaps; | ||||
| // use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extracts the facet string and the documents ids where this facet string appear. | ||||
| @@ -24,7 +22,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut facet_string_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Stable, | ||||
|         keep_first_prefix_value_merge_roaring_bitmaps, | ||||
|         merge_cbo_roaring_bitmaps, // TODO: check | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
| @@ -42,14 +40,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|         let original_value = str::from_utf8(original_value_bytes)?; | ||||
|  | ||||
|         key_buffer.clear(); | ||||
|         FacetStringLevelZeroCodec::serialize_into( | ||||
|             field_id, | ||||
|             str::from_utf8(normalized_value_bytes)?, | ||||
|             &mut key_buffer, | ||||
|         ); | ||||
|         // TODO | ||||
|         // FacetStringLevelZeroCodec::serialize_into( | ||||
|         //     field_id, | ||||
|         //     str::from_utf8(normalized_value_bytes)?, | ||||
|         //     &mut key_buffer, | ||||
|         // ); | ||||
|  | ||||
|         value_buffer.clear(); | ||||
|         encode_prefix_string(original_value, &mut value_buffer)?; | ||||
|         // TODO | ||||
|         // encode_prefix_string(original_value, &mut value_buffer)?; | ||||
|         let bitmap = RoaringBitmap::from_iter(Some(document_id)); | ||||
|         bitmap.serialize_into(&mut value_buffer)?; | ||||
|  | ||||
|   | ||||
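
With the prefix-string value encoding gone, the value written for each (facet string, document) pair is just a serialized bitmap containing the document id, so the dedicated keep_first_prefix_value_merge_roaring_bitmaps merger is no longer needed and the chunk can be merged with an ordinary bitmap union (the TODO notes that the exact codec, cbo or not, still has to be checked). A sketch of what a plain union merge looks like in the MergeFn shape used by these helpers, assuming plain roaring serialization rather than the Cbo variant:

use std::borrow::Cow;
use std::io;
use roaring::RoaringBitmap;

/// Union all serialized bitmaps sharing the same key. This mirrors the shape
/// of milli's merge functions (`fn(&[u8], &[Cow<[u8]>]) -> Result<Cow<[u8]>>`);
/// the real code goes through a bitmap codec, this sketch uses the plain
/// roaring serialization for simplicity.
fn merge_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> io::Result<Cow<'a, [u8]>> {
    if values.len() == 1 {
        return Ok(values[0].clone());
    }
    let mut merged = RoaringBitmap::new();
    for value in values {
        merged |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    let mut buffer = Vec::with_capacity(merged.serialized_size());
    merged.serialize_into(&mut buffer)?;
    Ok(Cow::Owned(buffer))
}

fn main() -> io::Result<()> {
    let mut a = Vec::new();
    RoaringBitmap::from_iter([1u32, 2]).serialize_into(&mut a)?;
    let mut b = Vec::new();
    RoaringBitmap::from_iter([2u32, 3]).serialize_into(&mut b)?;

    let merged = merge_bitmaps(b"color blue", &[Cow::Owned(a), Cow::Owned(b)])?;
    let bitmap = RoaringBitmap::deserialize_from(&merged[..])?;
    assert_eq!(bitmap.iter().collect::<Vec<_>>(), vec![1, 2, 3]);
    Ok(())
}
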
| @@ -25,8 +25,8 @@ use self::extract_word_docids::extract_word_docids; | ||||
| use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | ||||
| use self::extract_word_position_docids::extract_word_position_docids; | ||||
| use super::helpers::{ | ||||
|     as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, | ||||
|     merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, | ||||
|     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, | ||||
|     GrenadParameters, MergeFn, MergeableReader, | ||||
| }; | ||||
| use super::{helpers, TypedChunk}; | ||||
| use crate::{FieldId, Result}; | ||||
| @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_facet_string_docids, | ||||
|         keep_first_prefix_value_merge_roaring_bitmaps, | ||||
|         merge_roaring_bitmaps, // TODO: check (cbo?) | ||||
|         TypedChunk::FieldIdFacetStringDocids, | ||||
|         "field-id-facet-string-docids", | ||||
|     ); | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use std::result::Result as StdResult; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::read_u32_ne_bytes; | ||||
| use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| // use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::Result; | ||||
|  | ||||
| @@ -49,32 +49,32 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     if values.len() == 1 { | ||||
|         Ok(values[0].clone()) | ||||
|     } else { | ||||
|         let original = decode_prefix_string(&values[0]).unwrap().0; | ||||
|         let merged_bitmaps = values | ||||
|             .iter() | ||||
|             .map(AsRef::as_ref) | ||||
|             .map(decode_prefix_string) | ||||
|             .map(Option::unwrap) | ||||
|             .map(|(_, bitmap_bytes)| bitmap_bytes) | ||||
|             .map(RoaringBitmap::deserialize_from) | ||||
|             .map(StdResult::unwrap) | ||||
|             .reduce(|a, b| a | b) | ||||
|             .unwrap(); | ||||
| // pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( | ||||
| //     _key: &[u8], | ||||
| //     values: &[Cow<'a, [u8]>], | ||||
| // ) -> Result<Cow<'a, [u8]>> { | ||||
| //     if values.len() == 1 { | ||||
| //         Ok(values[0].clone()) | ||||
| //     } else { | ||||
| //         let original = decode_prefix_string(&values[0]).unwrap().0; | ||||
| //         let merged_bitmaps = values | ||||
| //             .iter() | ||||
| //             .map(AsRef::as_ref) | ||||
| //             .map(decode_prefix_string) | ||||
| //             .map(Option::unwrap) | ||||
| //             .map(|(_, bitmap_bytes)| bitmap_bytes) | ||||
| //             .map(RoaringBitmap::deserialize_from) | ||||
| //             .map(StdResult::unwrap) | ||||
| //             .reduce(|a, b| a | b) | ||||
| //             .unwrap(); | ||||
|  | ||||
|         let cap = std::mem::size_of::<u16>() + original.len() + merged_bitmaps.serialized_size(); | ||||
|         let mut buffer = Vec::with_capacity(cap); | ||||
|         encode_prefix_string(original, &mut buffer)?; | ||||
|         merged_bitmaps.serialize_into(&mut buffer)?; | ||||
|         Ok(Cow::Owned(buffer)) | ||||
|     } | ||||
| } | ||||
| //         let cap = std::mem::size_of::<u16>() + original.len() + merged_bitmaps.serialized_size(); | ||||
| //         let mut buffer = Vec::with_capacity(cap); | ||||
| //         encode_prefix_string(original, &mut buffer)?; | ||||
| //         merged_bitmaps.serialize_into(&mut buffer)?; | ||||
| //         Ok(Cow::Owned(buffer)) | ||||
| //     } | ||||
| // } | ||||
|  | ||||
| pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { | ||||
|     Ok(values[0].clone()) | ||||
|   | ||||
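
For reference, the value layout that made the removed merger necessary can be inferred from its capacity computation (size_of::<u16>() + original.len() + serialized bitmap size): a u16 length prefix, the original string, then the bitmap. That is why merging level-0 string values had to decode the prefix, keep the first original string, and re-encode. A hedged sketch of that layout; the real encode_prefix_string / decode_prefix_string may differ in details such as endianness.

use std::io::{self, Write};
use roaring::RoaringBitmap;

/// Assumed layout of the old level-0 string value, inferred from the removed
/// merge function: u16 length of the original string, the string bytes, then
/// the serialized bitmap.
fn encode_value(original: &str, docids: &RoaringBitmap, out: &mut Vec<u8>) -> io::Result<()> {
    let len = u16::try_from(original.len()).expect("original string too long");
    out.write_all(&len.to_be_bytes())?;
    out.write_all(original.as_bytes())?;
    docids.serialize_into(out)?;
    Ok(())
}

fn decode_value(bytes: &[u8]) -> io::Result<(&str, RoaringBitmap)> {
    let len = u16::from_be_bytes([bytes[0], bytes[1]]) as usize;
    let original = std::str::from_utf8(&bytes[2..2 + len])
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
    let bitmap = RoaringBitmap::deserialize_from(&bytes[2 + len..])?;
    Ok((original, bitmap))
}

fn main() -> io::Result<()> {
    let docids = RoaringBitmap::from_iter([7u32, 8]);
    let mut buffer = Vec::new();
    encode_value("Blue", &docids, &mut buffer)?;
    let (original, decoded) = decode_value(&buffer)?;
    assert_eq!(original, "Blue");
    assert_eq!(decoded, docids);
    Ok(())
}
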
| @@ -13,9 +13,9 @@ pub use grenad_helpers::{ | ||||
|     writer_into_reader, GrenadParameters, MergeableReader, | ||||
| }; | ||||
| pub use merge_functions::{ | ||||
|     concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, | ||||
|     merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, | ||||
|     roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, | ||||
|     concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, | ||||
|     merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, | ||||
|     serialize_roaring_bitmap, MergeFn, | ||||
| }; | ||||
|  | ||||
| /// The maximum length a word can be | ||||
|   | ||||
| @@ -13,7 +13,6 @@ use super::helpers::{ | ||||
|     valid_lmdb_key, CursorClonableMmap, | ||||
| }; | ||||
| use super::{ClonableMmap, MergeFn}; | ||||
| use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; | ||||
| use crate::update::index_documents::helpers::as_cloneable_grenad; | ||||
| use crate::{ | ||||
|     lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, | ||||
| @@ -197,13 +196,14 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 |new_values, db_values, buffer| { | ||||
|                     let (_, new_values) = decode_prefix_string(new_values).unwrap(); | ||||
|                     let new_values = RoaringBitmap::deserialize_from(new_values)?; | ||||
|                     let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); | ||||
|                     let db_values = RoaringBitmap::deserialize_from(db_values)?; | ||||
|                     let values = new_values | db_values; | ||||
|                     encode_prefix_string(db_original, buffer)?; | ||||
|                     Ok(values.serialize_into(buffer)?) | ||||
|                     todo!() | ||||
|                     // let (_, new_values) = decode_prefix_string(new_values).unwrap(); | ||||
|                     // let new_values = RoaringBitmap::deserialize_from(new_values)?; | ||||
|                     // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); | ||||
|                     // let db_values = RoaringBitmap::deserialize_from(db_values)?; | ||||
|                     // let values = new_values | db_values; | ||||
|                     // encode_prefix_string(db_original, buffer)?; | ||||
|                     // Ok(values.serialize_into(buffer)?) | ||||
|                 }, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|   | ||||
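
The merge closure replaced by todo!() above combined freshly extracted facet-string values with what is already stored in LMDB. Once the stored value is a bare bitmap, that closure reduces to deserializing both sides, taking the union, and serializing the result into the provided buffer. A possible shape for it, written as a standalone function and assuming plain roaring serialization (the final code will likely go through CboRoaringBitmapCodec instead):

use std::io;
use roaring::RoaringBitmap;

/// Possible replacement for the `todo!()` merge closure: union the freshly
/// extracted bitmap with the one already stored in LMDB and write the result
/// into `buffer`.
fn merge_with_db_values(
    new_values: &[u8],
    db_values: &[u8],
    buffer: &mut Vec<u8>,
) -> io::Result<()> {
    let new_values = RoaringBitmap::deserialize_from(new_values)?;
    let db_values = RoaringBitmap::deserialize_from(db_values)?;
    let merged = new_values | db_values;
    merged.serialize_into(buffer)?;
    Ok(())
}

fn main() -> io::Result<()> {
    let mut a = Vec::new();
    RoaringBitmap::from_iter([1u32, 4]).serialize_into(&mut a)?;
    let mut b = Vec::new();
    RoaringBitmap::from_iter([2u32]).serialize_into(&mut b)?;

    let mut out = Vec::new();
    merge_with_db_values(&a, &b, &mut out)?;
    assert_eq!(RoaringBitmap::deserialize_from(&out[..])?.len(), 3);
    Ok(())
}
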