mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Update extract_facet_string_docids to support deladd obkvs
This commit is contained in:
		
				
					committed by
					
						 Louis Dureuil
						Louis Dureuil
					
				
			
			
				
	
			
			
			
						parent
						
							fcd3a1434d
						
					
				
				
					commit
					e2bc054604
				
			| @@ -1,13 +1,15 @@ | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::BufReader; | ||||||
|  | use std::{io, str}; | ||||||
|  |  | ||||||
| use heed::BytesEncode; | use heed::BytesEncode; | ||||||
|  |  | ||||||
| use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | ||||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; | use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; | ||||||
| use crate::heed_codec::StrRefCodec; | use crate::heed_codec::StrRefCodec; | ||||||
| use crate::update::index_documents::merge_cbo_roaring_bitmaps; | use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; | use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; | ||||||
|  | use crate::{FieldId, Result}; | ||||||
|  |  | ||||||
| /// Extracts the facet string and the document ids where this facet string appears. | /// Extracts the facet string and the document ids where this facet string appears. | ||||||
| /// | /// | ||||||
| @@ -15,7 +17,6 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; | |||||||
| /// documents ids from the given chunk of docid facet string positions. | /// documents ids from the given chunk of docid facet string positions. | ||||||
| #[logging_timer::time] | #[logging_timer::time] | ||||||
| pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||||
|     // TODO Reader<Key, Obkv<DelAdd, OriginalString>> |  | ||||||
|     docid_fid_facet_string: grenad::Reader<R>, |     docid_fid_facet_string: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ) -> Result<grenad::Reader<BufReader<File>>> { | ||||||
| @@ -25,17 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut facet_string_docids_sorter = create_sorter( |     let mut facet_string_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
|         // TODO We must modify the merger to do unions of Del and Add separately |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         merge_cbo_roaring_bitmaps, |  | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|     let mut cursor = docid_fid_facet_string.into_cursor()?; |     let mut cursor = docid_fid_facet_string.into_cursor()?; | ||||||
|     while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { |     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { | ||||||
|         // TODO the value is a Obkv<DelAdd, OriginalString> and must be taken into account |  | ||||||
|         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); |         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); | ||||||
|         let field_id = FieldId::from_be_bytes(field_id_bytes); |         let field_id = FieldId::from_be_bytes(field_id_bytes); | ||||||
|  |  | ||||||
| @@ -43,22 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|             try_split_array_at::<_, 4>(bytes).unwrap(); |             try_split_array_at::<_, 4>(bytes).unwrap(); | ||||||
|         let document_id = u32::from_be_bytes(document_id_bytes); |         let document_id = u32::from_be_bytes(document_id_bytes); | ||||||
|  |  | ||||||
|         let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; |         let normalized_value = str::from_utf8(normalized_value_bytes)?; | ||||||
|  |         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; | ||||||
|         let normalised_truncated_value: String; |  | ||||||
|         if normalised_value.len() > MAX_FACET_VALUE_LENGTH { |  | ||||||
|             normalised_truncated_value = normalised_value |  | ||||||
|                 .char_indices() |  | ||||||
|                 .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) |  | ||||||
|                 .map(|(_, c)| c) |  | ||||||
|                 .collect(); |  | ||||||
|             normalised_value = normalised_truncated_value.as_str(); |  | ||||||
|         } |  | ||||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; |  | ||||||
|         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); |         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); | ||||||
|         // document id is encoded in native-endian because of the CBO roaring bitmap codec |  | ||||||
|         // TODO Reader<KeyBytes, Obkv<DelAdd, RoaringBitmap>> |         buffer.clear(); | ||||||
|         facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; |         let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { | ||||||
|  |             obkv.insert(deladd_key, document_id.to_ne_bytes())?; | ||||||
|  |         } | ||||||
|  |         obkv.finish()?; | ||||||
|  |         facet_string_docids_sorter.insert(&key_bytes, &buffer)?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(facet_string_docids_sorter, indexer) |     sorter_into_reader(facet_string_docids_sorter, indexer) | ||||||
|   | |||||||
| @@ -193,6 +193,7 @@ pub fn obkvs_keep_last_addition_merge_deletions<'a>( | |||||||
|     inner_merge_del_add_obkvs(obkvs, false) |     inner_merge_del_add_obkvs(obkvs, false) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Do a union of all the CboRoaringBitmaps in the values. | ||||||
| pub fn merge_cbo_roaring_bitmaps<'a>( | pub fn merge_cbo_roaring_bitmaps<'a>( | ||||||
|     _key: &[u8], |     _key: &[u8], | ||||||
|     values: &[Cow<'a, [u8]>], |     values: &[Cow<'a, [u8]>], | ||||||
| @@ -206,6 +207,8 @@ pub fn merge_cbo_roaring_bitmaps<'a>( | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv | ||||||
|  | /// separately and output a new DelAdd with both unions. | ||||||
| pub fn merge_deladd_cbo_roaring_bitmaps<'a>( | pub fn merge_deladd_cbo_roaring_bitmaps<'a>( | ||||||
|     _key: &[u8], |     _key: &[u8], | ||||||
|     values: &[Cow<'a, [u8]>], |     values: &[Cow<'a, [u8]>], | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user