mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	Try to make facet indexing incremental
This commit is contained in:
		
				
					committed by
					
						
						Loïc Lecrenier
					
				
			
			
				
	
			
			
			
						parent
						
							d30c89e345
						
					
				
				
					commit
					85824ee203
				
			@@ -32,6 +32,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 | 
			
		||||
        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
 | 
			
		||||
        let field_id = FieldId::from_be_bytes(field_id_bytes);
 | 
			
		||||
 | 
			
		||||
        // document_id_bytes is a big-endian u32
 | 
			
		||||
        // merge_cbo_roaring_bitmap works with native endian u32s
 | 
			
		||||
        // that is a problem, I think
 | 
			
		||||
 | 
			
		||||
        let (document_id_bytes, normalized_value_bytes) =
 | 
			
		||||
            try_split_array_at::<_, 4>(bytes).unwrap();
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -34,7 +34,6 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 | 
			
		||||
pub use self::transform::{Transform, TransformOutput};
 | 
			
		||||
use crate::documents::{obkv_to_object, DocumentsBatchReader};
 | 
			
		||||
use crate::error::UserError;
 | 
			
		||||
use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
 | 
			
		||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
 | 
			
		||||
use crate::update::{
 | 
			
		||||
    self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
 | 
			
		||||
@@ -431,23 +430,6 @@ where
 | 
			
		||||
        // Merged databases have already been indexed, so we start from this count.
 | 
			
		||||
        let mut databases_seen = MERGED_DATABASE_COUNT;
 | 
			
		||||
 | 
			
		||||
        // Run the facets update operation.
 | 
			
		||||
        for facet_db in [
 | 
			
		||||
            (&self.index.facet_id_string_docids).remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
 | 
			
		||||
            (&self.index.facet_id_f64_docids).remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
 | 
			
		||||
        ] {
 | 
			
		||||
            let mut builder = FacetsUpdateBulk::new(self.index, facet_db);
 | 
			
		||||
            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
 | 
			
		||||
            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
 | 
			
		||||
            if let Some(value) = self.config.facet_level_group_size {
 | 
			
		||||
                builder.level_group_size(value);
 | 
			
		||||
            }
 | 
			
		||||
            if let Some(value) = self.config.facet_min_level_size {
 | 
			
		||||
                builder.min_level_size(value);
 | 
			
		||||
            }
 | 
			
		||||
            builder.execute(self.wtxn)?;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        databases_seen += 1;
 | 
			
		||||
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
 | 
			
		||||
            databases_seen,
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,9 @@ use super::helpers::{
 | 
			
		||||
    valid_lmdb_key, CursorClonableMmap,
 | 
			
		||||
};
 | 
			
		||||
use super::{ClonableMmap, MergeFn};
 | 
			
		||||
use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
 | 
			
		||||
use crate::update::index_documents::helpers::as_cloneable_grenad;
 | 
			
		||||
use crate::update::FacetsUpdateIncremental;
 | 
			
		||||
use crate::{
 | 
			
		||||
    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
 | 
			
		||||
    Result,
 | 
			
		||||
@@ -146,6 +148,34 @@ pub(crate) fn write_typed_chunk_into_index(
 | 
			
		||||
            )?;
 | 
			
		||||
            is_merged_database = true;
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
 | 
			
		||||
            // merge cbo roaring bitmaps is not the correct merger because the data in the DB
 | 
			
		||||
            // is FacetGroupValue and not RoaringBitmap
 | 
			
		||||
            // so I need to create my own merging function
 | 
			
		||||
 | 
			
		||||
            // facet_id_string_docids is encoded as:
 | 
			
		||||
            // key: FacetKeyCodec<StrRefCodec>
 | 
			
		||||
            // value: CboRoaringBitmapCodec
 | 
			
		||||
            // basically
 | 
			
		||||
 | 
			
		||||
            // TODO: a condition saying "if I have more than 1/50th of the DB to add,
 | 
			
		||||
            // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
 | 
			
		||||
            // it is a ratio I determine empirically
 | 
			
		||||
 | 
			
		||||
            // for now I only do it incrementally, to see if things work
 | 
			
		||||
            let builder = FacetsUpdateIncremental::new(
 | 
			
		||||
                index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
 | 
			
		||||
            );
 | 
			
		||||
            let mut cursor = facet_id_string_docids.into_cursor()?;
 | 
			
		||||
            while let Some((key, value)) = cursor.move_on_next()? {
 | 
			
		||||
                let key =
 | 
			
		||||
                    FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
 | 
			
		||||
                let value =
 | 
			
		||||
                    CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
 | 
			
		||||
                builder.insert(wtxn, key.field_id, key.left_bound, &value)?;
 | 
			
		||||
            }
 | 
			
		||||
            is_merged_database = true;
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
 | 
			
		||||
            append_entries_into_database(
 | 
			
		||||
                facet_id_exists_docids,
 | 
			
		||||
@@ -188,17 +218,6 @@ pub(crate) fn write_typed_chunk_into_index(
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
 | 
			
		||||
            // facet_id_string_docids contains the thing that the extractor put into it,
 | 
			
		||||
            // so: (FacetKey { field id, level: 0, left_bound } , docids: RoaringBitmap )
 | 
			
		||||
            // now we need to either:
 | 
			
		||||
            // 1. incrementally add the keys/docids pairs into the DB
 | 
			
		||||
            // 2. add the keys/docids into level 0 and then call Facets::execute
 | 
			
		||||
            // the choice of solution should be determined by their performance
 | 
			
		||||
            // characteristics
 | 
			
		||||
 | 
			
		||||
            is_merged_database = true;
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::GeoPoints(geo_points) => {
 | 
			
		||||
            let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
 | 
			
		||||
            let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user