mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	Simplify indexing task for facet_exists_docids database
This commit is contained in:
		@@ -1,40 +0,0 @@
 | 
			
		||||
use std::fs::File;
 | 
			
		||||
use std::io;
 | 
			
		||||
 | 
			
		||||
use heed::{BytesDecode, BytesEncode};
 | 
			
		||||
 | 
			
		||||
use super::helpers::{
 | 
			
		||||
    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
 | 
			
		||||
};
 | 
			
		||||
use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec};
 | 
			
		||||
use crate::Result;
 | 
			
		||||
 | 
			
		||||
/// Extracts the documents ids where this field appears.
 | 
			
		||||
///
 | 
			
		||||
/// Returns a grenad reader whose key is the field id encoded
 | 
			
		||||
/// with `FieldIdCodec` and the value is a document_id (u32)
 | 
			
		||||
/// encoded as native-endian bytes.
 | 
			
		||||
#[logging_timer::time]
 | 
			
		||||
pub fn extract_facet_exists_docids<R: io::Read + io::Seek>(
 | 
			
		||||
    docid_fid_facet_number: grenad::Reader<R>,
 | 
			
		||||
    indexer: GrenadParameters,
 | 
			
		||||
) -> Result<grenad::Reader<File>> {
 | 
			
		||||
    let max_memory = indexer.max_memory_by_thread();
 | 
			
		||||
 | 
			
		||||
    let mut facet_exists_docids_sorter = create_sorter(
 | 
			
		||||
        merge_cbo_roaring_bitmaps,
 | 
			
		||||
        indexer.chunk_compression_type,
 | 
			
		||||
        indexer.chunk_compression_level,
 | 
			
		||||
        indexer.max_nb_chunks,
 | 
			
		||||
        max_memory,
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    let mut cursor = docid_fid_facet_number.into_cursor()?;
 | 
			
		||||
    while let Some((key_bytes, _)) = cursor.move_on_next()? {
 | 
			
		||||
        let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap();
 | 
			
		||||
        let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap();
 | 
			
		||||
        facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    sorter_into_reader(facet_exists_docids_sorter, indexer)
 | 
			
		||||
}
 | 
			
		||||
@@ -1,15 +1,16 @@
 | 
			
		||||
use heed::zerocopy::AsBytes;
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
use std::collections::HashSet;
 | 
			
		||||
use std::convert::TryInto;
 | 
			
		||||
use std::fs::File;
 | 
			
		||||
use std::io;
 | 
			
		||||
use std::mem::size_of;
 | 
			
		||||
 | 
			
		||||
use heed::zerocopy::AsBytes;
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
 | 
			
		||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 | 
			
		||||
use crate::error::InternalError;
 | 
			
		||||
use crate::facet::value_encoding::f64_into_bytes;
 | 
			
		||||
use crate::{DocumentId, FieldId, Result};
 | 
			
		||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
 | 
			
		||||
use crate::{DocumentId, FieldId, Result, BEU32};
 | 
			
		||||
 | 
			
		||||
/// Extracts the facet values of each faceted field of each document.
 | 
			
		||||
///
 | 
			
		||||
@@ -40,7 +41,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    let mut fid_docid_facet_exists_sorter = create_sorter(
 | 
			
		||||
        keep_first,
 | 
			
		||||
        merge_cbo_roaring_bitmaps,
 | 
			
		||||
        indexer.chunk_compression_type,
 | 
			
		||||
        indexer.chunk_compression_level,
 | 
			
		||||
        indexer.max_nb_chunks,
 | 
			
		||||
@@ -56,12 +57,17 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
            if faceted_fields.contains(&field_id) {
 | 
			
		||||
                key_buffer.clear();
 | 
			
		||||
 | 
			
		||||
                // here, we know already that the document must be added to the “field id exists” database
 | 
			
		||||
                // prefix key with the field_id and the document_id
 | 
			
		||||
 | 
			
		||||
                // Set key to the field_id
 | 
			
		||||
                // Note: this encoding is consistent with FieldIdCodec
 | 
			
		||||
                key_buffer.extend_from_slice(&field_id.to_be_bytes());
 | 
			
		||||
 | 
			
		||||
                // Here, we know already that the document must be added to the “field id exists” database
 | 
			
		||||
                let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
 | 
			
		||||
                let document = BEU32::from(document).get();
 | 
			
		||||
                fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?;
 | 
			
		||||
 | 
			
		||||
                // For the other extraction tasks, prefix the key with the field_id and the document_id
 | 
			
		||||
                key_buffer.extend_from_slice(&docid_bytes);
 | 
			
		||||
                fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?;
 | 
			
		||||
 | 
			
		||||
                let value =
 | 
			
		||||
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,4 @@
 | 
			
		||||
mod extract_docid_word_positions;
 | 
			
		||||
mod extract_facet_exists_docids;
 | 
			
		||||
mod extract_facet_number_docids;
 | 
			
		||||
mod extract_facet_string_docids;
 | 
			
		||||
mod extract_fid_docid_facet_values;
 | 
			
		||||
@@ -17,7 +16,6 @@ use log::debug;
 | 
			
		||||
use rayon::prelude::*;
 | 
			
		||||
 | 
			
		||||
use self::extract_docid_word_positions::extract_docid_word_positions;
 | 
			
		||||
use self::extract_facet_exists_docids::extract_facet_exists_docids;
 | 
			
		||||
use self::extract_facet_number_docids::extract_facet_number_docids;
 | 
			
		||||
use self::extract_facet_string_docids::extract_facet_string_docids;
 | 
			
		||||
use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
 | 
			
		||||
@@ -142,15 +140,12 @@ pub(crate) fn data_from_obkv_documents(
 | 
			
		||||
        TypedChunk::FieldIdFacetNumberDocids,
 | 
			
		||||
        "field-id-facet-number-docids",
 | 
			
		||||
    );
 | 
			
		||||
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 | 
			
		||||
        docid_fid_facet_exists_chunks.clone(),
 | 
			
		||||
        indexer.clone(),
 | 
			
		||||
        lmdb_writer_sx.clone(),
 | 
			
		||||
        extract_facet_exists_docids,
 | 
			
		||||
        merge_cbo_roaring_bitmaps,
 | 
			
		||||
        TypedChunk::FieldIdFacetExistsDocids,
 | 
			
		||||
        "field-id-facet-exists-docids",
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    // spawn extraction task for field-id-facet-exists-docids
 | 
			
		||||
    rayon::spawn(move || {
 | 
			
		||||
        let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer);
 | 
			
		||||
        let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids));
 | 
			
		||||
    });
 | 
			
		||||
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
@@ -226,7 +221,7 @@ fn send_and_extract_flattened_documents_data(
 | 
			
		||||
    grenad::Reader<CursorClonableMmap>,
 | 
			
		||||
    (
 | 
			
		||||
        grenad::Reader<CursorClonableMmap>,
 | 
			
		||||
        (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
 | 
			
		||||
        (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
 | 
			
		||||
    ),
 | 
			
		||||
)> {
 | 
			
		||||
    let flattened_documents_chunk =
 | 
			
		||||
@@ -294,9 +289,6 @@ fn send_and_extract_flattened_documents_data(
 | 
			
		||||
                    docid_fid_facet_strings_chunk.clone(),
 | 
			
		||||
                )));
 | 
			
		||||
 | 
			
		||||
                let docid_fid_facet_exists_chunk =
 | 
			
		||||
                    unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? };
 | 
			
		||||
 | 
			
		||||
                Ok((
 | 
			
		||||
                    docid_fid_facet_numbers_chunk,
 | 
			
		||||
                    (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk),
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user