mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	Make the Transform read from an EnrichedDocumentsBatchReader
This commit is contained in:
		@@ -27,7 +27,7 @@ pub use self::helpers::{
 | 
			
		||||
};
 | 
			
		||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 | 
			
		||||
pub use self::transform::{Transform, TransformOutput};
 | 
			
		||||
use self::validate::validate_documents_batch;
 | 
			
		||||
use self::validate::validate_and_enrich_documents_batch;
 | 
			
		||||
pub use self::validate::{
 | 
			
		||||
    extract_float_from_value, validate_document_id, validate_document_id_value,
 | 
			
		||||
    validate_geo_from_json,
 | 
			
		||||
@@ -141,7 +141,7 @@ where
 | 
			
		||||
        // We check for user errors in this validator and if there is one, we can return
 | 
			
		||||
        // the `IndexDocument` struct as it is valid to send more documents into it.
 | 
			
		||||
        // However, if there is an internal error we throw it away!
 | 
			
		||||
        let reader = match validate_documents_batch(
 | 
			
		||||
        let enriched_documents_reader = match validate_and_enrich_documents_batch(
 | 
			
		||||
            self.wtxn,
 | 
			
		||||
            self.index,
 | 
			
		||||
            self.config.autogenerate_docids,
 | 
			
		||||
@@ -155,7 +155,7 @@ where
 | 
			
		||||
            .transform
 | 
			
		||||
            .as_mut()
 | 
			
		||||
            .expect("Invalid document addition state")
 | 
			
		||||
            .read_documents(reader, self.wtxn, &self.progress)?
 | 
			
		||||
            .read_documents(enriched_documents_reader, self.wtxn, &self.progress)?
 | 
			
		||||
            as u64;
 | 
			
		||||
 | 
			
		||||
        self.added_documents += indexed_documents;
 | 
			
		||||
 
 | 
			
		||||
@@ -14,7 +14,7 @@ use smartstring::SmartString;
 | 
			
		||||
 | 
			
		||||
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
 | 
			
		||||
use super::{IndexDocumentsMethod, IndexerConfig};
 | 
			
		||||
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 | 
			
		||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 | 
			
		||||
use crate::error::{Error, InternalError, UserError};
 | 
			
		||||
use crate::index::db_name;
 | 
			
		||||
use crate::update::index_documents::validate_document_id_value;
 | 
			
		||||
@@ -153,7 +153,7 @@ impl<'a, 'i> Transform<'a, 'i> {
 | 
			
		||||
 | 
			
		||||
    pub fn read_documents<R, F>(
 | 
			
		||||
        &mut self,
 | 
			
		||||
        reader: DocumentsBatchReader<R>,
 | 
			
		||||
        reader: EnrichedDocumentsBatchReader<R>,
 | 
			
		||||
        wtxn: &mut heed::RwTxn,
 | 
			
		||||
        progress_callback: F,
 | 
			
		||||
    ) -> Result<usize>
 | 
			
		||||
@@ -189,7 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> {
 | 
			
		||||
        let mut external_id_buffer = Vec::new();
 | 
			
		||||
        let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
 | 
			
		||||
        let addition_index = cursor.documents_batch_index().clone();
 | 
			
		||||
        while let Some(document) = cursor.next_document()? {
 | 
			
		||||
        while let Some(enriched_document) = cursor.next_enriched_document()? {
 | 
			
		||||
            let EnrichedDocument { document, external_id } = enriched_document;
 | 
			
		||||
 | 
			
		||||
            let mut field_buffer_cache = drop_and_reuse(field_buffer);
 | 
			
		||||
            if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
 | 
			
		||||
                progress_callback(UpdateIndexingStep::RemapDocumentAddition {
 | 
			
		||||
 
 | 
			
		||||
@@ -4,27 +4,28 @@ use std::result::Result as StdResult;
 | 
			
		||||
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
 | 
			
		||||
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 | 
			
		||||
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
 | 
			
		||||
use crate::error::{GeoError, InternalError, UserError};
 | 
			
		||||
use crate::update::index_documents::obkv_to_object;
 | 
			
		||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
 | 
			
		||||
use crate::{FieldId, Index, Object, Result};
 | 
			
		||||
 | 
			
		||||
/// The symbol used to define levels in a nested primary key.
 | 
			
		||||
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 | 
			
		||||
 | 
			
		||||
/// This function validates a documents by checking that:
 | 
			
		||||
/// This function validates and enriches the documents by checking that:
 | 
			
		||||
///  - we can infer a primary key,
 | 
			
		||||
///  - all the documents id exist and,
 | 
			
		||||
///  - all the document ids exist and are extracted,
 | 
			
		||||
///  - the validity of them but also,
 | 
			
		||||
///  - the validity of the `_geo` field depending on the settings.
 | 
			
		||||
pub fn validate_documents_batch<R: Read + Seek>(
 | 
			
		||||
pub fn validate_and_enrich_documents_batch<R: Read + Seek>(
 | 
			
		||||
    rtxn: &heed::RoTxn,
 | 
			
		||||
    index: &Index,
 | 
			
		||||
    autogenerate_docids: bool,
 | 
			
		||||
    reader: DocumentsBatchReader<R>,
 | 
			
		||||
) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> {
 | 
			
		||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
 | 
			
		||||
    let mut cursor = reader.into_cursor();
 | 
			
		||||
    let mut documents_batch_index = cursor.documents_batch_index().clone();
 | 
			
		||||
    let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
 | 
			
		||||
 | 
			
		||||
    // The primary key *field id* that has already been set for this index or the one
 | 
			
		||||
    // we will guess by searching for the first key that contains "id" as a substring.
 | 
			
		||||
@@ -82,6 +83,8 @@ pub fn validate_documents_batch<R: Read + Seek>(
 | 
			
		||||
            Err(user_error) => return Ok(Err(user_error)),
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        external_ids.insert(count.to_be_bytes(), &document_id)?;
 | 
			
		||||
 | 
			
		||||
        if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
 | 
			
		||||
            if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
 | 
			
		||||
                return Ok(Err(UserError::from(user_error)));
 | 
			
		||||
@@ -90,7 +93,10 @@ pub fn validate_documents_batch<R: Read + Seek>(
 | 
			
		||||
        count += 1;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Ok(Ok(cursor.into_reader()))
 | 
			
		||||
    let external_ids = writer_into_reader(external_ids)?;
 | 
			
		||||
    let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?;
 | 
			
		||||
 | 
			
		||||
    Ok(Ok(reader))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Retrieve the document id after validating it, returning a `UserError`
 | 
			
		||||
@@ -100,7 +106,7 @@ fn fetch_document_id(
 | 
			
		||||
    documents_batch_index: &DocumentsBatchIndex,
 | 
			
		||||
    primary_key: PrimaryKey,
 | 
			
		||||
    autogenerate_docids: bool,
 | 
			
		||||
    count: usize,
 | 
			
		||||
    count: u32,
 | 
			
		||||
) -> Result<StdResult<String, UserError>> {
 | 
			
		||||
    match primary_key {
 | 
			
		||||
        PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user