Expose the DocumentId enum to make sure the generated ids are injected into the documents

This commit is contained in:
Kerollmops
2022-06-21 14:41:19 +02:00
parent d1a4da9812
commit 0bbcc7b180
7 changed files with 41 additions and 25 deletions

View File

@ -2,6 +2,7 @@ use std::io::{Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
@ -89,14 +90,15 @@ pub fn enrich_documents_batch<R: Read + Seek>(
Err(user_error) => return Ok(Err(user_error)),
};
external_ids.insert(count.to_be_bytes(), document_id.value())?;
if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? {
return Ok(Err(UserError::from(user_error)));
}
}
let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?;
external_ids.insert(count.to_be_bytes(), document_id)?;
count += 1;
}
@ -210,7 +212,7 @@ impl PrimaryKey<'_> {
///
/// In case the document id has been auto-generated, the document nth is kept to help
/// users debug if there is an issue with the document itself.
#[derive(Clone)]
#[derive(Serialize, Deserialize, Clone)]
pub enum DocumentId {
Retrieved { value: String },
Generated { value: String, document_nth: u32 },
@ -225,16 +227,20 @@ impl DocumentId {
DocumentId::Generated { value, document_nth }
}
fn value(&self) -> &str {
fn debug(&self) -> String {
format!("{:?}", self)
}
pub fn is_generated(&self) -> bool {
matches!(self, DocumentId::Generated { .. })
}
pub fn value(&self) -> &str {
match self {
DocumentId::Retrieved { value } => value,
DocumentId::Generated { value, .. } => value,
}
}
fn debug(&self) -> String {
format!("{:?}", self)
}
}
impl fmt::Debug for DocumentId {

View File

@ -22,7 +22,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
use self::enrich::enrich_documents_batch;
pub use self::enrich::{
extract_float_from_value, validate_document_id, validate_document_id_value,
validate_geo_from_json,
validate_geo_from_json, DocumentId,
};
pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,

View File

@ -153,8 +153,9 @@ impl<'a, 'i> Transform<'a, 'i> {
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
let primary_key = cursor.primary_key().to_string();
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
let primary_key_id_nested = primary_key.contains('.');
let primary_key_id =
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
let mut flattened_document = None;
let mut obkv_buffer = Vec::new();
@ -162,7 +163,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
while let Some(enriched_document) = cursor.next_enriched_document()? {
let EnrichedDocument { document, external_id } = enriched_document;
let EnrichedDocument { document, document_id } = enriched_document;
let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
@ -171,6 +172,14 @@ impl<'a, 'i> Transform<'a, 'i> {
});
}
// When the document id has been auto-generated by `enrich_documents_batch`,
// we must insert this document id into the remapped document.
let external_id = document_id.value();
if document_id.is_generated() {
let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?;
field_buffer_cache.push((primary_key_id, Cow::from(docid)));
}
for (k, v) in document.iter() {
let mapped_id =
*mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;

View File

@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult};
pub use self::facets::Facets;
pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
};
pub use self::indexer_config::IndexerConfig;
pub use self::settings::{Setting, Settings};