mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Enrich uses the new type
This commit is contained in:
		| @@ -1,20 +1,17 @@ | |||||||
|  | use std::fmt; | ||||||
| use std::io::{BufWriter, Read, Seek}; | use std::io::{BufWriter, Read, Seek}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
| use std::{fmt, iter}; |  | ||||||
|  |  | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; | use crate::documents::{ | ||||||
|  |     DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, | ||||||
|  |     EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, | ||||||
|  | }; | ||||||
| use crate::error::{GeoError, InternalError, UserError}; | use crate::error::{GeoError, InternalError, UserError}; | ||||||
| use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | ||||||
| use crate::{FieldId, Index, Object, Result}; | use crate::{FieldId, Index, Result}; | ||||||
|  |  | ||||||
| /// The symbol used to define levels in a nested primary key. |  | ||||||
| const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; |  | ||||||
|  |  | ||||||
| /// The default primary key that is used when not specified. |  | ||||||
| const DEFAULT_PRIMARY_KEY: &str = "id"; |  | ||||||
|  |  | ||||||
| /// This function validates and enriches the documents by checking that: | /// This function validates and enriches the documents by checking that: | ||||||
| ///  - we can infer a primary key, | ///  - we can infer a primary key, | ||||||
| @@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>( | |||||||
|     // The primary key *field id* that has already been set for this index or the one |     // The primary key *field id* that has already been set for this index or the one | ||||||
|     // we will guess by searching for the first key that contains "id" as a substring. |     // we will guess by searching for the first key that contains "id" as a substring. | ||||||
|     let primary_key = match index.primary_key(rtxn)? { |     let primary_key = match index.primary_key(rtxn)? { | ||||||
|         Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { |         Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { | ||||||
|             PrimaryKey::nested(primary_key) |             Some(primary_key) => primary_key, | ||||||
|         } |             None if autogenerate_docids => PrimaryKey::Flat { | ||||||
|         Some(primary_key) => match documents_batch_index.id(primary_key) { |                 name: primary_key, | ||||||
|             Some(id) => PrimaryKey::flat(primary_key, id), |                 field_id: documents_batch_index.insert(primary_key), | ||||||
|             None if autogenerate_docids => { |             }, | ||||||
|                 PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) |  | ||||||
|             } |  | ||||||
|             None => { |             None => { | ||||||
|                 return match cursor.next_document()? { |                 return match cursor.next_document()? { | ||||||
|                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { |                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { | ||||||
| @@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>( | |||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             match guesses.as_slice() { |             match guesses.as_slice() { | ||||||
|                 [] if autogenerate_docids => PrimaryKey::flat( |                 [] if autogenerate_docids => PrimaryKey::Flat { | ||||||
|                     DEFAULT_PRIMARY_KEY, |                     name: DEFAULT_PRIMARY_KEY, | ||||||
|                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY), |                     field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), | ||||||
|                 ), |                 }, | ||||||
|                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), |                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), | ||||||
|                 [(field_id, name)] => { |                 [(field_id, name)] => { | ||||||
|                     log::info!("Primary key was not specified in index. Inferred to '{name}'"); |                     log::info!("Primary key was not specified in index. Inferred to '{name}'"); | ||||||
|                     PrimaryKey::flat(name, *field_id) |                     PrimaryKey::Flat { name, field_id: *field_id } | ||||||
|                 } |                 } | ||||||
|                 multiple => { |                 multiple => { | ||||||
|                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { |                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { | ||||||
| @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( | |||||||
|     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], |     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], | ||||||
|     count: u32, |     count: u32, | ||||||
| ) -> Result<StdResult<DocumentId, UserError>> { | ) -> Result<StdResult<DocumentId, UserError>> { | ||||||
|     match primary_key { |     Ok(match primary_key.document_id(document, documents_batch_index)? { | ||||||
|         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { |         Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), | ||||||
|             match document.get(primary_key_id) { |         Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), | ||||||
|                 Some(document_id_bytes) => { |         Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { | ||||||
|                     let document_id = serde_json::from_slice(document_id_bytes) |             let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); | ||||||
|                         .map_err(InternalError::SerdeJson)?; |             Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) | ||||||
|                     match validate_document_id_value(document_id)? { |  | ||||||
|                         Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), |  | ||||||
|                         Err(user_error) => Ok(Err(user_error)), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 None if autogenerate_docids => { |  | ||||||
|                     let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); |  | ||||||
|                     Ok(Ok(DocumentId::generated(uuid.to_string(), count))) |  | ||||||
|                 } |  | ||||||
|                 None => Ok(Err(UserError::MissingDocumentId { |  | ||||||
|                     primary_key: primary_key.to_string(), |  | ||||||
|                     document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                 })), |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|         nested @ PrimaryKey::Nested { .. } => { |         Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { | ||||||
|             let mut matching_documents_ids = Vec::new(); |             primary_key: primary_key.name().to_string(), | ||||||
|             for (first_level_name, right) in nested.possible_level_names() { |             document: obkv_to_object(document, documents_batch_index)?, | ||||||
|                 if let Some(field_id) = documents_batch_index.id(first_level_name) { |         }), | ||||||
|                     if let Some(value_bytes) = document.get(field_id) { |         Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { | ||||||
|                         let object = serde_json::from_slice(value_bytes) |             Err(UserError::TooManyDocumentIds { | ||||||
|                             .map_err(InternalError::SerdeJson)?; |                 primary_key: primary_key.name().to_string(), | ||||||
|                         fetch_matching_values(object, right, &mut matching_documents_ids); |                 document: obkv_to_object(document, documents_batch_index)?, | ||||||
|  |             }) | ||||||
|                         if matching_documents_ids.len() >= 2 { |  | ||||||
|                             return Ok(Err(UserError::TooManyDocumentIds { |  | ||||||
|                                 primary_key: nested.name().to_string(), |  | ||||||
|                                 document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                             })); |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             match matching_documents_ids.pop() { |  | ||||||
|                 Some(document_id) => match validate_document_id_value(document_id)? { |  | ||||||
|                     Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), |  | ||||||
|                     Err(user_error) => Ok(Err(user_error)), |  | ||||||
|                 }, |  | ||||||
|                 None => Ok(Err(UserError::MissingDocumentId { |  | ||||||
|                     primary_key: nested.name().to_string(), |  | ||||||
|                     document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                 })), |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     }) | ||||||
| } |  | ||||||
|  |  | ||||||
| /// A type that represents the type of primary key that has been set |  | ||||||
| /// for this index, a classic flat one or a nested one. |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| enum PrimaryKey<'a> { |  | ||||||
|     Flat { name: &'a str, field_id: FieldId }, |  | ||||||
|     Nested { name: &'a str }, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl PrimaryKey<'_> { |  | ||||||
|     fn flat(name: &str, field_id: FieldId) -> PrimaryKey { |  | ||||||
|         PrimaryKey::Flat { name, field_id } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn nested(name: &str) -> PrimaryKey { |  | ||||||
|         PrimaryKey::Nested { name } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         match self { |  | ||||||
|             PrimaryKey::Flat { name, .. } => name, |  | ||||||
|             PrimaryKey::Nested { name } => name, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Returns an `Iterator` that gives all the possible field names the primary key |  | ||||||
|     /// can have depending on the first level name and deepness of the objects. |  | ||||||
|     fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { |  | ||||||
|         let name = self.name(); |  | ||||||
|         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) |  | ||||||
|             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) |  | ||||||
|             .chain(iter::once((name, ""))) |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// A type that represents a document id that has been retrieved from a document or auto-generated. | /// A type that represents a document id that has been retrieved from a document or auto-generated. | ||||||
| @@ -255,14 +182,6 @@ pub enum DocumentId { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl DocumentId { | impl DocumentId { | ||||||
|     fn retrieved(value: String) -> DocumentId { |  | ||||||
|         DocumentId::Retrieved { value } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn generated(value: String, document_nth: u32) -> DocumentId { |  | ||||||
|         DocumentId::Generated { value, document_nth } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn debug(&self) -> String { |     fn debug(&self) -> String { | ||||||
|         format!("{:?}", self) |         format!("{:?}", self) | ||||||
|     } |     } | ||||||
| @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn starts_with(selector: &str, key: &str) -> bool { |  | ||||||
|     selector.strip_prefix(key).map_or(false, |tail| { |  | ||||||
|         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) |  | ||||||
|     }) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { |  | ||||||
|     match value { |  | ||||||
|         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), |  | ||||||
|         otherwise => output.push(otherwise), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn fetch_matching_values_in_object( |  | ||||||
|     object: Object, |  | ||||||
|     selector: &str, |  | ||||||
|     base_key: &str, |  | ||||||
|     output: &mut Vec<Value>, |  | ||||||
| ) { |  | ||||||
|     for (key, value) in object { |  | ||||||
|         let base_key = if base_key.is_empty() { |  | ||||||
|             key.to_string() |  | ||||||
|         } else { |  | ||||||
|             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         if starts_with(selector, &base_key) { |  | ||||||
|             match value { |  | ||||||
|                 Value::Object(object) => { |  | ||||||
|                     fetch_matching_values_in_object(object, selector, &base_key, output) |  | ||||||
|                 } |  | ||||||
|                 value => output.push(value), |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn validate_document_id(document_id: &str) -> Option<&str> { |  | ||||||
|     if !document_id.is_empty() |  | ||||||
|         && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) |  | ||||||
|     { |  | ||||||
|         Some(document_id) |  | ||||||
|     } else { |  | ||||||
|         None |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Parses a JSON-encoded document id and validates it, returning a user error when it is one. |  | ||||||
| pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { |  | ||||||
|     match document_id { |  | ||||||
|         Value::String(string) => match validate_document_id(&string) { |  | ||||||
|             Some(s) if s.len() == string.len() => Ok(Ok(string)), |  | ||||||
|             Some(s) => Ok(Ok(s.to_string())), |  | ||||||
|             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), |  | ||||||
|         }, |  | ||||||
|         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), |  | ||||||
|         content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Try to extract an `f64` from a JSON `Value` and return the `Value` | /// Try to extract an `f64` from a JSON `Value` and return the `Value` | ||||||
| /// in the `Err` variant if it failed. | /// in the `Err` variant if it failed. | ||||||
| pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | ||||||
|   | |||||||
| @@ -20,10 +20,7 @@ use slice_group_by::GroupBy; | |||||||
| use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | ||||||
|  |  | ||||||
| use self::enrich::enrich_documents_batch; | use self::enrich::enrich_documents_batch; | ||||||
| pub use self::enrich::{ | pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; | ||||||
|     extract_finite_float_from_value, validate_document_id, validate_document_id_value, |  | ||||||
|     validate_geo_from_json, DocumentId, |  | ||||||
| }; |  | ||||||
| pub use self::helpers::{ | pub use self::helpers::{ | ||||||
|     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, |     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, | ||||||
|     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, |     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user