mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	nested fields
This commit is contained in:
		| @@ -70,7 +70,8 @@ fn indexing_songs_default(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -120,7 +121,8 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | ||||
|                 let config = IndexerConfig::default(); | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
|                 builder.execute().unwrap(); | ||||
| @@ -134,14 +136,16 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
|                 builder.execute().unwrap(); | ||||
|  | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
|                 builder.execute().unwrap(); | ||||
| @@ -190,7 +194,8 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||
|  | ||||
| @@ -236,7 +241,8 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -281,7 +287,8 @@ fn indexing_wiki(c: &mut Criterion) { | ||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -323,7 +330,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | ||||
|                 let indexing_config = | ||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|                 let documents = | ||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -339,7 +347,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | ||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = | ||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); | ||||
| @@ -349,7 +358,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | ||||
|                 let indexing_config = | ||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = | ||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); | ||||
| @@ -400,7 +410,8 @@ fn indexing_movies_default(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -447,7 +458,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | ||||
|                 let config = IndexerConfig::default(); | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -462,7 +474,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -470,7 +483,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | ||||
|  | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
| @@ -525,7 +539,8 @@ fn indexing_geo(c: &mut Criterion) { | ||||
|                 let indexing_config = IndexDocumentsConfig::default(); | ||||
|                 let mut wtxn = index.write_txn().unwrap(); | ||||
|                 let mut builder = | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||
|                         .unwrap(); | ||||
|  | ||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||
|                 builder.add_documents(documents).unwrap(); | ||||
|   | ||||
| @@ -96,7 +96,8 @@ pub fn base_setup(conf: &Conf) -> Index { | ||||
|         update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
|         ..Default::default() | ||||
|     }; | ||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|     let mut builder = | ||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|     let documents = documents_from(conf.dataset, conf.dataset_format); | ||||
|  | ||||
|     builder.add_documents(documents).unwrap(); | ||||
|   | ||||
| @@ -261,7 +261,8 @@ impl Performer for DocumentAddition { | ||||
|             &config, | ||||
|             indexing_config, | ||||
|             |step| indexing_callback(step, &bars), | ||||
|         ); | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         addition.add_documents(reader)?; | ||||
|  | ||||
|         std::thread::spawn(move || { | ||||
|   | ||||
| @@ -410,7 +410,7 @@ async fn main() -> anyhow::Result<()> { | ||||
|                         GLOBAL_CONFIG.get().unwrap(), | ||||
|                         indexing_config, | ||||
|                         indexing_callback, | ||||
|                     ); | ||||
|                     )?; | ||||
|  | ||||
|                     let reader = match encoding.as_deref() { | ||||
|                         Some("gzip") => Box::new(GzDecoder::new(content)), | ||||
|   | ||||
| @@ -14,6 +14,7 @@ crossbeam-channel = "0.5.2" | ||||
| either = "1.6.1" | ||||
| fst = "0.4.7" | ||||
| fxhash = "0.2.1" | ||||
| flatten-serde-json = "0.1.0" | ||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | ||||
| geoutils = "0.4.1" | ||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||
|   | ||||
| @@ -49,6 +49,24 @@ impl DocumentsBatchIndex { | ||||
|     pub fn name(&self, id: FieldId) -> Option<&String> { | ||||
|         self.0.get_by_left(&id) | ||||
|     } | ||||
|  | ||||
|     pub fn recreate_json( | ||||
|         &self, | ||||
|         document: &obkv::KvReaderU16, | ||||
|     ) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> { | ||||
|         let mut map = serde_json::Map::new(); | ||||
|  | ||||
|         for (k, v) in document.iter() { | ||||
|             // TODO: TAMO: update the error type | ||||
|             let key = | ||||
|                 self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone(); | ||||
|             let value = serde_json::from_slice::<serde_json::Value>(v) | ||||
|                 .map_err(crate::error::InternalError::SerdeJson)?; | ||||
|             map.insert(key, value); | ||||
|         } | ||||
|  | ||||
|         Ok(map) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize, Deserialize)] | ||||
|   | ||||
| @@ -27,6 +27,7 @@ pub enum InternalError { | ||||
|     DatabaseClosing, | ||||
|     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, | ||||
|     FieldIdMapMissingEntry(FieldIdMapMissingEntry), | ||||
|     FieldIdMappingMissingEntry { key: FieldId }, | ||||
|     Fst(fst::Error), | ||||
|     GrenadInvalidCompressionType, | ||||
|     GrenadInvalidFormatVersion, | ||||
| @@ -59,7 +60,7 @@ pub enum UserError { | ||||
|     DocumentLimitReached, | ||||
|     InvalidDocumentId { document_id: Value }, | ||||
|     InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, | ||||
|     InvalidGeoField { document_id: Value, object: Value }, | ||||
|     InvalidGeoField { document_id: Value }, | ||||
|     InvalidFilter(String), | ||||
|     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, | ||||
|     SortRankingRuleMissing, | ||||
| @@ -187,6 +188,9 @@ impl fmt::Display for InternalError { | ||||
|                 write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) | ||||
|             } | ||||
|             Self::FieldIdMapMissingEntry(error) => error.fmt(f), | ||||
|             Self::FieldIdMappingMissingEntry { key } => { | ||||
|                 write!(f, "Missing {} in the field id mapping.", key) | ||||
|             } | ||||
|             Self::Fst(error) => error.fmt(f), | ||||
|             Self::GrenadInvalidCompressionType => { | ||||
|                 f.write_str("Invalid compression type have been specified to grenad.") | ||||
| @@ -226,19 +230,15 @@ impl fmt::Display for UserError { | ||||
|                     name_list | ||||
|                 ) | ||||
|             } | ||||
|             Self::InvalidGeoField { document_id, object } => { | ||||
|             Self::InvalidGeoField { document_id } => { | ||||
|                 let document_id = match document_id { | ||||
|                     Value::String(id) => id.clone(), | ||||
|                     _ => document_id.to_string(), | ||||
|                 }; | ||||
|                 let object = match object { | ||||
|                     Value::String(id) => id.clone(), | ||||
|                     _ => object.to_string(), | ||||
|                 }; | ||||
|                 write!( | ||||
|                     f, | ||||
|                     "The document with the id: `{}` contains an invalid _geo field: `{}`.", | ||||
|                     document_id, object | ||||
|                     "The document with the id: `{}` contains an invalid `_geo` field.", | ||||
|                     document_id | ||||
|                 ) | ||||
|             }, | ||||
|             Self::InvalidDocumentId { document_id } => { | ||||
|   | ||||
| @@ -31,6 +31,7 @@ pub mod main_key { | ||||
|     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||
|     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; | ||||
|     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||
|     pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; | ||||
|     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; | ||||
|     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; | ||||
|     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; | ||||
| @@ -567,12 +568,46 @@ impl Index { | ||||
|         Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) | ||||
|     } | ||||
|  | ||||
|     /* faceted documents ids */ | ||||
|     /* faceted fields */ | ||||
|  | ||||
|     /// Writes the faceted fields in the database. | ||||
|     pub(crate) fn put_faceted_fields( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         fields: &HashSet<String>, | ||||
|     ) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields) | ||||
|     } | ||||
|  | ||||
|     /// Returns the faceted fields names. | ||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { | ||||
|         Ok(self | ||||
|             .main | ||||
|             .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? | ||||
|             .unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     /// Identical to `faceted_fields`, but returns ids instead. | ||||
|     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||
|         let fields = self.faceted_fields(rtxn)?; | ||||
|         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||
|  | ||||
|         let mut fields_ids = HashSet::new(); | ||||
|         for name in fields { | ||||
|             if let Some(field_id) = fields_ids_map.id(&name) { | ||||
|                 fields_ids.insert(field_id); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(fields_ids) | ||||
|     } | ||||
|  | ||||
|     /* faceted documents ids */ | ||||
|  | ||||
|     /// Returns the user defined faceted fields names. | ||||
|     /// | ||||
|     /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. | ||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { | ||||
|     /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. | ||||
|     pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { | ||||
|         let filterable_fields = self.filterable_fields(rtxn)?; | ||||
|         let sortable_fields = self.sortable_fields(rtxn)?; | ||||
|         let distinct_field = self.distinct_field(rtxn)?; | ||||
| @@ -592,8 +627,8 @@ impl Index { | ||||
|         Ok(faceted_fields) | ||||
|     } | ||||
|  | ||||
|     /// Identical to `faceted_fields`, but returns ids instead. | ||||
|     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||
|     /// Identical to `user_defined_faceted_fields`, but returns ids instead. | ||||
|     pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||
|         let fields = self.faceted_fields(rtxn)?; | ||||
|         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||
|  | ||||
| @@ -1040,13 +1075,14 @@ pub(crate) mod tests { | ||||
|         let content = documents!([ | ||||
|             { "id": 1, "name": "kevin" }, | ||||
|             { "id": 2, "name": "bob", "age": 20 }, | ||||
|             { "id": 2, "name": "bob", "age": 20 } | ||||
|             { "id": 2, "name": "bob", "age": 20 }, | ||||
|         ]); | ||||
|  | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1067,11 +1103,12 @@ pub(crate) mod tests { | ||||
|         // field_distribution in the end | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         let content = documents!([ | ||||
|             { "id": 1, "name": "kevin" }, | ||||
|             { "id": 2, "name": "bob", "age": 20 }, | ||||
|             { "id": 2, "name": "bob", "age": 20 } | ||||
|             { "id": 2, "name": "bob", "age": 20 }, | ||||
|         ]); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
| @@ -1097,7 +1134,8 @@ pub(crate) mod tests { | ||||
|  | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|   | ||||
| @@ -183,6 +183,43 @@ pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { | ||||
|     [x, y, z] | ||||
| } | ||||
|  | ||||
| /// Returns `true` if the field match one of the faceted fields. | ||||
| /// See the function [`is_faceted_by`] below to see what “matching” means. | ||||
| pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool { | ||||
|     faceted_fields.into_iter().find(|facet| is_faceted_by(field, facet.as_ref())).is_some() | ||||
| } | ||||
|  | ||||
/// Returns `true` if the field matches the facet.
///
/// A field matches a facet when it is equal to the facet, or when it starts with the
/// facet immediately followed by a `.`, i.e. the field is nested under the facet.
/// ```
/// use milli::is_faceted_by;
/// // -- the valid basics
/// assert!(is_faceted_by("animaux", "animaux"));
/// assert!(is_faceted_by("animaux.chien", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
///
/// // -- the wrongs
/// assert!(!is_faceted_by("chien", "chat"));
/// assert!(!is_faceted_by("animaux", "animaux.chien"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
///
/// // -- the strange edge cases
/// assert!(!is_faceted_by("animaux.chien", "anima"));
/// assert!(!is_faceted_by("animaux.chien", "animau"));
/// assert!(!is_faceted_by("animaux.chien", "animaux."));
/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    // Whatever follows the facet must be nothing at all (exact match) or begin with
    // the `.` separator; a longer sibling name such as `animauxx` must not match.
    field
        .strip_prefix(facet)
        .map_or(false, |rest| rest.is_empty() || rest.starts_with('.'))
}
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use serde_json::json; | ||||
|   | ||||
| @@ -97,7 +97,8 @@ mod test { | ||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
|             ..Default::default() | ||||
|         }; | ||||
|         let mut addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut addition = | ||||
|             IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|  | ||||
|         let reader = | ||||
|             crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); | ||||
|   | ||||
| @@ -220,9 +220,13 @@ impl<'a> FacetDistribution<'a> { | ||||
|     pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> { | ||||
|         let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; | ||||
|         let filterable_fields = self.index.filterable_fields(self.rtxn)?; | ||||
|  | ||||
|         let fields = match self.facets { | ||||
|             Some(ref facets) => { | ||||
|                 let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect(); | ||||
|                 let invalid_fields: HashSet<_> = facets | ||||
|                     .iter() | ||||
|                     .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) | ||||
|                     .collect(); | ||||
|                 if !invalid_fields.is_empty() { | ||||
|                     return Err(UserError::InvalidFacetsDistribution { | ||||
|                         invalid_facets_name: invalid_fields.into_iter().cloned().collect(), | ||||
| @@ -236,10 +240,12 @@ impl<'a> FacetDistribution<'a> { | ||||
|         }; | ||||
|  | ||||
|         let mut distribution = BTreeMap::new(); | ||||
|         for name in fields { | ||||
|             if let Some(fid) = fields_ids_map.id(&name) { | ||||
|         for (fid, name) in fields_ids_map.iter() { | ||||
|             if crate::is_faceted(name, &fields) { | ||||
|                 let values = self.facet_values(fid)?; | ||||
|                 distribution.insert(name, values); | ||||
|                 if !values.is_empty() { | ||||
|                     distribution.insert(name.to_string(), values); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -353,7 +353,8 @@ impl<'a> Filter<'a> { | ||||
|         match &self.condition { | ||||
|             FilterCondition::Condition { fid, op } => { | ||||
|                 let filterable_fields = index.filterable_fields(rtxn)?; | ||||
|                 if filterable_fields.contains(fid.value()) { | ||||
|  | ||||
|                 if crate::is_faceted(fid.value(), &filterable_fields) { | ||||
|                     let field_ids_map = index.fields_ids_map(rtxn)?; | ||||
|                     if let Some(fid) = field_ids_map.id(fid.value()) { | ||||
|                         Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) | ||||
| @@ -549,7 +550,6 @@ mod tests { | ||||
|             Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") | ||||
|                 .unwrap() | ||||
|                 .unwrap(); | ||||
|         println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition); | ||||
|         assert_eq!(condition, expected); | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -159,7 +159,7 @@ impl<'a> Search<'a> { | ||||
|             let sortable_fields = self.index.sortable_fields(self.rtxn)?; | ||||
|             for asc_desc in sort_criteria { | ||||
|                 match asc_desc.member() { | ||||
|                     Member::Field(ref field) if !sortable_fields.contains(field) => { | ||||
|                     Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { | ||||
|                         return Err(UserError::InvalidSortableAttribute { | ||||
|                             field: field.to_string(), | ||||
|                             valid_fields: sortable_fields.into_iter().collect(), | ||||
|   | ||||
| @@ -98,7 +98,8 @@ mod tests { | ||||
|         ]); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -110,7 +111,8 @@ mod tests { | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5); | ||||
|         // the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]` | ||||
|         assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7); | ||||
|  | ||||
|         assert!(index.words_fst(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); | ||||
|   | ||||
| @@ -647,7 +647,8 @@ mod tests { | ||||
|         ]); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -681,7 +682,8 @@ mod tests { | ||||
|  | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -733,7 +735,8 @@ mod tests { | ||||
|  | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -790,7 +793,8 @@ mod tests { | ||||
|  | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|  | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -2,7 +2,6 @@ use std::fs::File; | ||||
| use std::io; | ||||
|  | ||||
| use concat_arrays::concat_arrays; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||
| use crate::{FieldId, InternalError, Result, UserError}; | ||||
| @@ -14,7 +13,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | ||||
|     obkv_documents: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     primary_key_id: FieldId, | ||||
|     geo_field_id: FieldId, | ||||
|     (lat_fid, lng_fid): (FieldId, FieldId), | ||||
| ) -> Result<grenad::Reader<File>> { | ||||
|     let mut writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
| @@ -25,22 +24,18 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||
|         let obkv = obkv::KvReader::new(value); | ||||
|         let point: Value = match obkv.get(geo_field_id) { | ||||
|             Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?, | ||||
|             None => continue, | ||||
|         }; | ||||
|         let (lat, lng) = obkv.get(lat_fid).zip(obkv.get(lng_fid)).ok_or_else(|| { | ||||
|             let primary_key = obkv.get(primary_key_id).unwrap(); | ||||
|             let primary_key = serde_json::from_slice(primary_key).unwrap(); | ||||
|             UserError::InvalidGeoField { document_id: primary_key } | ||||
|         })?; | ||||
|         let (lat, lng): (f64, f64) = ( | ||||
|             serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||
|             serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||
|         ); | ||||
|  | ||||
|         if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { | ||||
|             // this will create an array of 16 bytes (two 8 bytes floats) | ||||
|         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||
|         writer.insert(docid_bytes, bytes)?; | ||||
|         } else { | ||||
|             // All document must have a primary key so we can unwrap safely here | ||||
|             let primary_key = obkv.get(primary_key_id).unwrap(); | ||||
|             let primary_key = | ||||
|                 serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; | ||||
|             Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(writer_into_reader(writer)?) | ||||
|   | ||||
| @@ -34,28 +34,36 @@ use crate::{FieldId, Result}; | ||||
| /// Extract data for each databases from obkv documents in parallel. | ||||
| /// Send data in grenad file over provided Sender. | ||||
| pub(crate) fn data_from_obkv_documents( | ||||
|     obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||
|     original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||
|     flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     searchable_fields: Option<HashSet<FieldId>>, | ||||
|     faceted_fields: HashSet<FieldId>, | ||||
|     primary_key_id: FieldId, | ||||
|     geo_field_id: Option<FieldId>, | ||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||
|     stop_words: Option<fst::Set<&[u8]>>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
|     exact_attributes: HashSet<FieldId>, | ||||
| ) -> Result<()> { | ||||
|     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks | ||||
|     original_obkv_chunks | ||||
|         .par_bridge() | ||||
|         .map(|result| { | ||||
|             extract_documents_data( | ||||
|                 result, | ||||
|         .map(|original_documents_chunk| { | ||||
|             send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) | ||||
|         }) | ||||
|         .collect::<Result<()>>()?; | ||||
|  | ||||
|     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks | ||||
|         .par_bridge() | ||||
|         .map(|flattened_obkv_chunks| { | ||||
|             send_and_extract_flattened_documents_data( | ||||
|                 flattened_obkv_chunks, | ||||
|                 indexer, | ||||
|                 lmdb_writer_sx.clone(), | ||||
|                 &searchable_fields, | ||||
|                 &faceted_fields, | ||||
|                 primary_key_id, | ||||
|                 geo_field_id, | ||||
|                 geo_fields_ids, | ||||
|                 &stop_words, | ||||
|                 max_positions_per_attributes, | ||||
|             ) | ||||
| @@ -170,36 +178,48 @@ fn spawn_extraction_task<FE, FS, M>( | ||||
|     }); | ||||
| } | ||||
|  | ||||
| /// Extract chuncked data and send it into lmdb_writer_sx sender: | ||||
| /// Extract chunked data and send it into lmdb_writer_sx sender: | ||||
| /// - documents | ||||
| /// | ||||
| /// # Errors | ||||
| /// | ||||
| /// Returns the error carried by `original_documents_chunk` when it is an `Err`, | ||||
| /// or the error returned by `as_cloneable_grenad`. | ||||
| /// | ||||
| /// # Panics | ||||
| /// | ||||
| /// Panics if `lmdb_writer_sx.send` returns an error, because the result is unwrapped. | ||||
| fn send_original_documents_data( | ||||
|     original_documents_chunk: Result<grenad::Reader<File>>, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
| ) -> Result<()> { | ||||
|     // Propagate an upstream chunk error first, then convert the reader so it can be | ||||
|     // wrapped in a `TypedChunk::Documents`. | ||||
|     // NOTE(review): the invariant that makes this `unsafe` call sound is defined by | ||||
|     // `as_cloneable_grenad`, which is not visible here; confirm and add a SAFETY note. | ||||
|     let original_documents_chunk = | ||||
|         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||
|  | ||||
|     // TODO: create a custom internal error | ||||
|     // Until then a failed send aborts via `unwrap` instead of being returned to the caller. | ||||
|     lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Extract chunked data and send it into lmdb_writer_sx sender: | ||||
| /// - documents_ids | ||||
| /// - docid_word_positions | ||||
| /// - docid_fid_facet_numbers | ||||
| /// - docid_fid_facet_strings | ||||
| fn extract_documents_data( | ||||
|     documents_chunk: Result<grenad::Reader<File>>, | ||||
| fn send_and_extract_flattened_documents_data( | ||||
|     flattened_documents_chunk: Result<grenad::Reader<File>>, | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     faceted_fields: &HashSet<FieldId>, | ||||
|     primary_key_id: FieldId, | ||||
|     geo_field_id: Option<FieldId>, | ||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||
|     stop_words: &Option<fst::Set<&[u8]>>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
| ) -> Result<( | ||||
|     grenad::Reader<CursorClonableMmap>, | ||||
|     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>), | ||||
| )> { | ||||
|     let documents_chunk = documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||
|     let flattened_documents_chunk = | ||||
|         flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||
|  | ||||
|     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); | ||||
|  | ||||
|     if let Some(geo_field_id) = geo_field_id { | ||||
|         let documents_chunk_cloned = documents_chunk.clone(); | ||||
|     if let Some(geo_fields_ids) = geo_fields_ids { | ||||
|         let documents_chunk_cloned = flattened_documents_chunk.clone(); | ||||
|         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             let result = | ||||
|                 extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id); | ||||
|                 extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids); | ||||
|             let _ = match result { | ||||
|                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), | ||||
|                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)), | ||||
| @@ -211,7 +231,7 @@ fn extract_documents_data( | ||||
|         rayon::join( | ||||
|             || { | ||||
|                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( | ||||
|                     documents_chunk.clone(), | ||||
|                     flattened_documents_chunk.clone(), | ||||
|                     indexer.clone(), | ||||
|                     searchable_fields, | ||||
|                     stop_words.as_ref(), | ||||
| @@ -232,7 +252,7 @@ fn extract_documents_data( | ||||
|             || { | ||||
|                 let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = | ||||
|                     extract_fid_docid_facet_values( | ||||
|                         documents_chunk.clone(), | ||||
|                         flattened_documents_chunk.clone(), | ||||
|                         indexer.clone(), | ||||
|                         faceted_fields, | ||||
|                     )?; | ||||
|   | ||||
| @@ -30,7 +30,7 @@ use crate::update::{ | ||||
|     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, | ||||
|     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||
| }; | ||||
| use crate::{Index, Result, RoaringBitmapCodec}; | ||||
| use crate::{Index, Result, RoaringBitmapCodec, UserError}; | ||||
|  | ||||
| static MERGED_DATABASE_COUNT: usize = 7; | ||||
| static PREFIX_DATABASE_COUNT: usize = 5; | ||||
| @@ -94,15 +94,16 @@ where | ||||
|         indexer_config: &'a IndexerConfig, | ||||
|         config: IndexDocumentsConfig, | ||||
|         progress: F, | ||||
|     ) -> IndexDocuments<'t, 'u, 'i, 'a, F> { | ||||
|     ) -> Result<IndexDocuments<'t, 'u, 'i, 'a, F>> { | ||||
|         let transform = Some(Transform::new( | ||||
|             wtxn, | ||||
|             &index, | ||||
|             indexer_config, | ||||
|             config.update_method, | ||||
|             config.autogenerate_docids, | ||||
|         )); | ||||
|         )?); | ||||
|  | ||||
|         IndexDocuments { | ||||
|         Ok(IndexDocuments { | ||||
|             transform, | ||||
|             config, | ||||
|             indexer_config, | ||||
| @@ -110,7 +111,7 @@ where | ||||
|             wtxn, | ||||
|             index, | ||||
|             added_documents: 0, | ||||
|         } | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     /// Adds a batch of documents to the current builder. | ||||
| @@ -151,6 +152,10 @@ where | ||||
|             .take() | ||||
|             .expect("Invalid document addition state") | ||||
|             .output_from_sorter(self.wtxn, &self.progress)?; | ||||
|  | ||||
|         let new_facets = output.compute_real_facets(self.wtxn, self.index)?; | ||||
|         self.index.put_faceted_fields(self.wtxn, &new_facets)?; | ||||
|  | ||||
|         let indexed_documents = output.documents_count as u64; | ||||
|         let number_of_documents = self.execute_raw(output)?; | ||||
|  | ||||
| @@ -171,7 +176,8 @@ where | ||||
|             new_documents_ids, | ||||
|             replaced_documents_ids, | ||||
|             documents_count, | ||||
|             documents_file, | ||||
|             original_documents, | ||||
|             flattened_documents, | ||||
|         } = output; | ||||
|  | ||||
|         // The fields_ids_map is put back to the store now so the rest of the transaction sees an | ||||
| @@ -197,7 +203,8 @@ where | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         let documents_file = grenad::Reader::new(documents_file)?; | ||||
|         let original_documents = grenad::Reader::new(original_documents)?; | ||||
|         let flattened_documents = grenad::Reader::new(flattened_documents)?; | ||||
|  | ||||
|         // create LMDB writer channel | ||||
|         let (lmdb_writer_sx, lmdb_writer_rx): ( | ||||
| @@ -213,13 +220,20 @@ where | ||||
|             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); | ||||
|         // get filterable fields for facet databases | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|         // get the fid of the `_geo` field. | ||||
|         let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { | ||||
|         // get the fid of the `_geo.lat` and `_geo.lng` fields. | ||||
|         let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { | ||||
|             Some(gfid) => { | ||||
|                 let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); | ||||
|                 let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); | ||||
|                 // if `_geo` is faceted then we get the `lat` and `lng` | ||||
|                 if is_sortable || is_filterable { | ||||
|                     Some(gfid) | ||||
|                     let field_ids = self | ||||
|                         .index | ||||
|                         .fields_ids_map(self.wtxn)? | ||||
|                         .insert("_geo.lat") | ||||
|                         .zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng")) | ||||
|                         .ok_or(UserError::AttributeLimitReached)?; | ||||
|                     Some(field_ids) | ||||
|                 } else { | ||||
|                     None | ||||
|                 } | ||||
| @@ -239,23 +253,33 @@ where | ||||
|                 max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. | ||||
|             }; | ||||
|  | ||||
|             // split obkv file into several chuncks | ||||
|             let chunk_iter = grenad_obkv_into_chunks( | ||||
|                 documents_file, | ||||
|             // split obkv file into several chunks | ||||
|             let original_chunk_iter = grenad_obkv_into_chunks( | ||||
|                 original_documents, | ||||
|                 params.clone(), | ||||
|                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB | ||||
|             ); | ||||
|  | ||||
|             let result = chunk_iter.map(|chunk_iter| { | ||||
|             // split obkv file into several chunks | ||||
|             let flattened_chunk_iter = grenad_obkv_into_chunks( | ||||
|                 flattened_documents, | ||||
|                 params.clone(), | ||||
|                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB | ||||
|             ); | ||||
|  | ||||
|             let result = original_chunk_iter | ||||
|                 .and_then(|original_chunk_iter| Ok((original_chunk_iter, flattened_chunk_iter?))) | ||||
|                 .map(|(original_chunk, flattened_chunk)| { | ||||
|                     // extract all databases from the chunked obkv documents | ||||
|                     extract::data_from_obkv_documents( | ||||
|                     chunk_iter, | ||||
|                         original_chunk, | ||||
|                         flattened_chunk, | ||||
|                         params, | ||||
|                         lmdb_writer_sx.clone(), | ||||
|                         searchable_fields, | ||||
|                         faceted_fields, | ||||
|                         primary_key_id, | ||||
|                     geo_field_id, | ||||
|                         geo_fields_ids, | ||||
|                         stop_words, | ||||
|                         self.indexer_config.max_positions_per_attributes, | ||||
|                         exact_attributes, | ||||
| @@ -550,6 +574,7 @@ mod tests { | ||||
|  | ||||
|     use big_s::S; | ||||
|     use heed::EnvOpenOptions; | ||||
|     use maplit::hashset; | ||||
|  | ||||
|     use super::*; | ||||
|     use crate::documents::DocumentBatchBuilder; | ||||
| @@ -574,7 +599,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -589,7 +615,8 @@ mod tests { | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ { "id": 1, "name": "updated kevin" } ]); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -607,7 +634,8 @@ mod tests { | ||||
|             { "id": 2, "name": "updated kevina" }, | ||||
|             { "id": 3, "name": "updated benoit" } | ||||
|         ]); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
| @@ -639,7 +667,8 @@ mod tests { | ||||
|             ..Default::default() | ||||
|         }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -665,7 +694,8 @@ mod tests { | ||||
|         // Second we send 1 document with id 1, to force it to be merged with the previous one. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ { "id": 1, "age": 25 } ]); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -706,7 +736,8 @@ mod tests { | ||||
|         ]); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         assert!(builder.add_documents(content).is_err()); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
| @@ -735,7 +766,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -753,7 +785,8 @@ mod tests { | ||||
|         // Second we send 1 document with the generated uuid, to erase the previous ones. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -793,7 +826,8 @@ mod tests { | ||||
|         ]); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -809,7 +843,8 @@ mod tests { | ||||
|         let content = documents!([ { "name": "new kevin" } ]); | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -833,7 +868,8 @@ mod tests { | ||||
|         let content = documents!([]); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -859,7 +895,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         assert!(builder.add_documents(content).is_err()); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
| @@ -867,7 +904,8 @@ mod tests { | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         // There is a space in the document id. | ||||
|         let content = documents!([ { "id": 32, "name": "kevin" } ]); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -895,7 +933,8 @@ mod tests { | ||||
|         ]); | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -912,7 +951,7 @@ mod tests { | ||||
|         assert_eq!(result.documents_ids, vec![1]); | ||||
|  | ||||
|         // Search for a sub array sub object key | ||||
|         let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap(); | ||||
|         let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap(); | ||||
|         assert_eq!(result.documents_ids, vec![2]); | ||||
|  | ||||
|         drop(rtxn); | ||||
| @@ -940,7 +979,8 @@ mod tests { | ||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
|             ..Default::default() | ||||
|         }; | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(documents).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -950,7 +990,8 @@ mod tests { | ||||
|             update_method: IndexDocumentsMethod::UpdateDocuments, | ||||
|             ..Default::default() | ||||
|         }; | ||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         let documents = documents!([ | ||||
|           { | ||||
|             "id": 2, | ||||
| @@ -981,7 +1022,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1000,7 +1042,8 @@ mod tests { | ||||
|         ]); | ||||
|  | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); | ||||
| @@ -1011,7 +1054,8 @@ mod tests { | ||||
|         ]); | ||||
|  | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1046,7 +1090,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1080,7 +1125,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1137,13 +1183,333 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
|         wtxn.commit().unwrap(); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn index_documents_with_nested_fields() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ | ||||
|             { | ||||
|                 "id": 0, | ||||
|                 "title": "The zeroth document", | ||||
|             }, | ||||
|             { | ||||
|                 "id": 1, | ||||
|                 "title": "The first document", | ||||
|                 "nested": { | ||||
|                     "object": "field", | ||||
|                     "machin": "bidule", | ||||
|                 }, | ||||
|             }, | ||||
|             { | ||||
|                 "id": 2, | ||||
|                 "title": "The second document", | ||||
|                 "nested": [ | ||||
|                     "array", | ||||
|                     { | ||||
|                         "object": "field", | ||||
|                     }, | ||||
|                     { | ||||
|                         "prout": "truc", | ||||
|                         "machin": "lol", | ||||
|                     }, | ||||
|                 ], | ||||
|             }, | ||||
|             { | ||||
|                 "id": 3, | ||||
|                 "title": "The third document", | ||||
|                 "nested": "I lied", | ||||
|             }, | ||||
|         ]); | ||||
|  | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||
|  | ||||
|         let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; | ||||
|         builder.set_searchable_fields(searchable_fields); | ||||
|  | ||||
|         let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); | ||||
|         builder.set_filterable_fields(faceted_fields); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|         assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); | ||||
|  | ||||
|         // testing the simple query search | ||||
|         let mut search = crate::Search::new(&rtxn, &index); | ||||
|         search.query("document"); | ||||
|         search.authorize_typos(true); | ||||
|         search.optional_words(true); | ||||
|         // all documents should be returned | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids.len(), 4); | ||||
|  | ||||
|         search.query("zeroth"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![0]); | ||||
|         search.query("first"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1]); | ||||
|         search.query("second"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![2]); | ||||
|         search.query("third"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![3]); | ||||
|  | ||||
|         search.query("field"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1, 2]); | ||||
|  | ||||
|         search.query("lol"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![2]); | ||||
|  | ||||
|         search.query("object"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert!(documents_ids.is_empty()); | ||||
|  | ||||
|         search.query("array"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert!(documents_ids.is_empty()); // nested is not searchable | ||||
|  | ||||
|         search.query("lied"); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert!(documents_ids.is_empty()); // nested is not searchable | ||||
|  | ||||
|         // testing the filters | ||||
|         let mut search = crate::Search::new(&rtxn, &index); | ||||
|         search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap()); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1]); | ||||
|  | ||||
|         search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap()); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1, 2]); | ||||
|  | ||||
|         search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap()); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1]); | ||||
|  | ||||
|         search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap()); | ||||
|         let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable | ||||
|         assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); | ||||
|  | ||||
|         search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap()); | ||||
|         let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable | ||||
|         assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); | ||||
|     } | ||||
|  | ||||
|     /// Checks that a primary key declared as the nested path `nested.id` is resolved | ||||
|     /// both when a document nests `id` inside a `nested` object and when it provides | ||||
|     /// the dotted key `nested.id` directly, and that every document is then searchable. | ||||
|     #[test] | ||||
|     fn index_documents_with_nested_primary_key() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|         let config = IndexerConfig::default(); | ||||
|  | ||||
|         // declare the nested primary key before any document is added | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||
|         builder.set_primary_key("nested.id".to_owned()); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // the first three documents nest the id in an object, the last one | ||||
|         // spells the primary key as a dotted top-level key | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ | ||||
|             { | ||||
|                 "nested": { | ||||
|                     "id": 0, | ||||
|                 }, | ||||
|                 "title": "The zeroth document", | ||||
|             }, | ||||
|             { | ||||
|                 "nested": { | ||||
|                     "id": 1, | ||||
|                 }, | ||||
|                 "title": "The first document", | ||||
|             }, | ||||
|             { | ||||
|                 "nested": { | ||||
|                     "id": 2, | ||||
|                 }, | ||||
|                 "title": "The second document", | ||||
|             }, | ||||
|             { | ||||
|                 "nested.id": 3, | ||||
|                 "title": "The third document", | ||||
|             }, | ||||
|         ]); | ||||
|  | ||||
|         // the indexing config is only needed once, so it is moved instead of cloned | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         // testing the simple query search | ||||
|         let mut search = crate::Search::new(&rtxn, &index); | ||||
|         search.query("document"); | ||||
|         search.authorize_typos(true); | ||||
|         search.optional_words(true); | ||||
|         // all documents should be returned | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids.len(), 4); | ||||
|  | ||||
|         // each title word must match exactly one document | ||||
|         for (word, docid) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { | ||||
|             search.query(word); | ||||
|             let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|             assert_eq!(documents_ids, vec![docid]); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Checks that declaring a field as filterable or sortable also registers its | ||||
|     /// nested sub-fields as faceted, whichever way the nesting is spelled in the | ||||
|     /// documents (nested objects, dotted keys, or a mix of both). | ||||
|     #[test] | ||||
|     fn test_facets_generation() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|  | ||||
|         // the four documents all describe the path `dog.race.bernese mountain`, | ||||
|         // each with a different combination of nested objects and dotted keys | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let content = documents!([ | ||||
|             { | ||||
|                 "id": 0, | ||||
|                 "dog": { | ||||
|                     "race": { | ||||
|                         "bernese mountain": "zeroth", | ||||
|                     }, | ||||
|                 }, | ||||
|             }, | ||||
|             { | ||||
|                 "id": 1, | ||||
|                 "dog.race": { | ||||
|                     "bernese mountain": "first", | ||||
|                 }, | ||||
|             }, | ||||
|             { | ||||
|                 "id": 2, | ||||
|                 "dog.race.bernese mountain": "second", | ||||
|             }, | ||||
|             { | ||||
|                 "id": 3, | ||||
|                 "dog": { | ||||
|                     "race.bernese mountain": "third" | ||||
|                 }, | ||||
|             }, | ||||
|         ]); | ||||
|  | ||||
|         // index the documents | ||||
|         // (the indexing config is only needed once, so it is moved instead of cloned) | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // ---- ADD THE SETTING TO TEST THE FILTERABLE | ||||
|  | ||||
|         // add the settings | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||
|  | ||||
|         builder.set_filterable_fields(hashset!(String::from("dog"))); | ||||
|  | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         // `dog` and every nested sub-field must now be reported as faceted | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|  | ||||
|         assert_eq!(facets, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); | ||||
|  | ||||
|         // filtering on the deepest path must find each document by its own value | ||||
|         for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { | ||||
|             let mut search = crate::Search::new(&rtxn, &index); | ||||
|             let filter = format!(r#""dog.race.bernese mountain" = {s}"#); | ||||
|             search.filter(crate::Filter::from_str(&filter).unwrap().unwrap()); | ||||
|             let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|             assert_eq!(documents_ids, vec![i]); | ||||
|         } | ||||
|  | ||||
|         // ---- RESET THE SETTINGS | ||||
|  | ||||
|         // update the settings | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||
|  | ||||
|         builder.reset_filterable_fields(); | ||||
|  | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         // no faceted field must remain once the filterable fields are reset | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|  | ||||
|         assert_eq!(facets, hashset!()); | ||||
|  | ||||
|         // ---- UPDATE THE SETTINGS TO TEST THE SORTABLE | ||||
|  | ||||
|         // update the settings | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||
|  | ||||
|         builder.set_sortable_fields(hashset!(S("dog.race"))); | ||||
|  | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         // `dog.race` and its sub-field are faceted, the parent `dog` is not | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|  | ||||
|         assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); | ||||
|  | ||||
|         // ascending sort on the values "first" < "second" < "third" < "zeroth" | ||||
|         let mut search = crate::Search::new(&rtxn, &index); | ||||
|         search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( | ||||
|             "dog.race.bernese mountain", | ||||
|         )))]); | ||||
|         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||
|         assert_eq!(documents_ids, vec![1, 2, 3, 0]); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn index_2_times_documents_split_by_zero_document_indexation() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
| @@ -1162,7 +1528,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1178,7 +1545,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1199,7 +1567,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1226,7 +1595,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|   | ||||
| @@ -1,24 +1,27 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::btree_map::Entry; | ||||
| use std::collections::HashMap; | ||||
| use std::collections::hash_map::Entry; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::fs::File; | ||||
| use std::io::{Read, Seek, SeekFrom}; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use byteorder::ReadBytesExt; | ||||
| use fxhash::FxHashMap; | ||||
| use heed::RoTxn; | ||||
| use itertools::Itertools; | ||||
| use log::info; | ||||
| use obkv::{KvReader, KvWriter}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::{Map, Value}; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, | ||||
| }; | ||||
| use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; | ||||
| use super::{IndexDocumentsMethod, IndexerConfig}; | ||||
| use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; | ||||
| use crate::error::{Error, InternalError, UserError}; | ||||
| use crate::index::db_name; | ||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||
| use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; | ||||
| use crate::{ | ||||
|     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, | ||||
|     Result, BEU32, | ||||
| }; | ||||
|  | ||||
| const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | ||||
|  | ||||
| @@ -30,7 +33,8 @@ pub struct TransformOutput { | ||||
|     pub new_documents_ids: RoaringBitmap, | ||||
|     pub replaced_documents_ids: RoaringBitmap, | ||||
|     pub documents_count: usize, | ||||
|     pub documents_file: File, | ||||
|     pub original_documents: File, | ||||
|     pub flattened_documents: File, | ||||
| } | ||||
|  | ||||
| /// Extract the external ids, deduplicate and compute the new internal documents ids | ||||
| @@ -41,11 +45,17 @@ pub struct TransformOutput { | ||||
| /// containing all those documents. | ||||
| pub struct Transform<'a, 'i> { | ||||
|     pub index: &'i Index, | ||||
|     fields_ids_map: FieldsIdsMap, | ||||
|  | ||||
|     indexer_settings: &'a IndexerConfig, | ||||
|     pub autogenerate_docids: bool, | ||||
|     pub index_documents_method: IndexDocumentsMethod, | ||||
|  | ||||
|     sorter: grenad::Sorter<MergeFn>, | ||||
|     original_sorter: grenad::Sorter<MergeFn>, | ||||
|     flattened_sorter: grenad::Sorter<MergeFn>, | ||||
|     replaced_documents_ids: RoaringBitmap, | ||||
|     new_documents_ids: RoaringBitmap, | ||||
|     new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>, | ||||
|     documents_count: usize, | ||||
| } | ||||
|  | ||||
| @@ -72,6 +82,9 @@ fn create_fields_mapping( | ||||
|         .collect() | ||||
| } | ||||
|  | ||||
| /// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. | ||||
| /// It doesn't look in the subfield because we don't want to enable the | ||||
| /// primary key inference on nested objects. | ||||
| fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | ||||
|     index | ||||
|         .iter() | ||||
| @@ -83,11 +96,12 @@ fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | ||||
|  | ||||
| impl<'a, 'i> Transform<'a, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &mut heed::RwTxn, | ||||
|         index: &'i Index, | ||||
|         indexer_settings: &'a IndexerConfig, | ||||
|         index_documents_method: IndexDocumentsMethod, | ||||
|         autogenerate_docids: bool, | ||||
|     ) -> Self { | ||||
|     ) -> Result<Self> { | ||||
|         // We must choose the appropriate merge function for when two or more documents | ||||
|         // with the same user id must be merged or fully replaced in the same batch. | ||||
|         let merge_function = match index_documents_method { | ||||
| @@ -96,22 +110,36 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         }; | ||||
|  | ||||
|         // We initialize the sorter with the user indexing settings. | ||||
|         let sorter = create_sorter( | ||||
|         let original_sorter = create_sorter( | ||||
|             merge_function, | ||||
|             indexer_settings.chunk_compression_type, | ||||
|             indexer_settings.chunk_compression_level, | ||||
|             indexer_settings.max_nb_chunks, | ||||
|             indexer_settings.max_memory, | ||||
|             indexer_settings.max_memory.map(|mem| mem / 2), | ||||
|         ); | ||||
|  | ||||
|         Transform { | ||||
|         // We initialize the sorter with the user indexing settings. | ||||
|         let flattened_sorter = create_sorter( | ||||
|             merge_function, | ||||
|             indexer_settings.chunk_compression_type, | ||||
|             indexer_settings.chunk_compression_level, | ||||
|             indexer_settings.max_nb_chunks, | ||||
|             indexer_settings.max_memory.map(|mem| mem / 2), | ||||
|         ); | ||||
|  | ||||
|         Ok(Transform { | ||||
|             index, | ||||
|             fields_ids_map: index.fields_ids_map(wtxn)?, | ||||
|             indexer_settings, | ||||
|             autogenerate_docids, | ||||
|             sorter, | ||||
|             documents_count: 0, | ||||
|             original_sorter, | ||||
|             flattened_sorter, | ||||
|             index_documents_method, | ||||
|         } | ||||
|             replaced_documents_ids: RoaringBitmap::new(), | ||||
|             new_documents_ids: RoaringBitmap::new(), | ||||
|             new_external_documents_ids_builder: FxHashMap::default(), | ||||
|             documents_count: 0, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn read_documents<R, F>( | ||||
| @@ -125,8 +153,11 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         F: Fn(UpdateIndexingStep) + Sync, | ||||
|     { | ||||
|         let fields_index = reader.index(); | ||||
|         let mut fields_ids_map = self.index.fields_ids_map(wtxn)?; | ||||
|         let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?; | ||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||
|         let documents_ids = self.index.documents_ids(wtxn)?; | ||||
|         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); | ||||
|  | ||||
|         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; | ||||
|  | ||||
|         let alternative_name = self | ||||
|             .index | ||||
| @@ -136,15 +167,19 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|  | ||||
|         let (primary_key_id, primary_key_name) = compute_primary_key_pair( | ||||
|             self.index.primary_key(wtxn)?, | ||||
|             &mut fields_ids_map, | ||||
|             &mut self.fields_ids_map, | ||||
|             alternative_name, | ||||
|             self.autogenerate_docids, | ||||
|         )?; | ||||
|  | ||||
|         let primary_key_id_nested = primary_key_name.contains('.'); | ||||
|  | ||||
|         let mut flattened_document = None; | ||||
|         let mut obkv_buffer = Vec::new(); | ||||
|         let mut flattened_obkv_buffer = Vec::new(); | ||||
|         let mut documents_count = 0; | ||||
|         let mut external_id_buffer = Vec::new(); | ||||
|         let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); | ||||
|         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); | ||||
|         while let Some((addition_index, document)) = reader.next_document_with_index()? { | ||||
|             let mut field_buffer_cache = drop_and_reuse(field_buffer); | ||||
|             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||
| @@ -154,8 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             } | ||||
|  | ||||
|             for (k, v) in document.iter() { | ||||
|                 let mapped_id = *mapping.get(&k).unwrap(); | ||||
|                 field_buffer_cache.push((mapped_id, v)); | ||||
|                 let mapped_id = | ||||
|                     *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; | ||||
|                 field_buffer_cache.push((mapped_id, Cow::from(v))); | ||||
|             } | ||||
|  | ||||
|             // We need to make sure that every document has a primary key. After we have remapped | ||||
| @@ -164,56 +200,38 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             // document. If none is found, and we were told to generate missing document ids, then | ||||
|             // we create the missing field, and update the new document. | ||||
|             let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; | ||||
|             let external_id = | ||||
|                 match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { | ||||
|                     Some((_, bytes)) => { | ||||
|                         let value = match serde_json::from_slice(bytes).unwrap() { | ||||
|                             Value::String(string) => match validate_document_id(&string) { | ||||
|                                 Some(s) if s.len() == string.len() => string, | ||||
|                                 Some(s) => s.to_string(), | ||||
|                                 None => { | ||||
|                                     return Err(UserError::InvalidDocumentId { | ||||
|                                         document_id: Value::String(string), | ||||
|                                     } | ||||
|                                     .into()) | ||||
|                                 } | ||||
|                             }, | ||||
|                             Value::Number(number) => number.to_string(), | ||||
|                             content => { | ||||
|                                 return Err(UserError::InvalidDocumentId { | ||||
|                                     document_id: content.clone(), | ||||
|                                 } | ||||
|                                 .into()) | ||||
|                             } | ||||
|                         }; | ||||
|                         serde_json::to_writer(&mut external_id_buffer, &value).unwrap(); | ||||
|                         Cow::Owned(value) | ||||
|                     } | ||||
|                     None => { | ||||
|                         if !self.autogenerate_docids { | ||||
|                             let mut json = Map::new(); | ||||
|                             for (key, value) in document.iter() { | ||||
|                                 let key = addition_index.name(key).cloned(); | ||||
|                                 let value = serde_json::from_slice::<Value>(&value).ok(); | ||||
|             let external_id = if primary_key_id_nested { | ||||
|                 let mut field_buffer_cache = field_buffer_cache.clone(); | ||||
|                 self.flatten_from_field_mapping( | ||||
|                     &mapping, | ||||
|                     &document, | ||||
|                     &mut flattened_obkv_buffer, | ||||
|                     &mut field_buffer_cache, | ||||
|                 )?; | ||||
|                 flattened_document = Some(&flattened_obkv_buffer); | ||||
|                 let document = KvReader::new(&flattened_obkv_buffer); | ||||
|  | ||||
|                                 if let Some((k, v)) = key.zip(value) { | ||||
|                                     json.insert(k, v); | ||||
|                                 } | ||||
|                             } | ||||
|  | ||||
|                             return Err(UserError::MissingDocumentId { | ||||
|                                 primary_key: primary_key_name, | ||||
|                                 document: json, | ||||
|                             } | ||||
|                             .into()); | ||||
|                         } | ||||
|  | ||||
|                         let uuid = | ||||
|                             uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); | ||||
|                         serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap(); | ||||
|                         field_buffer_cache.push((primary_key_id, &external_id_buffer)); | ||||
|                         Cow::Borrowed(&*uuid) | ||||
|                     } | ||||
|                 update_primary_key( | ||||
|                     document, | ||||
|                     &addition_index, | ||||
|                     primary_key_id, | ||||
|                     &primary_key_name, | ||||
|                     &mut uuid_buffer, | ||||
|                     &mut field_buffer_cache, | ||||
|                     &mut external_id_buffer, | ||||
|                     self.autogenerate_docids, | ||||
|                 )? | ||||
|             } else { | ||||
|                 update_primary_key( | ||||
|                     document, | ||||
|                     &addition_index, | ||||
|                     primary_key_id, | ||||
|                     &primary_key_name, | ||||
|                     &mut uuid_buffer, | ||||
|                     &mut field_buffer_cache, | ||||
|                     &mut external_id_buffer, | ||||
|                     self.autogenerate_docids, | ||||
|                 )? | ||||
|             }; | ||||
|  | ||||
|             // Insertion in a obkv need to be done with keys ordered. For now they are ordered | ||||
| @@ -221,30 +239,86 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             // fieldids map keys order. | ||||
|             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); | ||||
|  | ||||
|             // The last step is to build the new obkv document, and insert it in the sorter. | ||||
|             // Build the new obkv document. | ||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||
|             for (k, v) in field_buffer_cache.iter() { | ||||
|                 writer.insert(*k, v)?; | ||||
|             } | ||||
|  | ||||
|             let (docid, should_insert_original_document) = | ||||
|                 match external_documents_ids.get(&*external_id) { | ||||
|                     // if the document is in the db but has already been inserted | ||||
|                     // (ie: already exists in the list of replaced documents ids), | ||||
|                     // we should not add the original document a second time. | ||||
|                     Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), | ||||
|                     None => { | ||||
|                         // if the document has already been inserted in this | ||||
|                         // batch we need to get its docid | ||||
|                         match self | ||||
|                             .new_external_documents_ids_builder | ||||
|                             .entry(external_id.as_bytes().to_vec()) | ||||
|                         { | ||||
|                             Entry::Occupied(entry) => (*entry.get() as u32, false), | ||||
|                             // if the document has never been encountered we give it a new docid | ||||
|                             // and push this new docid to the external documents ids builder | ||||
|                             Entry::Vacant(entry) => { | ||||
|                                 let new_docid = available_documents_ids | ||||
|                                     .next() | ||||
|                                     .ok_or(UserError::DocumentLimitReached)?; | ||||
|                                 entry.insert(new_docid as u64); | ||||
|                                 (new_docid, false) | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 }; | ||||
|  | ||||
|             if should_insert_original_document { | ||||
|                 self.replaced_documents_ids.insert(docid); | ||||
|  | ||||
|                 let key = BEU32::new(docid); | ||||
|                 let base_obkv = self | ||||
|                     .index | ||||
|                     .documents | ||||
|                     .remap_data_type::<heed::types::ByteSlice>() | ||||
|                     .get(wtxn, &key)? | ||||
|                     .ok_or(InternalError::DatabaseMissingEntry { | ||||
|                         db_name: db_name::DOCUMENTS, | ||||
|                         key: None, | ||||
|                     })?; | ||||
|  | ||||
|                 self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; | ||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; | ||||
|  | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||
|             } else { | ||||
|                 self.new_documents_ids.insert(docid); | ||||
|             } | ||||
|  | ||||
|             // We use the extracted/generated user id as the key for this document. | ||||
|             self.sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; | ||||
|             self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; | ||||
|             documents_count += 1; | ||||
|  | ||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||
|                 documents_seen: documents_count, | ||||
|             }); | ||||
|  | ||||
|             obkv_buffer.clear(); | ||||
|             field_buffer = drop_and_reuse(field_buffer_cache); | ||||
|             external_id_buffer.clear(); | ||||
|             if let Some(flatten) = flattened_document { | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; | ||||
|             } else { | ||||
|                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||
|             } | ||||
|  | ||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||
|                 documents_seen: documents_count, | ||||
|             }); | ||||
|  | ||||
|         self.index.put_fields_ids_map(wtxn, &fields_ids_map)?; | ||||
|             field_buffer = drop_and_reuse(field_buffer_cache); | ||||
|             external_id_buffer.clear(); | ||||
|             obkv_buffer.clear(); | ||||
|         } | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||
|             documents_seen: documents_count, | ||||
|         }); | ||||
|  | ||||
|         self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; | ||||
|         self.index.put_primary_key(wtxn, &primary_key_name)?; | ||||
|         self.documents_count += documents_count; | ||||
|         // Now that we have a valid sorter that contains the user id and the obkv we | ||||
| @@ -252,6 +326,87 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         Ok(documents_count) | ||||
|     } | ||||
|  | ||||
|     // Rebuild the JSON document stored in `obkv` by resolving every field id through | ||||
|     // `self.fields_ids_map`, flatten it, and return the flattened document encoded as obkv. | ||||
|     // Every field name produced by the flattening is inserted in `self.fields_ids_map`. | ||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> { | ||||
|         // Decode each entry back into a `name -> JSON value` pair. | ||||
|         let mut document = serde_json::Map::new(); | ||||
|         for (field_id, bytes) in obkv.iter() { | ||||
|             let name = match self.fields_ids_map.name(field_id) { | ||||
|                 Some(name) => name, | ||||
|                 None => { | ||||
|                     return Err(FieldIdMapMissingEntry::FieldId { | ||||
|                         field_id, | ||||
|                         process: "Flatten from fields ids map.", | ||||
|                     } | ||||
|                     .into()) | ||||
|                 } | ||||
|             }; | ||||
|             let json = serde_json::from_slice::<serde_json::Value>(bytes) | ||||
|                 .map_err(InternalError::SerdeJson)?; | ||||
|             document.insert(name.to_string(), json); | ||||
|         } | ||||
|  | ||||
|         // Fields already known to the map are ordered by their id; fields that are still | ||||
|         // unknown all share `FieldId::MAX` as sort key and therefore end up last. | ||||
|         let mut entries: Vec<_> = flatten_serde_json::flatten(&document).into_iter().collect(); | ||||
|         entries.sort_unstable_by_key(|(name, _)| { | ||||
|             self.fields_ids_map.id(name).unwrap_or(FieldId::MAX) | ||||
|         }); | ||||
|  | ||||
|         // Write the flattened document back as obkv, registering in the fields ids map | ||||
|         // the fields generated by the flattening (if any). | ||||
|         let mut output: Vec<u8> = Vec::new(); | ||||
|         let mut obkv_writer = KvWriter::new(&mut output); | ||||
|         for (name, json) in entries { | ||||
|             let field_id = | ||||
|                 self.fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; | ||||
|             let encoded = serde_json::to_vec(&json).map_err(InternalError::SerdeJson)?; | ||||
|             obkv_writer.insert(field_id, &encoded)?; | ||||
|         } | ||||
|  | ||||
|         Ok(output) | ||||
|     } | ||||
|  | ||||
|     // Flatten a document from a field mapping generated by [create_fields_mapping]. | ||||
|     // | ||||
|     // Each key of `obkv` is first translated through `mapping` and the resulting field id | ||||
|     // is resolved to its name with `self.fields_ids_map`. The flattened document is written | ||||
|     // as obkv into `output_buffer`, which is cleared beforehand. Every field produced by the | ||||
|     // flattening is inserted in `self.fields_ids_map`, and the fields whose id is not yet | ||||
|     // present in `field_buffer_cache` are appended to it with their serialized value. | ||||
|     // | ||||
|     // Returns an error if a key is missing from `mapping` or from the fields ids map, if a | ||||
|     // value cannot be (de)serialized as JSON, or if inserting a field in the fields ids map | ||||
|     // fails (`UserError::AttributeLimitReached`). | ||||
|     fn flatten_from_field_mapping( | ||||
|         &mut self, | ||||
|         mapping: &HashMap<FieldId, FieldId>, | ||||
|         obkv: &KvReader<FieldId>, | ||||
|         output_buffer: &mut Vec<u8>, | ||||
|         field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, | ||||
|     ) -> Result<()> { | ||||
|         // if the primary_key is nested we need to flatten the document before being able to do anything | ||||
|         let mut doc = serde_json::Map::new(); | ||||
|  | ||||
|         for (k, v) in obkv.iter() { | ||||
|             let key = | ||||
|                 mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; | ||||
|             let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                 field_id: *key, | ||||
|                 process: "Flatten from field mapping.", | ||||
|             })?; | ||||
|             let value = | ||||
|                 serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?; | ||||
|             doc.insert(key.to_string(), value); | ||||
|         } | ||||
|  | ||||
|         let flattened = flatten_serde_json::flatten(&doc); | ||||
|  | ||||
|         // Once we have the flattened version we can convert it back to obkv and | ||||
|         // insert all the newly generated fields ids (if any) in the fields ids map. | ||||
|         output_buffer.clear(); | ||||
|         let mut writer = KvWriter::new(output_buffer); | ||||
|         let mut flattened: Vec<_> = flattened.into_iter().collect(); | ||||
|         // We reorder the fields to get all the known fields first: unknown fields all | ||||
|         // share `FieldId::MAX` as sort key. | ||||
|         flattened | ||||
|             .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(key).unwrap_or(FieldId::MAX)); | ||||
|  | ||||
|         for (key, value) in flattened { | ||||
|             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||
|             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||
|             writer.insert(fid, &value)?; | ||||
|             // Only the fields that are not cached yet are added: an entry already present | ||||
|             // in the cache for this field id is left untouched. | ||||
|             if !field_buffer_cache.iter().any(|(id, _)| *id == fid) { | ||||
|                 field_buffer_cache.push((fid, value.into())); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Generate the `TransformOutput` based on the given sorter that can be generated from any | ||||
|     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document | ||||
|     /// id for the user side and the value must be an obkv where keys are valid fields ids. | ||||
| @@ -268,110 +423,8 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             .primary_key(&wtxn)? | ||||
|             .ok_or(Error::UserError(UserError::MissingPrimaryKey))? | ||||
|             .to_string(); | ||||
|         let fields_ids_map = self.index.fields_ids_map(wtxn)?; | ||||
|         let approximate_number_of_documents = self.documents_count; | ||||
|  | ||||
|         let mut external_documents_ids = self.index.external_documents_ids(wtxn).unwrap(); | ||||
|         let documents_ids = self.index.documents_ids(wtxn)?; | ||||
|         let mut field_distribution = self.index.field_distribution(wtxn)?; | ||||
|         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); | ||||
|  | ||||
|         // consume sorter, in order to free the internal allocation, before creating a new one. | ||||
|         let mut iter = self.sorter.into_stream_merger_iter()?; | ||||
|  | ||||
|         // Once we have sort and deduplicated the documents we write them into a final file. | ||||
|         let mut final_sorter = create_sorter( | ||||
|             |_id, obkvs| { | ||||
|                 if obkvs.len() == 1 { | ||||
|                     Ok(obkvs[0].clone()) | ||||
|                 } else { | ||||
|                     Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) | ||||
|                 } | ||||
|             }, | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             self.indexer_settings.max_nb_chunks, | ||||
|             self.indexer_settings.max_memory, | ||||
|         ); | ||||
|         let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); | ||||
|         let mut replaced_documents_ids = RoaringBitmap::new(); | ||||
|         let mut new_documents_ids = RoaringBitmap::new(); | ||||
|         let mut obkv_buffer = Vec::new(); | ||||
|  | ||||
|         // While we write into final file we get or generate the internal documents ids. | ||||
|         let mut documents_count = 0; | ||||
|         while let Some((external_id, update_obkv)) = iter.next()? { | ||||
|             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||
|                 progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { | ||||
|                     documents_seen: documents_count, | ||||
|                     total_documents: approximate_number_of_documents, | ||||
|                 }); | ||||
|             } | ||||
|  | ||||
|             let (docid, obkv) = match external_documents_ids.get(external_id) { | ||||
|                 Some(docid) => { | ||||
|                     // If we find the user id in the current external documents ids map | ||||
|                     // we use it and insert it in the list of replaced documents. | ||||
|                     replaced_documents_ids.insert(docid); | ||||
|  | ||||
|                     let key = BEU32::new(docid); | ||||
|                     let base_obkv = self.index.documents.get(wtxn, &key)?.ok_or( | ||||
|                         InternalError::DatabaseMissingEntry { | ||||
|                             db_name: db_name::DOCUMENTS, | ||||
|                             key: None, | ||||
|                         }, | ||||
|                     )?; | ||||
|  | ||||
|                     // we remove all the fields that were already counted | ||||
|                     for (field_id, _) in base_obkv.iter() { | ||||
|                         let field_name = fields_ids_map.name(field_id).unwrap(); | ||||
|                         if let Entry::Occupied(mut entry) = | ||||
|                             field_distribution.entry(field_name.to_string()) | ||||
|                         { | ||||
|                             match entry.get().checked_sub(1) { | ||||
|                                 Some(0) | None => entry.remove(), | ||||
|                                 Some(count) => entry.insert(count), | ||||
|                             }; | ||||
|                         } | ||||
|                     } | ||||
|  | ||||
|                     // Depending on the update indexing method we will merge | ||||
|                     // the document update with the current document or not. | ||||
|                     match self.index_documents_method { | ||||
|                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), | ||||
|                         IndexDocumentsMethod::UpdateDocuments => { | ||||
|                             let update_obkv = obkv::KvReader::new(update_obkv); | ||||
|                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); | ||||
|                             (docid, obkv_buffer.as_slice()) | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 None => { | ||||
|                     // If this user id is new we add it to the external documents ids map | ||||
|                     // for new ids and into the list of new documents. | ||||
|                     let new_docid = | ||||
|                         available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; | ||||
|                     new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; | ||||
|                     new_documents_ids.insert(new_docid); | ||||
|                     (new_docid, update_obkv) | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
|             // We insert the document under the documents ids map into the final file. | ||||
|             final_sorter.insert(docid.to_be_bytes(), obkv)?; | ||||
|             documents_count += 1; | ||||
|  | ||||
|             let reader = obkv::KvReader::new(obkv); | ||||
|             for (field_id, _) in reader.iter() { | ||||
|                 let field_name = fields_ids_map.name(field_id).unwrap(); | ||||
|                 *field_distribution.entry(field_name.to_string()).or_default() += 1; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { | ||||
|             documents_seen: documents_count, | ||||
|             total_documents: documents_count, | ||||
|         }); | ||||
|         let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut writer = create_writer( | ||||
| @@ -380,28 +433,103 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             tempfile::tempfile()?, | ||||
|         ); | ||||
|  | ||||
|         // Once we have all the documents in the sorter, we write the documents | ||||
|         // in the writer. We also generate the field distribution. | ||||
|         let mut field_distribution = self.index.field_distribution(wtxn)?; | ||||
|         let mut iter = self.original_sorter.into_stream_merger_iter()?; | ||||
|         // used only for the callback | ||||
|         let mut documents_count = 0; | ||||
|  | ||||
|         while let Some((key, val)) = iter.next()? { | ||||
|             // send a callback to show at which step we are | ||||
|             documents_count += 1; | ||||
|             progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { | ||||
|                 documents_seen: documents_count, | ||||
|                 total_documents: self.documents_count, | ||||
|             }); | ||||
|  | ||||
|             let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?; | ||||
|             // if the document was already in the db we remove all of its field | ||||
|             // from the field distribution. | ||||
|             if self.replaced_documents_ids.contains(u32_key) { | ||||
|                 let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or( | ||||
|                     InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||
|                 )?; | ||||
|  | ||||
|                 for (key, _) in obkv.iter() { | ||||
|                     let name = | ||||
|                         self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                             field_id: key, | ||||
|                             process: "Computing field distribution in transform.", | ||||
|                         })?; | ||||
|                     // We checked that the document was in the db earlier. If we can't find it it means | ||||
|                     // there is an inconsistency between the field distribution and the field id map. | ||||
|                     let field = field_distribution.get_mut(name).ok_or( | ||||
|                         FieldIdMapMissingEntry::FieldId { | ||||
|                             field_id: key, | ||||
|                             process: "Accessing field distribution in transform.", | ||||
|                         }, | ||||
|                     )?; | ||||
|                     *field -= 1; | ||||
|                     if *field == 0 { | ||||
|                         // since we were able to get the field right before it's safe to unwrap here | ||||
|                         field_distribution.remove(name).unwrap(); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // We increment all the field of the current document in the field distribution. | ||||
|             let obkv = KvReader::new(val); | ||||
|  | ||||
|             for (key, _) in obkv.iter() { | ||||
|                 let name = | ||||
|                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                         field_id: key, | ||||
|                         process: "Computing field distribution in transform.", | ||||
|                     })?; | ||||
|                 *field_distribution.entry(name.to_string()).or_insert(0) += 1; | ||||
|             } | ||||
|             writer.insert(key, val)?; | ||||
|         } | ||||
|  | ||||
|         let mut original_documents = writer.into_inner()?; | ||||
|         // We then extract the file and reset the seek to be able to read it again. | ||||
|         original_documents.seek(SeekFrom::Start(0))?; | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut writer = create_writer( | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
|         ); | ||||
|         // Once we have written all the documents into the final sorter, we write the documents | ||||
|         // into this writer, extract the file and reset the seek to be able to read it again. | ||||
|         final_sorter.write_into_stream_writer(&mut writer)?; | ||||
|         let mut documents_file = writer.into_inner()?; | ||||
|         documents_file.seek(SeekFrom::Start(0))?; | ||||
|         self.flattened_sorter.write_into_stream_writer(&mut writer)?; | ||||
|         let mut flattened_documents = writer.into_inner()?; | ||||
|         flattened_documents.seek(SeekFrom::Start(0))?; | ||||
|  | ||||
|         let before_docids_merging = Instant::now(); | ||||
|         // We merge the new external ids with existing external documents ids. | ||||
|         let new_external_documents_ids = new_external_documents_ids_builder.into_map(); | ||||
|         let mut new_external_documents_ids_builder: Vec<_> = | ||||
|             self.new_external_documents_ids_builder.into_iter().collect(); | ||||
|  | ||||
|         new_external_documents_ids_builder | ||||
|             .sort_unstable_by(|(left, _), (right, _)| left.cmp(&right)); | ||||
|         let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory(); | ||||
|         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { | ||||
|             fst_new_external_documents_ids_builder.insert(key, value) | ||||
|         })?; | ||||
|         let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); | ||||
|         external_documents_ids.insert_ids(&new_external_documents_ids)?; | ||||
|  | ||||
|         info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); | ||||
|  | ||||
|         Ok(TransformOutput { | ||||
|             primary_key, | ||||
|             fields_ids_map, | ||||
|             fields_ids_map: self.fields_ids_map, | ||||
|             field_distribution, | ||||
|             external_documents_ids: external_documents_ids.into_static(), | ||||
|             new_documents_ids, | ||||
|             replaced_documents_ids, | ||||
|             documents_count, | ||||
|             documents_file, | ||||
|             new_documents_ids: self.new_documents_ids, | ||||
|             replaced_documents_ids: self.replaced_documents_ids, | ||||
|             documents_count: self.documents_count, | ||||
|             original_documents, | ||||
|             flattened_documents, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
| @@ -412,7 +540,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         self, | ||||
|         wtxn: &mut heed::RwTxn, | ||||
|         old_fields_ids_map: FieldsIdsMap, | ||||
|         new_fields_ids_map: FieldsIdsMap, | ||||
|         mut new_fields_ids_map: FieldsIdsMap, | ||||
|     ) -> Result<TransformOutput> { | ||||
|         // There already has been a document addition, the primary key should be set by now. | ||||
|         let primary_key = | ||||
| @@ -423,7 +551,14 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         let documents_count = documents_ids.len() as usize; | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut writer = create_writer( | ||||
|         let mut original_writer = create_writer( | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
|         ); | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut flattened_writer = create_writer( | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
| @@ -445,13 +580,51 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             } | ||||
|  | ||||
|             let buffer = obkv_writer.into_inner()?; | ||||
|             writer.insert(docid.to_be_bytes(), buffer)?; | ||||
|             original_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||
|  | ||||
|             // Once we have the document. We're going to flatten it | ||||
|             // and insert it in the flattened sorter. | ||||
|             let mut doc = serde_json::Map::new(); | ||||
|  | ||||
|             let reader = obkv::KvReader::new(buffer); | ||||
|             for (k, v) in reader.iter() { | ||||
|                 let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                     field_id: k, | ||||
|                     process: "Accessing field distribution in transform.", | ||||
|                 })?; | ||||
|                 let value = serde_json::from_slice::<serde_json::Value>(v) | ||||
|                     .map_err(InternalError::SerdeJson)?; | ||||
|                 doc.insert(key.to_string(), value); | ||||
|             } | ||||
|  | ||||
|             let flattened = flatten_serde_json::flatten(&doc); | ||||
|  | ||||
|             // Once we have the flattened version we can convert it back to obkv and | ||||
|             // insert all the new generated fields_ids (if any) in the fields ids map. | ||||
|             let mut buffer: Vec<u8> = Vec::new(); | ||||
|             let mut writer = KvWriter::new(&mut buffer); | ||||
|             let mut flattened: Vec<_> = flattened.into_iter().collect(); | ||||
|             // we reorder the field to get all the known field first | ||||
|             flattened.sort_unstable_by_key(|(key, _)| { | ||||
|                 new_fields_ids_map.id(&key).unwrap_or(FieldId::MAX) | ||||
|             }); | ||||
|  | ||||
|             for (key, value) in flattened { | ||||
|                 let fid = | ||||
|                     new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||
|                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||
|                 writer.insert(fid, &value)?; | ||||
|             } | ||||
|             flattened_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||
|         } | ||||
|  | ||||
|         // Once we have written all the documents, we extract | ||||
|         // the file and reset the seek to be able to read it again. | ||||
|         let mut documents_file = writer.into_inner()?; | ||||
|         documents_file.seek(SeekFrom::Start(0))?; | ||||
|         let mut original_documents = original_writer.into_inner()?; | ||||
|         original_documents.seek(SeekFrom::Start(0))?; | ||||
|  | ||||
|         let mut flattened_documents = flattened_writer.into_inner()?; | ||||
|         flattened_documents.seek(SeekFrom::Start(0))?; | ||||
|  | ||||
|         Ok(TransformOutput { | ||||
|             primary_key, | ||||
| @@ -461,7 +634,8 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             new_documents_ids: documents_ids, | ||||
|             replaced_documents_ids: RoaringBitmap::default(), | ||||
|             documents_count, | ||||
|             documents_file, | ||||
|             original_documents, | ||||
|             flattened_documents, | ||||
|         }) | ||||
|     } | ||||
| } | ||||
| @@ -521,11 +695,84 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> { | ||||
|     vec.into_iter().map(|_| unreachable!()).collect() | ||||
| } | ||||
|  | ||||
| /// Computes the external id of a document and returns it. | ||||
| /// | ||||
| /// The entry registered under `primary_key_id` in `field_buffer_cache` is looked up: | ||||
| /// | ||||
| /// - When it exists, its bytes are parsed as JSON. A string is passed through | ||||
| ///   `validate_document_id` and the returned id is used; a number is converted to its | ||||
| ///   string form. The id is serialized as JSON into `external_id_buffer` and returned | ||||
| ///   as an owned string. | ||||
| /// - When it does not exist and `autogenerate_docids` is `true`, a hyphenated lowercase | ||||
| ///   UUID v4 is encoded into `uuid_buffer`, serialized as JSON into `external_id_buffer`, | ||||
| ///   and pushed into `field_buffer_cache` under `primary_key_id`. The returned string | ||||
| ///   borrows from `uuid_buffer`. | ||||
| /// | ||||
| /// # Errors | ||||
| /// | ||||
| /// - `UserError::InvalidDocumentId` when the stored value is neither a number nor a | ||||
| ///   string accepted by `validate_document_id`. | ||||
| /// - `UserError::MissingDocumentId` when the document has no primary key entry and ids | ||||
| ///   must not be generated. The error embeds the document rebuilt from `document` and | ||||
| ///   `addition_index`; entries whose name or value cannot be resolved are left out. | ||||
| /// - `InternalError::SerdeJson` when a JSON (de)serialization fails. | ||||
| fn update_primary_key<'a>( | ||||
|     document: KvReader<'a, FieldId>, | ||||
|     addition_index: &DocumentsBatchIndex, | ||||
|     primary_key_id: FieldId, | ||||
|     primary_key_name: &str, | ||||
|     uuid_buffer: &'a mut [u8; uuid::adapter::Hyphenated::LENGTH], | ||||
|     field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, | ||||
|     mut external_id_buffer: &'a mut Vec<u8>, | ||||
|     autogenerate_docids: bool, | ||||
| ) -> Result<Cow<'a, str>> { | ||||
|     match field_buffer_cache.iter().find(|(id, _)| *id == primary_key_id) { | ||||
|         Some((_, bytes)) => { | ||||
|             let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { | ||||
|                 Value::String(string) => match validate_document_id(&string) { | ||||
|                     // The validated id has the same length as the input: reuse the | ||||
|                     // allocation instead of copying it. | ||||
|                     Some(s) if s.len() == string.len() => string, | ||||
|                     Some(s) => s.to_string(), | ||||
|                     None => { | ||||
|                         return Err(UserError::InvalidDocumentId { | ||||
|                             document_id: Value::String(string), | ||||
|                         } | ||||
|                         .into()) | ||||
|                     } | ||||
|                 }, | ||||
|                 Value::Number(number) => number.to_string(), | ||||
|                 // `content` is owned and not used afterwards, it can be moved in the error. | ||||
|                 content => { | ||||
|                     return Err(UserError::InvalidDocumentId { document_id: content }.into()) | ||||
|                 } | ||||
|             }; | ||||
|             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; | ||||
|             Ok(Cow::Owned(value)) | ||||
|         } | ||||
|         None if autogenerate_docids => { | ||||
|             let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); | ||||
|             serde_json::to_writer(&mut external_id_buffer, &uuid) | ||||
|                 .map_err(InternalError::SerdeJson)?; | ||||
|             // Register the generated id as the value of the primary key field. | ||||
|             field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); | ||||
|             Ok(Cow::Borrowed(&*uuid)) | ||||
|         } | ||||
|         None => { | ||||
|             // Rebuild the document as JSON so that it can be reported in the error. | ||||
|             let mut json = Map::new(); | ||||
|             for (key, value) in document.iter() { | ||||
|                 let key = addition_index.name(key).cloned(); | ||||
|                 let value = serde_json::from_slice::<Value>(value).ok(); | ||||
|  | ||||
|                 if let Some((k, v)) = key.zip(value) { | ||||
|                     json.insert(k, v); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             Err(UserError::MissingDocumentId { | ||||
|                 primary_key: primary_key_name.to_string(), | ||||
|                 document: json, | ||||
|             } | ||||
|             .into()) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl TransformOutput { | ||||
|     /// Returns the names of all the fields of `fields_ids_map` for which `crate::is_faceted` | ||||
|     /// holds, given the user-defined faceted fields read from `index` through `rtxn`. | ||||
|     pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> { | ||||
|         let declared_facets = index.user_defined_faceted_fields(rtxn)?; | ||||
|  | ||||
|         let mut real_facets = HashSet::new(); | ||||
|         for name in self.fields_ids_map.names() { | ||||
|             if crate::is_faceted(name, &declared_facets) { | ||||
|                 real_facets.insert(name.to_string()); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(real_facets) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use super::*; | ||||
|  | ||||
|     mod compute_primary_key { | ||||
|         use big_s::S; | ||||
|  | ||||
|         use super::{compute_primary_key_pair, FieldsIdsMap}; | ||||
|  | ||||
|         #[test] | ||||
| @@ -540,6 +787,18 @@ mod test { | ||||
|             ); | ||||
|             assert_eq!(result.unwrap(), (0, "toto".to_string())); | ||||
|             assert_eq!(fields_map.len(), 1); | ||||
|  | ||||
|             // and with nested fields | ||||
|             let mut fields_map = FieldsIdsMap::new(); | ||||
|             fields_map.insert("toto.tata").unwrap(); | ||||
|             let result = compute_primary_key_pair( | ||||
|                 Some("toto.tata"), | ||||
|                 &mut fields_map, | ||||
|                 Some(S("titi")), | ||||
|                 false, | ||||
|             ); | ||||
|             assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); | ||||
|             assert_eq!(fields_map.len(), 1); | ||||
|         } | ||||
|  | ||||
|         #[test] | ||||
| @@ -547,7 +806,7 @@ mod test { | ||||
|             let mut fields_map = FieldsIdsMap::new(); | ||||
|             let result = | ||||
|                 compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); | ||||
|             assert_eq!(result.unwrap(), (0, "tata".to_string())); | ||||
|             assert_eq!(result.unwrap(), (0, S("tata"))); | ||||
|             assert_eq!(fields_map.len(), 1); | ||||
|         } | ||||
|  | ||||
| @@ -555,7 +814,7 @@ mod test { | ||||
|         fn should_return_default_if_both_are_none() { | ||||
|             let mut fields_map = FieldsIdsMap::new(); | ||||
|             let result = compute_primary_key_pair(None, &mut fields_map, None, true); | ||||
|             assert_eq!(result.unwrap(), (0, "id".to_string())); | ||||
|             assert_eq!(result.unwrap(), (0, S("id"))); | ||||
|             assert_eq!(fields_map.len(), 1); | ||||
|         } | ||||
|  | ||||
| @@ -569,6 +828,7 @@ mod test { | ||||
|     } | ||||
|  | ||||
|     mod primary_key_inference { | ||||
|         use big_s::S; | ||||
|         use bimap::BiHashMap; | ||||
|  | ||||
|         use crate::documents::DocumentsBatchIndex; | ||||
| @@ -579,11 +839,11 @@ mod test { | ||||
|             // We run the test multiple times to change the order in which the fields are iterated upon. | ||||
|             for _ in 1..50 { | ||||
|                 let mut map = BiHashMap::new(); | ||||
|                 map.insert(1, "fakeId".to_string()); | ||||
|                 map.insert(2, "fakeId".to_string()); | ||||
|                 map.insert(3, "fakeId".to_string()); | ||||
|                 map.insert(4, "fakeId".to_string()); | ||||
|                 map.insert(0, "realId".to_string()); | ||||
|                 map.insert(1, S("fakeId")); | ||||
|                 map.insert(2, S("fakeId")); | ||||
|                 map.insert(3, S("fakeId")); | ||||
|                 map.insert(4, S("fakeId")); | ||||
|                 map.insert(0, S("realId")); | ||||
|  | ||||
|                 assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); | ||||
|             } | ||||
|   | ||||
| @@ -249,11 +249,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         } | ||||
|  | ||||
|         let transform = Transform::new( | ||||
|             self.wtxn, | ||||
|             &self.index, | ||||
|             &self.indexer_config, | ||||
|             IndexDocumentsMethod::ReplaceDocuments, | ||||
|             false, | ||||
|         ); | ||||
|         )?; | ||||
|  | ||||
|         // We remap the documents fields based on the new `FieldsIdsMap`. | ||||
|         let output = transform.remap_index_documents( | ||||
| @@ -262,6 +263,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|             fields_ids_map.clone(), | ||||
|         )?; | ||||
|  | ||||
|         let new_facets = output.compute_real_facets(self.wtxn, self.index)?; | ||||
|         self.index.put_faceted_fields(self.wtxn, &new_facets)?; | ||||
|  | ||||
|         // We clear the full database (words-fst, documents ids and documents content). | ||||
|         ClearDocuments::new(self.wtxn, self.index).execute()?; | ||||
|  | ||||
| @@ -273,7 +277,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|             &self.indexer_config, | ||||
|             IndexDocumentsConfig::default(), | ||||
|             &cb, | ||||
|         ); | ||||
|         )?; | ||||
|         indexing_builder.execute_raw(output)?; | ||||
|  | ||||
|         Ok(()) | ||||
| @@ -583,7 +587,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|     { | ||||
|         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; | ||||
|  | ||||
|         let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; | ||||
|         let old_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; | ||||
|         let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; | ||||
|  | ||||
|         self.update_displayed()?; | ||||
| @@ -599,7 +603,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         // If there are new faceted fields we indicate that we must reindex as we must | ||||
|         // index new fields as facets. It means that the distinct attribute, | ||||
|         // an Asc/Desc criterion or a filtered attribute has been added or removed. | ||||
|         let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?; | ||||
|         let new_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; | ||||
|         let faceted_updated = old_faceted_fields != new_faceted_fields; | ||||
|  | ||||
|         let stop_words_updated = self.update_stop_words()?; | ||||
| @@ -651,7 +655,8 @@ mod tests { | ||||
|         let config = IndexerConfig::default(); | ||||
|         let indexing_config = IndexDocumentsConfig::default(); | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -713,7 +718,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -764,7 +770,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -793,7 +800,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -846,7 +854,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -858,7 +867,6 @@ mod tests { | ||||
|         // Only count the field_id 0 and level 0 facet values. | ||||
|         // TODO we must support typed CSVs for numbers to be understood. | ||||
|         let fidmap = index.fields_ids_map(&rtxn).unwrap(); | ||||
|         println!("fidmap: {:?}", fidmap); | ||||
|         for document in index.all_documents(&rtxn).unwrap() { | ||||
|             let document = document.unwrap(); | ||||
|             let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1) | ||||
| @@ -886,7 +894,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -927,7 +936,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -977,7 +987,51 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         // Run an empty query just to ensure that the search results are ordered. | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|         let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); | ||||
|  | ||||
|         // There must be at least one document with a 34 as the age. | ||||
|         assert_eq!(documents_ids.len(), 3); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn set_nested_distinct_field() { | ||||
|         let path = tempfile::tempdir().unwrap(); | ||||
|         let mut options = EnvOpenOptions::new(); | ||||
|         options.map_size(10 * 1024 * 1024); // 10 MB | ||||
|         let index = Index::new(options, &path).unwrap(); | ||||
|         let config = IndexerConfig::default(); | ||||
|  | ||||
|         // Set the distinct field to be the nested `person.age` attribute. | ||||
|         let mut wtxn = index.write_txn().unwrap(); | ||||
|         let mut builder = Settings::new(&mut wtxn, &index, &config); | ||||
|         // Don't display the generated `id` field. | ||||
|         builder.set_displayed_fields(vec![S("person")]); | ||||
|         builder.set_distinct_field(S("person.age")); | ||||
|         builder.execute(|_| ()).unwrap(); | ||||
|  | ||||
|         // Then index some documents. | ||||
|         let content = documents!([ | ||||
|             { "person": { "name": "kevin", "age": 23 }}, | ||||
|             { "person": { "name": "kevina", "age": 21 }}, | ||||
|             { "person": { "name": "benoit", "age": 34 }}, | ||||
|             { "person": { "name": "bernard", "age": 34 }}, | ||||
|             { "person": { "name": "bertrand", "age": 34 }}, | ||||
|             { "person": { "name": "bernie", "age": 34 }}, | ||||
|             { "person": { "name": "ben", "age": 34 }} | ||||
|         ]); | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1008,7 +1062,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1037,7 +1092,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1115,7 +1171,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|  | ||||
| @@ -1252,7 +1309,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -1314,7 +1372,8 @@ mod tests { | ||||
|         let indexing_config = | ||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|         let mut builder = | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); | ||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||
|                 .unwrap(); | ||||
|         builder.add_documents(content).unwrap(); | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|   | ||||
| @@ -59,7 +59,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|  | ||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|     let mut builder = | ||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|     let mut cursor = Cursor::new(Vec::new()); | ||||
|     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|     let reader = Cursor::new(CONTENT.as_bytes()); | ||||
|   | ||||
| @@ -390,7 +390,8 @@ fn criteria_ascdesc() { | ||||
|     // index documents | ||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); | ||||
|     let mut builder = | ||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||
|  | ||||
|     let mut cursor = Cursor::new(Vec::new()); | ||||
|     let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user