mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	nested fields
This commit is contained in:
		| @@ -70,7 +70,8 @@ fn indexing_songs_default(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -120,7 +121,8 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
| @@ -134,14 +136,16 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
| @@ -190,7 +194,8 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|  |  | ||||||
| @@ -236,7 +241,8 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -281,7 +287,8 @@ fn indexing_wiki(c: &mut Criterion) { | |||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -323,7 +330,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -339,7 +347,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | |||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); | ||||||
| @@ -349,7 +358,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); | ||||||
| @@ -400,7 +410,8 @@ fn indexing_movies_default(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -447,7 +458,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -462,7 +474,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -470,7 +483,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | |||||||
|  |  | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
| @@ -525,7 +539,8 @@ fn indexing_geo(c: &mut Criterion) { | |||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let mut builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|  |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); |                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 builder.add_documents(documents).unwrap(); | ||||||
|   | |||||||
| @@ -96,7 +96,8 @@ pub fn base_setup(conf: &Conf) -> Index { | |||||||
|         update_method: IndexDocumentsMethod::ReplaceDocuments, |         update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|         ..Default::default() |         ..Default::default() | ||||||
|     }; |     }; | ||||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |     let mut builder = | ||||||
|  |         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|     let documents = documents_from(conf.dataset, conf.dataset_format); |     let documents = documents_from(conf.dataset, conf.dataset_format); | ||||||
|  |  | ||||||
|     builder.add_documents(documents).unwrap(); |     builder.add_documents(documents).unwrap(); | ||||||
|   | |||||||
| @@ -261,7 +261,8 @@ impl Performer for DocumentAddition { | |||||||
|             &config, |             &config, | ||||||
|             indexing_config, |             indexing_config, | ||||||
|             |step| indexing_callback(step, &bars), |             |step| indexing_callback(step, &bars), | ||||||
|         ); |         ) | ||||||
|  |         .unwrap(); | ||||||
|         addition.add_documents(reader)?; |         addition.add_documents(reader)?; | ||||||
|  |  | ||||||
|         std::thread::spawn(move || { |         std::thread::spawn(move || { | ||||||
|   | |||||||
| @@ -410,7 +410,7 @@ async fn main() -> anyhow::Result<()> { | |||||||
|                         GLOBAL_CONFIG.get().unwrap(), |                         GLOBAL_CONFIG.get().unwrap(), | ||||||
|                         indexing_config, |                         indexing_config, | ||||||
|                         indexing_callback, |                         indexing_callback, | ||||||
|                     ); |                     )?; | ||||||
|  |  | ||||||
|                     let reader = match encoding.as_deref() { |                     let reader = match encoding.as_deref() { | ||||||
|                         Some("gzip") => Box::new(GzDecoder::new(content)), |                         Some("gzip") => Box::new(GzDecoder::new(content)), | ||||||
|   | |||||||
| @@ -14,6 +14,7 @@ crossbeam-channel = "0.5.2" | |||||||
| either = "1.6.1" | either = "1.6.1" | ||||||
| fst = "0.4.7" | fst = "0.4.7" | ||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
|  | flatten-serde-json = "0.1.0" | ||||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | ||||||
| geoutils = "0.4.1" | geoutils = "0.4.1" | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
|   | |||||||
| @@ -49,6 +49,24 @@ impl DocumentsBatchIndex { | |||||||
|     pub fn name(&self, id: FieldId) -> Option<&String> { |     pub fn name(&self, id: FieldId) -> Option<&String> { | ||||||
|         self.0.get_by_left(&id) |         self.0.get_by_left(&id) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn recreate_json( | ||||||
|  |         &self, | ||||||
|  |         document: &obkv::KvReaderU16, | ||||||
|  |     ) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> { | ||||||
|  |         let mut map = serde_json::Map::new(); | ||||||
|  |  | ||||||
|  |         for (k, v) in document.iter() { | ||||||
|  |             // TODO: TAMO: update the error type | ||||||
|  |             let key = | ||||||
|  |                 self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone(); | ||||||
|  |             let value = serde_json::from_slice::<serde_json::Value>(v) | ||||||
|  |                 .map_err(crate::error::InternalError::SerdeJson)?; | ||||||
|  |             map.insert(key, value); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(map) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Serialize, Deserialize)] | #[derive(Debug, Serialize, Deserialize)] | ||||||
|   | |||||||
| @@ -27,6 +27,7 @@ pub enum InternalError { | |||||||
|     DatabaseClosing, |     DatabaseClosing, | ||||||
|     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, |     DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, | ||||||
|     FieldIdMapMissingEntry(FieldIdMapMissingEntry), |     FieldIdMapMissingEntry(FieldIdMapMissingEntry), | ||||||
|  |     FieldIdMappingMissingEntry { key: FieldId }, | ||||||
|     Fst(fst::Error), |     Fst(fst::Error), | ||||||
|     GrenadInvalidCompressionType, |     GrenadInvalidCompressionType, | ||||||
|     GrenadInvalidFormatVersion, |     GrenadInvalidFormatVersion, | ||||||
| @@ -59,7 +60,7 @@ pub enum UserError { | |||||||
|     DocumentLimitReached, |     DocumentLimitReached, | ||||||
|     InvalidDocumentId { document_id: Value }, |     InvalidDocumentId { document_id: Value }, | ||||||
|     InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, |     InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, | ||||||
|     InvalidGeoField { document_id: Value, object: Value }, |     InvalidGeoField { document_id: Value }, | ||||||
|     InvalidFilter(String), |     InvalidFilter(String), | ||||||
|     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, |     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, | ||||||
|     SortRankingRuleMissing, |     SortRankingRuleMissing, | ||||||
| @@ -187,6 +188,9 @@ impl fmt::Display for InternalError { | |||||||
|                 write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) |                 write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) | ||||||
|             } |             } | ||||||
|             Self::FieldIdMapMissingEntry(error) => error.fmt(f), |             Self::FieldIdMapMissingEntry(error) => error.fmt(f), | ||||||
|  |             Self::FieldIdMappingMissingEntry { key } => { | ||||||
|  |                 write!(f, "Missing {} in the field id mapping.", key) | ||||||
|  |             } | ||||||
|             Self::Fst(error) => error.fmt(f), |             Self::Fst(error) => error.fmt(f), | ||||||
|             Self::GrenadInvalidCompressionType => { |             Self::GrenadInvalidCompressionType => { | ||||||
|                 f.write_str("Invalid compression type have been specified to grenad.") |                 f.write_str("Invalid compression type have been specified to grenad.") | ||||||
| @@ -226,19 +230,15 @@ impl fmt::Display for UserError { | |||||||
|                     name_list |                     name_list | ||||||
|                 ) |                 ) | ||||||
|             } |             } | ||||||
|             Self::InvalidGeoField { document_id, object } => { |             Self::InvalidGeoField { document_id } => { | ||||||
|                 let document_id = match document_id { |                 let document_id = match document_id { | ||||||
|                     Value::String(id) => id.clone(), |                     Value::String(id) => id.clone(), | ||||||
|                     _ => document_id.to_string(), |                     _ => document_id.to_string(), | ||||||
|                 }; |                 }; | ||||||
|                 let object = match object { |  | ||||||
|                     Value::String(id) => id.clone(), |  | ||||||
|                     _ => object.to_string(), |  | ||||||
|                 }; |  | ||||||
|                 write!( |                 write!( | ||||||
|                     f, |                     f, | ||||||
|                     "The document with the id: `{}` contains an invalid _geo field: `{}`.", |                     "The document with the id: `{}` contains an invalid `_geo` field.", | ||||||
|                     document_id, object |                     document_id | ||||||
|                 ) |                 ) | ||||||
|             }, |             }, | ||||||
|             Self::InvalidDocumentId { document_id } => { |             Self::InvalidDocumentId { document_id } => { | ||||||
|   | |||||||
| @@ -31,6 +31,7 @@ pub mod main_key { | |||||||
|     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; |     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||||
|     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; |     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; | ||||||
|     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; |     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||||
|  |     pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; | ||||||
|     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; |     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; | ||||||
|     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; |     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; | ||||||
|     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; |     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; | ||||||
| @@ -567,12 +568,46 @@ impl Index { | |||||||
|         Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) |         Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* faceted documents ids */ |     /* faceted fields */ | ||||||
|  |  | ||||||
|  |     /// Writes the faceted fields in the database. | ||||||
|  |     pub(crate) fn put_faceted_fields( | ||||||
|  |         &self, | ||||||
|  |         wtxn: &mut RwTxn, | ||||||
|  |         fields: &HashSet<String>, | ||||||
|  |     ) -> heed::Result<()> { | ||||||
|  |         self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /// Returns the faceted fields names. |     /// Returns the faceted fields names. | ||||||
|  |     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { | ||||||
|  |         Ok(self | ||||||
|  |             .main | ||||||
|  |             .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? | ||||||
|  |             .unwrap_or_default()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Identical to `faceted_fields`, but returns ids instead. | ||||||
|  |     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||||
|  |         let fields = self.faceted_fields(rtxn)?; | ||||||
|  |         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||||
|  |  | ||||||
|  |         let mut fields_ids = HashSet::new(); | ||||||
|  |         for name in fields { | ||||||
|  |             if let Some(field_id) = fields_ids_map.id(&name) { | ||||||
|  |                 fields_ids.insert(field_id); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(fields_ids) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /* faceted documents ids */ | ||||||
|  |  | ||||||
|  |     /// Returns the user defined faceted fields names. | ||||||
|     /// |     /// | ||||||
|     /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. |     /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. | ||||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { |     pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn) -> Result<HashSet<String>> { | ||||||
|         let filterable_fields = self.filterable_fields(rtxn)?; |         let filterable_fields = self.filterable_fields(rtxn)?; | ||||||
|         let sortable_fields = self.sortable_fields(rtxn)?; |         let sortable_fields = self.sortable_fields(rtxn)?; | ||||||
|         let distinct_field = self.distinct_field(rtxn)?; |         let distinct_field = self.distinct_field(rtxn)?; | ||||||
| @@ -592,8 +627,8 @@ impl Index { | |||||||
|         Ok(faceted_fields) |         Ok(faceted_fields) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Identical to `faceted_fields`, but returns ids instead. |     /// Identical to `user_defined_faceted_fields`, but returns ids instead. | ||||||
|     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { |     pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||||
|         let fields = self.faceted_fields(rtxn)?; |         let fields = self.faceted_fields(rtxn)?; | ||||||
|         let fields_ids_map = self.fields_ids_map(rtxn)?; |         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||||
|  |  | ||||||
| @@ -1040,13 +1075,14 @@ pub(crate) mod tests { | |||||||
|         let content = documents!([ |         let content = documents!([ | ||||||
|             { "id": 1, "name": "kevin" }, |             { "id": 1, "name": "kevin" }, | ||||||
|             { "id": 2, "name": "bob", "age": 20 }, |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|             { "id": 2, "name": "bob", "age": 20 } |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1067,11 +1103,12 @@ pub(crate) mod tests { | |||||||
|         // field_distribution in the end |         // field_distribution in the end | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         let content = documents!([ |         let content = documents!([ | ||||||
|             { "id": 1, "name": "kevin" }, |             { "id": 1, "name": "kevin" }, | ||||||
|             { "id": 2, "name": "bob", "age": 20 }, |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|             { "id": 2, "name": "bob", "age": 20 } |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|         ]); |         ]); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
| @@ -1097,7 +1134,8 @@ pub(crate) mod tests { | |||||||
|  |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -183,6 +183,43 @@ pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { | |||||||
|     [x, y, z] |     [x, y, z] | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Returns `true` if the field match one of the faceted fields. | ||||||
|  | /// See the function [`is_faceted_by`] below to see what “matching” means. | ||||||
|  | pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool { | ||||||
|  |     faceted_fields.into_iter().find(|facet| is_faceted_by(field, facet.as_ref())).is_some() | ||||||
|  | } | ||||||
|  |  | ||||||
/// Returns `true` if the field matches the facet.
///
/// A field matches when it is exactly the facet, or when it starts with the facet
/// immediately followed by a `.`, i.e. when it is nested under the facet.
/// ```
/// use milli::is_faceted_by;
/// // -- the valid basics
/// assert!(is_faceted_by("animaux", "animaux"));
/// assert!(is_faceted_by("animaux.chien", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure"));
/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur"));
///
/// // -- the wrongs
/// assert!(!is_faceted_by("chien", "chat"));
/// assert!(!is_faceted_by("animaux", "animaux.chien"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chat"));
///
/// // -- the strange edge cases
/// assert!(!is_faceted_by("animaux.chien", "anima"));
/// assert!(!is_faceted_by("animaux.chien", "animau"));
/// assert!(!is_faceted_by("animaux.chien", "animaux."));
/// assert!(!is_faceted_by("animaux.chien", "animaux.c"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.ch"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chi"));
/// assert!(!is_faceted_by("animaux.chien", "animaux.chie"));
/// ```
pub fn is_faceted_by(field: &str, facet: &str) -> bool {
    // Whatever follows the facet must be empty (exact match) or begin with the `.`
    // separator: sharing a textual prefix (`anima` vs `animaux`) is not enough.
    field
        .strip_prefix(facet)
        .map_or(false, |rest| rest.is_empty() || rest.starts_with('.'))
}
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use serde_json::json; |     use serde_json::json; | ||||||
|   | |||||||
| @@ -97,7 +97,8 @@ mod test { | |||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); |         let mut addition = | ||||||
|  |             IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|  |  | ||||||
|         let reader = |         let reader = | ||||||
|             crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); |             crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); | ||||||
|   | |||||||
| @@ -220,9 +220,13 @@ impl<'a> FacetDistribution<'a> { | |||||||
|     pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> { |     pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> { | ||||||
|         let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; |         let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; | ||||||
|         let filterable_fields = self.index.filterable_fields(self.rtxn)?; |         let filterable_fields = self.index.filterable_fields(self.rtxn)?; | ||||||
|  |  | ||||||
|         let fields = match self.facets { |         let fields = match self.facets { | ||||||
|             Some(ref facets) => { |             Some(ref facets) => { | ||||||
|                 let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect(); |                 let invalid_fields: HashSet<_> = facets | ||||||
|  |                     .iter() | ||||||
|  |                     .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) | ||||||
|  |                     .collect(); | ||||||
|                 if !invalid_fields.is_empty() { |                 if !invalid_fields.is_empty() { | ||||||
|                     return Err(UserError::InvalidFacetsDistribution { |                     return Err(UserError::InvalidFacetsDistribution { | ||||||
|                         invalid_facets_name: invalid_fields.into_iter().cloned().collect(), |                         invalid_facets_name: invalid_fields.into_iter().cloned().collect(), | ||||||
| @@ -236,10 +240,12 @@ impl<'a> FacetDistribution<'a> { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let mut distribution = BTreeMap::new(); |         let mut distribution = BTreeMap::new(); | ||||||
|         for name in fields { |         for (fid, name) in fields_ids_map.iter() { | ||||||
|             if let Some(fid) = fields_ids_map.id(&name) { |             if crate::is_faceted(name, &fields) { | ||||||
|                 let values = self.facet_values(fid)?; |                 let values = self.facet_values(fid)?; | ||||||
|                 distribution.insert(name, values); |                 if !values.is_empty() { | ||||||
|  |                     distribution.insert(name.to_string(), values); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -353,7 +353,8 @@ impl<'a> Filter<'a> { | |||||||
|         match &self.condition { |         match &self.condition { | ||||||
|             FilterCondition::Condition { fid, op } => { |             FilterCondition::Condition { fid, op } => { | ||||||
|                 let filterable_fields = index.filterable_fields(rtxn)?; |                 let filterable_fields = index.filterable_fields(rtxn)?; | ||||||
|                 if filterable_fields.contains(fid.value()) { |  | ||||||
|  |                 if crate::is_faceted(fid.value(), &filterable_fields) { | ||||||
|                     let field_ids_map = index.fields_ids_map(rtxn)?; |                     let field_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|                     if let Some(fid) = field_ids_map.id(fid.value()) { |                     if let Some(fid) = field_ids_map.id(fid.value()) { | ||||||
|                         Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) |                         Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) | ||||||
| @@ -549,7 +550,6 @@ mod tests { | |||||||
|             Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") |             Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") | ||||||
|                 .unwrap() |                 .unwrap() | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition); |  | ||||||
|         assert_eq!(condition, expected); |         assert_eq!(condition, expected); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -159,7 +159,7 @@ impl<'a> Search<'a> { | |||||||
|             let sortable_fields = self.index.sortable_fields(self.rtxn)?; |             let sortable_fields = self.index.sortable_fields(self.rtxn)?; | ||||||
|             for asc_desc in sort_criteria { |             for asc_desc in sort_criteria { | ||||||
|                 match asc_desc.member() { |                 match asc_desc.member() { | ||||||
|                     Member::Field(ref field) if !sortable_fields.contains(field) => { |                     Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { | ||||||
|                         return Err(UserError::InvalidSortableAttribute { |                         return Err(UserError::InvalidSortableAttribute { | ||||||
|                             field: field.to_string(), |                             field: field.to_string(), | ||||||
|                             valid_fields: sortable_fields.into_iter().collect(), |                             valid_fields: sortable_fields.into_iter().collect(), | ||||||
|   | |||||||
| @@ -98,7 +98,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -110,7 +111,8 @@ mod tests { | |||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|         assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5); |         // the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]` | ||||||
|  |         assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7); | ||||||
|  |  | ||||||
|         assert!(index.words_fst(&rtxn).unwrap().is_empty()); |         assert!(index.words_fst(&rtxn).unwrap().is_empty()); | ||||||
|         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); |         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); | ||||||
|   | |||||||
| @@ -647,7 +647,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -681,7 +682,8 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -733,7 +735,8 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -790,7 +793,8 @@ mod tests { | |||||||
|  |  | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |  | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,7 +2,6 @@ use std::fs::File; | |||||||
| use std::io; | use std::io; | ||||||
|  |  | ||||||
| use concat_arrays::concat_arrays; | use concat_arrays::concat_arrays; | ||||||
| use serde_json::Value; |  | ||||||
|  |  | ||||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||||
| use crate::{FieldId, InternalError, Result, UserError}; | use crate::{FieldId, InternalError, Result, UserError}; | ||||||
| @@ -14,7 +13,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | |||||||
|     obkv_documents: grenad::Reader<R>, |     obkv_documents: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_field_id: FieldId, |     (lat_fid, lng_fid): (FieldId, FieldId), | ||||||
| ) -> Result<grenad::Reader<File>> { | ) -> Result<grenad::Reader<File>> { | ||||||
|     let mut writer = create_writer( |     let mut writer = create_writer( | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
| @@ -25,22 +24,18 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | |||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { |     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||||
|         let obkv = obkv::KvReader::new(value); |         let obkv = obkv::KvReader::new(value); | ||||||
|         let point: Value = match obkv.get(geo_field_id) { |         let (lat, lng) = obkv.get(lat_fid).zip(obkv.get(lng_fid)).ok_or_else(|| { | ||||||
|             Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?, |  | ||||||
|             None => continue, |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { |  | ||||||
|             // this will create an array of 16 bytes (two 8 bytes floats) |  | ||||||
|             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; |  | ||||||
|             writer.insert(docid_bytes, bytes)?; |  | ||||||
|         } else { |  | ||||||
|             // All document must have a primary key so we can unwrap safely here |  | ||||||
|             let primary_key = obkv.get(primary_key_id).unwrap(); |             let primary_key = obkv.get(primary_key_id).unwrap(); | ||||||
|             let primary_key = |             let primary_key = serde_json::from_slice(primary_key).unwrap(); | ||||||
|                 serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; |             UserError::InvalidGeoField { document_id: primary_key } | ||||||
|             Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? |         })?; | ||||||
|         } |         let (lat, lng): (f64, f64) = ( | ||||||
|  |             serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||||
|  |             serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||||
|  |         writer.insert(docid_bytes, bytes)?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(writer_into_reader(writer)?) |     Ok(writer_into_reader(writer)?) | ||||||
|   | |||||||
| @@ -34,28 +34,36 @@ use crate::{FieldId, Result}; | |||||||
| /// Extract data for each databases from obkv documents in parallel. | /// Extract data for each databases from obkv documents in parallel. | ||||||
| /// Send data in grenad file over provided Sender. | /// Send data in grenad file over provided Sender. | ||||||
| pub(crate) fn data_from_obkv_documents( | pub(crate) fn data_from_obkv_documents( | ||||||
|     obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, |     original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||||
|  |     flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     searchable_fields: Option<HashSet<FieldId>>, |     searchable_fields: Option<HashSet<FieldId>>, | ||||||
|     faceted_fields: HashSet<FieldId>, |     faceted_fields: HashSet<FieldId>, | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_field_id: Option<FieldId>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|     stop_words: Option<fst::Set<&[u8]>>, |     stop_words: Option<fst::Set<&[u8]>>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
|     exact_attributes: HashSet<FieldId>, |     exact_attributes: HashSet<FieldId>, | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks |     original_obkv_chunks | ||||||
|         .par_bridge() |         .par_bridge() | ||||||
|         .map(|result| { |         .map(|original_documents_chunk| { | ||||||
|             extract_documents_data( |             send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) | ||||||
|                 result, |         }) | ||||||
|  |         .collect::<Result<()>>()?; | ||||||
|  |  | ||||||
|  |     let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks | ||||||
|  |         .par_bridge() | ||||||
|  |         .map(|flattened_obkv_chunks| { | ||||||
|  |             send_and_extract_flattened_documents_data( | ||||||
|  |                 flattened_obkv_chunks, | ||||||
|                 indexer, |                 indexer, | ||||||
|                 lmdb_writer_sx.clone(), |                 lmdb_writer_sx.clone(), | ||||||
|                 &searchable_fields, |                 &searchable_fields, | ||||||
|                 &faceted_fields, |                 &faceted_fields, | ||||||
|                 primary_key_id, |                 primary_key_id, | ||||||
|                 geo_field_id, |                 geo_fields_ids, | ||||||
|                 &stop_words, |                 &stop_words, | ||||||
|                 max_positions_per_attributes, |                 max_positions_per_attributes, | ||||||
|             ) |             ) | ||||||
| @@ -170,36 +178,48 @@ fn spawn_extraction_task<FE, FS, M>( | |||||||
|     }); |     }); | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Extract chuncked data and send it into lmdb_writer_sx sender: | /// Extract chunked data and send it into lmdb_writer_sx sender: | ||||||
| /// - documents | /// - documents | ||||||
|  | fn send_original_documents_data( | ||||||
|  |     original_documents_chunk: Result<grenad::Reader<File>>, | ||||||
|  |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|  | ) -> Result<()> { | ||||||
|  |     let original_documents_chunk = | ||||||
|  |         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||||
|  |  | ||||||
|  |     // TODO: create a custom internal error | ||||||
|  |     lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Extract chunked data and send it into lmdb_writer_sx sender: | ||||||
| /// - documents_ids | /// - documents_ids | ||||||
| /// - docid_word_positions | /// - docid_word_positions | ||||||
| /// - docid_fid_facet_numbers | /// - docid_fid_facet_numbers | ||||||
| /// - docid_fid_facet_strings | /// - docid_fid_facet_strings | ||||||
| fn extract_documents_data( | fn send_and_extract_flattened_documents_data( | ||||||
|     documents_chunk: Result<grenad::Reader<File>>, |     flattened_documents_chunk: Result<grenad::Reader<File>>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|     faceted_fields: &HashSet<FieldId>, |     faceted_fields: &HashSet<FieldId>, | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_field_id: Option<FieldId>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|     stop_words: &Option<fst::Set<&[u8]>>, |     stop_words: &Option<fst::Set<&[u8]>>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<( | ) -> Result<( | ||||||
|     grenad::Reader<CursorClonableMmap>, |     grenad::Reader<CursorClonableMmap>, | ||||||
|     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>), |     (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>), | ||||||
| )> { | )> { | ||||||
|     let documents_chunk = documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; |     let flattened_documents_chunk = | ||||||
|  |         flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||||
|  |  | ||||||
|     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); |     if let Some(geo_fields_ids) = geo_fields_ids { | ||||||
|  |         let documents_chunk_cloned = flattened_documents_chunk.clone(); | ||||||
|     if let Some(geo_field_id) = geo_field_id { |  | ||||||
|         let documents_chunk_cloned = documents_chunk.clone(); |  | ||||||
|         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); |         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|             let result = |             let result = | ||||||
|                 extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id); |                 extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids); | ||||||
|             let _ = match result { |             let _ = match result { | ||||||
|                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), |                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), | ||||||
|                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)), |                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)), | ||||||
| @@ -211,7 +231,7 @@ fn extract_documents_data( | |||||||
|         rayon::join( |         rayon::join( | ||||||
|             || { |             || { | ||||||
|                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( |                 let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( | ||||||
|                     documents_chunk.clone(), |                     flattened_documents_chunk.clone(), | ||||||
|                     indexer.clone(), |                     indexer.clone(), | ||||||
|                     searchable_fields, |                     searchable_fields, | ||||||
|                     stop_words.as_ref(), |                     stop_words.as_ref(), | ||||||
| @@ -232,7 +252,7 @@ fn extract_documents_data( | |||||||
|             || { |             || { | ||||||
|                 let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = |                 let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = | ||||||
|                     extract_fid_docid_facet_values( |                     extract_fid_docid_facet_values( | ||||||
|                         documents_chunk.clone(), |                         flattened_documents_chunk.clone(), | ||||||
|                         indexer.clone(), |                         indexer.clone(), | ||||||
|                         faceted_fields, |                         faceted_fields, | ||||||
|                     )?; |                     )?; | ||||||
|   | |||||||
| @@ -30,7 +30,7 @@ use crate::update::{ | |||||||
|     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, |     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, | ||||||
|     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, |     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||||
| }; | }; | ||||||
| use crate::{Index, Result, RoaringBitmapCodec}; | use crate::{Index, Result, RoaringBitmapCodec, UserError}; | ||||||
|  |  | ||||||
| static MERGED_DATABASE_COUNT: usize = 7; | static MERGED_DATABASE_COUNT: usize = 7; | ||||||
| static PREFIX_DATABASE_COUNT: usize = 5; | static PREFIX_DATABASE_COUNT: usize = 5; | ||||||
| @@ -94,15 +94,16 @@ where | |||||||
|         indexer_config: &'a IndexerConfig, |         indexer_config: &'a IndexerConfig, | ||||||
|         config: IndexDocumentsConfig, |         config: IndexDocumentsConfig, | ||||||
|         progress: F, |         progress: F, | ||||||
|     ) -> IndexDocuments<'t, 'u, 'i, 'a, F> { |     ) -> Result<IndexDocuments<'t, 'u, 'i, 'a, F>> { | ||||||
|         let transform = Some(Transform::new( |         let transform = Some(Transform::new( | ||||||
|  |             wtxn, | ||||||
|             &index, |             &index, | ||||||
|             indexer_config, |             indexer_config, | ||||||
|             config.update_method, |             config.update_method, | ||||||
|             config.autogenerate_docids, |             config.autogenerate_docids, | ||||||
|         )); |         )?); | ||||||
|  |  | ||||||
|         IndexDocuments { |         Ok(IndexDocuments { | ||||||
|             transform, |             transform, | ||||||
|             config, |             config, | ||||||
|             indexer_config, |             indexer_config, | ||||||
| @@ -110,7 +111,7 @@ where | |||||||
|             wtxn, |             wtxn, | ||||||
|             index, |             index, | ||||||
|             added_documents: 0, |             added_documents: 0, | ||||||
|         } |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Adds a batch of documents to the current builder. |     /// Adds a batch of documents to the current builder. | ||||||
| @@ -151,6 +152,10 @@ where | |||||||
|             .take() |             .take() | ||||||
|             .expect("Invalid document addition state") |             .expect("Invalid document addition state") | ||||||
|             .output_from_sorter(self.wtxn, &self.progress)?; |             .output_from_sorter(self.wtxn, &self.progress)?; | ||||||
|  |  | ||||||
|  |         let new_facets = output.compute_real_facets(self.wtxn, self.index)?; | ||||||
|  |         self.index.put_faceted_fields(self.wtxn, &new_facets)?; | ||||||
|  |  | ||||||
|         let indexed_documents = output.documents_count as u64; |         let indexed_documents = output.documents_count as u64; | ||||||
|         let number_of_documents = self.execute_raw(output)?; |         let number_of_documents = self.execute_raw(output)?; | ||||||
|  |  | ||||||
| @@ -171,7 +176,8 @@ where | |||||||
|             new_documents_ids, |             new_documents_ids, | ||||||
|             replaced_documents_ids, |             replaced_documents_ids, | ||||||
|             documents_count, |             documents_count, | ||||||
|             documents_file, |             original_documents, | ||||||
|  |             flattened_documents, | ||||||
|         } = output; |         } = output; | ||||||
|  |  | ||||||
|         // The fields_ids_map is put back to the store now so the rest of the transaction sees an |         // The fields_ids_map is put back to the store now so the rest of the transaction sees an | ||||||
| @@ -197,7 +203,8 @@ where | |||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let documents_file = grenad::Reader::new(documents_file)?; |         let original_documents = grenad::Reader::new(original_documents)?; | ||||||
|  |         let flattened_documents = grenad::Reader::new(flattened_documents)?; | ||||||
|  |  | ||||||
|         // create LMDB writer channel |         // create LMDB writer channel | ||||||
|         let (lmdb_writer_sx, lmdb_writer_rx): ( |         let (lmdb_writer_sx, lmdb_writer_rx): ( | ||||||
| @@ -213,13 +220,20 @@ where | |||||||
|             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); |             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); | ||||||
|         // get filterable fields for facet databases |         // get filterable fields for facet databases | ||||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; |         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||||
|         // get the fid of the `_geo` field. |         // get the fid of the `_geo.lat` and `_geo.lng` fields. | ||||||
|         let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { |         let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { | ||||||
|             Some(gfid) => { |             Some(gfid) => { | ||||||
|                 let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); |                 let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); | ||||||
|                 let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); |                 let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); | ||||||
|  |                 // if `_geo` is faceted then we get the `lat` and `lng` | ||||||
|                 if is_sortable || is_filterable { |                 if is_sortable || is_filterable { | ||||||
|                     Some(gfid) |                     let field_ids = self | ||||||
|  |                         .index | ||||||
|  |                         .fields_ids_map(self.wtxn)? | ||||||
|  |                         .insert("_geo.lat") | ||||||
|  |                         .zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng")) | ||||||
|  |                         .ok_or(UserError::AttributeLimitReached)?; | ||||||
|  |                     Some(field_ids) | ||||||
|                 } else { |                 } else { | ||||||
|                     None |                     None | ||||||
|                 } |                 } | ||||||
| @@ -239,28 +253,38 @@ where | |||||||
|                 max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. |                 max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             // split obkv file into several chuncks |             // split obkv file into several chunks | ||||||
|             let chunk_iter = grenad_obkv_into_chunks( |             let original_chunk_iter = grenad_obkv_into_chunks( | ||||||
|                 documents_file, |                 original_documents, | ||||||
|                 params.clone(), |                 params.clone(), | ||||||
|                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB |                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB | ||||||
|             ); |             ); | ||||||
|  |  | ||||||
|             let result = chunk_iter.map(|chunk_iter| { |             // split the flattened obkv file into several chunks | ||||||
|                 // extract all databases from the chunked obkv documents |             let flattened_chunk_iter = grenad_obkv_into_chunks( | ||||||
|                 extract::data_from_obkv_documents( |                 flattened_documents, | ||||||
|                     chunk_iter, |                 params.clone(), | ||||||
|                     params, |                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB | ||||||
|                     lmdb_writer_sx.clone(), |             ); | ||||||
|                     searchable_fields, |  | ||||||
|                     faceted_fields, |             let result = original_chunk_iter | ||||||
|                     primary_key_id, |                 .and_then(|original_chunk_iter| Ok((original_chunk_iter, flattened_chunk_iter?))) | ||||||
|                     geo_field_id, |                 .map(|(original_chunk, flattened_chunk)| { | ||||||
|                     stop_words, |                     // extract all databases from the chunked obkv documents | ||||||
|                     self.indexer_config.max_positions_per_attributes, |                     extract::data_from_obkv_documents( | ||||||
|                     exact_attributes, |                         original_chunk, | ||||||
|                 ) |                         flattened_chunk, | ||||||
|             }); |                         params, | ||||||
|  |                         lmdb_writer_sx.clone(), | ||||||
|  |                         searchable_fields, | ||||||
|  |                         faceted_fields, | ||||||
|  |                         primary_key_id, | ||||||
|  |                         geo_fields_ids, | ||||||
|  |                         stop_words, | ||||||
|  |                         self.indexer_config.max_positions_per_attributes, | ||||||
|  |                         exact_attributes, | ||||||
|  |                     ) | ||||||
|  |                 }); | ||||||
|  |  | ||||||
|             if let Err(e) = result { |             if let Err(e) = result { | ||||||
|                 let _ = lmdb_writer_sx.send(Err(e)); |                 let _ = lmdb_writer_sx.send(Err(e)); | ||||||
| @@ -550,6 +574,7 @@ mod tests { | |||||||
|  |  | ||||||
|     use big_s::S; |     use big_s::S; | ||||||
|     use heed::EnvOpenOptions; |     use heed::EnvOpenOptions; | ||||||
|  |     use maplit::hashset; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::documents::DocumentBatchBuilder; |     use crate::documents::DocumentBatchBuilder; | ||||||
| @@ -574,7 +599,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -589,7 +615,8 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "id": 1, "name": "updated kevin" } ]); |         let content = documents!([ { "id": 1, "name": "updated kevin" } ]); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -607,7 +634,8 @@ mod tests { | |||||||
|             { "id": 2, "name": "updated kevina" }, |             { "id": 2, "name": "updated kevina" }, | ||||||
|             { "id": 3, "name": "updated benoit" } |             { "id": 3, "name": "updated benoit" } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -639,7 +667,8 @@ mod tests { | |||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -665,7 +694,8 @@ mod tests { | |||||||
|         // Second we send 1 document with id 1, to force it to be merged with the previous one. |         // Second we send 1 document with id 1, to force it to be merged with the previous one. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "id": 1, "age": 25 } ]); |         let content = documents!([ { "id": 1, "age": 25 } ]); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -706,7 +736,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         assert!(builder.add_documents(content).is_err()); |         assert!(builder.add_documents(content).is_err()); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -735,7 +766,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -753,7 +785,8 @@ mod tests { | |||||||
|         // Second we send 1 document with the generated uuid, to erase the previous ones. |         // Second we send 1 document with the generated uuid, to erase the previous ones. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); |         let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -793,7 +826,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -809,7 +843,8 @@ mod tests { | |||||||
|         let content = documents!([ { "name": "new kevin" } ]); |         let content = documents!([ { "name": "new kevin" } ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -833,7 +868,8 @@ mod tests { | |||||||
|         let content = documents!([]); |         let content = documents!([]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -859,7 +895,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         assert!(builder.add_documents(content).is_err()); |         assert!(builder.add_documents(content).is_err()); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -867,7 +904,8 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         // The document id is a plain integer (no space), so it must be accepted. |         // The document id is a plain integer (no space), so it must be accepted. | ||||||
|         let content = documents!([ { "id": 32, "name": "kevin" } ]); |         let content = documents!([ { "id": 32, "name": "kevin" } ]); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -895,7 +933,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -912,7 +951,7 @@ mod tests { | |||||||
|         assert_eq!(result.documents_ids, vec![1]); |         assert_eq!(result.documents_ids, vec![1]); | ||||||
|  |  | ||||||
|         // Search for a sub array sub object key |         // Search for a sub array sub object key | ||||||
|         let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap(); |         let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap(); | ||||||
|         assert_eq!(result.documents_ids, vec![2]); |         assert_eq!(result.documents_ids, vec![2]); | ||||||
|  |  | ||||||
|         drop(rtxn); |         drop(rtxn); | ||||||
| @@ -940,7 +979,8 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         builder.add_documents(documents).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -950,7 +990,8 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::UpdateDocuments, |             update_method: IndexDocumentsMethod::UpdateDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { |           { | ||||||
|             "id": 2, |             "id": 2, | ||||||
| @@ -981,7 +1022,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1000,7 +1042,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); |         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); | ||||||
| @@ -1011,7 +1054,8 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1046,7 +1090,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1080,7 +1125,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1137,13 +1183,333 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn index_documents_with_nested_fields() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let content = documents!([ | ||||||
|  |             { | ||||||
|  |                 "id": 0, | ||||||
|  |                 "title": "The zeroth document", | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 1, | ||||||
|  |                 "title": "The first document", | ||||||
|  |                 "nested": { | ||||||
|  |                     "object": "field", | ||||||
|  |                     "machin": "bidule", | ||||||
|  |                 }, | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 2, | ||||||
|  |                 "title": "The second document", | ||||||
|  |                 "nested": [ | ||||||
|  |                     "array", | ||||||
|  |                     { | ||||||
|  |                         "object": "field", | ||||||
|  |                     }, | ||||||
|  |                     { | ||||||
|  |                         "prout": "truc", | ||||||
|  |                         "machin": "lol", | ||||||
|  |                     }, | ||||||
|  |                 ], | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 3, | ||||||
|  |                 "title": "The third document", | ||||||
|  |                 "nested": "I lied", | ||||||
|  |             }, | ||||||
|  |         ]); | ||||||
|  |  | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         builder.add_documents(content).unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |  | ||||||
|  |         let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; | ||||||
|  |         builder.set_searchable_fields(searchable_fields); | ||||||
|  |  | ||||||
|  |         let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); | ||||||
|  |         builder.set_filterable_fields(faceted_fields); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||||
|  |         assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); | ||||||
|  |  | ||||||
|  |         // testing the simple query search | ||||||
|  |         let mut search = crate::Search::new(&rtxn, &index); | ||||||
|  |         search.query("document"); | ||||||
|  |         search.authorize_typos(true); | ||||||
|  |         search.optional_words(true); | ||||||
|  |         // all documents should be returned | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids.len(), 4); | ||||||
|  |  | ||||||
|  |         search.query("zeroth"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![0]); | ||||||
|  |         search.query("first"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1]); | ||||||
|  |         search.query("second"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![2]); | ||||||
|  |         search.query("third"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![3]); | ||||||
|  |  | ||||||
|  |         search.query("field"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1, 2]); | ||||||
|  |  | ||||||
|  |         search.query("lol"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![2]); | ||||||
|  |  | ||||||
|  |         search.query("object"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert!(documents_ids.is_empty()); | ||||||
|  |  | ||||||
|  |         search.query("array"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert!(documents_ids.is_empty()); // nested is not searchable | ||||||
|  |  | ||||||
|  |         search.query("lied"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert!(documents_ids.is_empty()); // nested is not searchable | ||||||
|  |  | ||||||
|  |         // testing the filters | ||||||
|  |         let mut search = crate::Search::new(&rtxn, &index); | ||||||
|  |         search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap()); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1]); | ||||||
|  |  | ||||||
|  |         search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap()); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1, 2]); | ||||||
|  |  | ||||||
|  |         search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap()); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1]); | ||||||
|  |  | ||||||
|  |         search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap()); | ||||||
|  |         let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable | ||||||
|  |         assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); | ||||||
|  |  | ||||||
|  |         search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap()); | ||||||
|  |         let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable | ||||||
|  |         assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn index_documents_with_nested_primary_key() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |         builder.set_primary_key("nested.id".to_owned()); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let content = documents!([ | ||||||
|  |             { | ||||||
|  |                 "nested": { | ||||||
|  |                     "id": 0, | ||||||
|  |                 }, | ||||||
|  |                 "title": "The zeroth document", | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "nested": { | ||||||
|  |                     "id": 1, | ||||||
|  |                 }, | ||||||
|  |                 "title": "The first document", | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "nested": { | ||||||
|  |                     "id": 2, | ||||||
|  |                 }, | ||||||
|  |                 "title": "The second document", | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "nested.id": 3, | ||||||
|  |                 "title": "The third document", | ||||||
|  |             }, | ||||||
|  |         ]); | ||||||
|  |  | ||||||
|  |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         builder.add_documents(content).unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         // testing the simple query search | ||||||
|  |         let mut search = crate::Search::new(&rtxn, &index); | ||||||
|  |         search.query("document"); | ||||||
|  |         search.authorize_typos(true); | ||||||
|  |         search.optional_words(true); | ||||||
|  |         // all documents should be returned | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids.len(), 4); | ||||||
|  |  | ||||||
|  |         search.query("zeroth"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![0]); | ||||||
|  |         search.query("first"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1]); | ||||||
|  |         search.query("second"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![2]); | ||||||
|  |         search.query("third"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![3]); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn test_facets_generation() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let content = documents!([ | ||||||
|  |             { | ||||||
|  |                 "id": 0, | ||||||
|  |                 "dog": { | ||||||
|  |                     "race": { | ||||||
|  |                         "bernese mountain": "zeroth", | ||||||
|  |                     }, | ||||||
|  |                 }, | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 1, | ||||||
|  |                 "dog.race": { | ||||||
|  |                     "bernese mountain": "first", | ||||||
|  |                 }, | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 2, | ||||||
|  |                 "dog.race.bernese mountain": "second", | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 3, | ||||||
|  |                 "dog": { | ||||||
|  |                     "race.bernese mountain": "third" | ||||||
|  |                 }, | ||||||
|  |             }, | ||||||
|  |         ]); | ||||||
|  |  | ||||||
|  |         // index the documents | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         builder.add_documents(content).unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // ---- ADD THE SETTING TO TEST THE FILTERABLE | ||||||
|  |  | ||||||
|  |         // add the settings | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |  | ||||||
|  |         builder.set_filterable_fields(hashset!(String::from("dog"))); | ||||||
|  |  | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         let hidden = index.faceted_fields(&rtxn).unwrap(); | ||||||
|  |  | ||||||
|  |         assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); | ||||||
|  |  | ||||||
|  |         for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { | ||||||
|  |             let mut search = crate::Search::new(&rtxn, &index); | ||||||
|  |             let filter = format!(r#""dog.race.bernese mountain" = {s}"#); | ||||||
|  |             search.filter(crate::Filter::from_str(&filter).unwrap().unwrap()); | ||||||
|  |             let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |             assert_eq!(documents_ids, vec![i]); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // ---- RESET THE SETTINGS | ||||||
|  |  | ||||||
|  |         // update the settings | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |  | ||||||
|  |         builder.reset_filterable_fields(); | ||||||
|  |  | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||||
|  |  | ||||||
|  |         assert_eq!(facets, hashset!()); | ||||||
|  |  | ||||||
|  |         // ---- UPDATE THE SETTINGS TO TEST THE SORTABLE | ||||||
|  |  | ||||||
|  |         // update the settings | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |  | ||||||
|  |         builder.set_sortable_fields(hashset!(S("dog.race"))); | ||||||
|  |  | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||||
|  |  | ||||||
|  |         assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); | ||||||
|  |  | ||||||
|  |         let mut search = crate::Search::new(&rtxn, &index); | ||||||
|  |         search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( | ||||||
|  |             "dog.race.bernese mountain", | ||||||
|  |         )))]); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); | ||||||
|  |         assert_eq!(documents_ids, vec![1, 2, 3, 0]); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn index_2_times_documents_split_by_zero_document_indexation() { |     fn index_2_times_documents_split_by_zero_document_indexation() { | ||||||
|         let path = tempfile::tempdir().unwrap(); |         let path = tempfile::tempdir().unwrap(); | ||||||
| @@ -1162,7 +1528,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1178,7 +1545,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1199,7 +1567,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1226,7 +1595,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -1,24 +1,27 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::btree_map::Entry; | use std::collections::hash_map::Entry; | ||||||
| use std::collections::HashMap; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{Read, Seek, SeekFrom}; | use std::io::{Read, Seek, SeekFrom}; | ||||||
| use std::time::Instant; |  | ||||||
|  |  | ||||||
|  | use byteorder::ReadBytesExt; | ||||||
|  | use fxhash::FxHashMap; | ||||||
|  | use heed::RoTxn; | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use log::info; | use obkv::{KvReader, KvWriter}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::{Map, Value}; | use serde_json::{Map, Value}; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; | ||||||
|     create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, |  | ||||||
| }; |  | ||||||
| use super::{IndexDocumentsMethod, IndexerConfig}; | use super::{IndexDocumentsMethod, IndexerConfig}; | ||||||
| use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; | use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; | ||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| use crate::index::db_name; | use crate::index::db_name; | ||||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||||
| use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; | use crate::{ | ||||||
|  |     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, | ||||||
|  |     Result, BEU32, | ||||||
|  | }; | ||||||
|  |  | ||||||
| const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; | ||||||
|  |  | ||||||
| @@ -30,7 +33,8 @@ pub struct TransformOutput { | |||||||
|     pub new_documents_ids: RoaringBitmap, |     pub new_documents_ids: RoaringBitmap, | ||||||
|     pub replaced_documents_ids: RoaringBitmap, |     pub replaced_documents_ids: RoaringBitmap, | ||||||
|     pub documents_count: usize, |     pub documents_count: usize, | ||||||
|     pub documents_file: File, |     pub original_documents: File, | ||||||
|  |     pub flattened_documents: File, | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Extract the external ids, deduplicate and compute the new internal documents ids | /// Extract the external ids, deduplicate and compute the new internal documents ids | ||||||
| @@ -41,11 +45,17 @@ pub struct TransformOutput { | |||||||
| /// containing all those documents. | /// containing all those documents. | ||||||
| pub struct Transform<'a, 'i> { | pub struct Transform<'a, 'i> { | ||||||
|     pub index: &'i Index, |     pub index: &'i Index, | ||||||
|  |     fields_ids_map: FieldsIdsMap, | ||||||
|  |  | ||||||
|     indexer_settings: &'a IndexerConfig, |     indexer_settings: &'a IndexerConfig, | ||||||
|     pub autogenerate_docids: bool, |     pub autogenerate_docids: bool, | ||||||
|     pub index_documents_method: IndexDocumentsMethod, |     pub index_documents_method: IndexDocumentsMethod, | ||||||
|  |  | ||||||
|     sorter: grenad::Sorter<MergeFn>, |     original_sorter: grenad::Sorter<MergeFn>, | ||||||
|  |     flattened_sorter: grenad::Sorter<MergeFn>, | ||||||
|  |     replaced_documents_ids: RoaringBitmap, | ||||||
|  |     new_documents_ids: RoaringBitmap, | ||||||
|  |     new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>, | ||||||
|     documents_count: usize, |     documents_count: usize, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -72,6 +82,9 @@ fn create_fields_mapping( | |||||||
|         .collect() |         .collect() | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. | ||||||
|  | /// It doesn't look in the subfield because we don't want to enable the | ||||||
|  | /// primary key inference on nested objects. | ||||||
| fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | ||||||
|     index |     index | ||||||
|         .iter() |         .iter() | ||||||
| @@ -83,11 +96,12 @@ fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { | |||||||
|  |  | ||||||
| impl<'a, 'i> Transform<'a, 'i> { | impl<'a, 'i> Transform<'a, 'i> { | ||||||
|     pub fn new( |     pub fn new( | ||||||
|  |         wtxn: &mut heed::RwTxn, | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         indexer_settings: &'a IndexerConfig, |         indexer_settings: &'a IndexerConfig, | ||||||
|         index_documents_method: IndexDocumentsMethod, |         index_documents_method: IndexDocumentsMethod, | ||||||
|         autogenerate_docids: bool, |         autogenerate_docids: bool, | ||||||
|     ) -> Self { |     ) -> Result<Self> { | ||||||
|         // We must choose the appropriate merge function for when two or more documents |         // We must choose the appropriate merge function for when two or more documents | ||||||
|         // with the same user id must be merged or fully replaced in the same batch. |         // with the same user id must be merged or fully replaced in the same batch. | ||||||
|         let merge_function = match index_documents_method { |         let merge_function = match index_documents_method { | ||||||
| @@ -96,22 +110,36 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // We initialize the sorter with the user indexing settings. |         // We initialize the sorter with the user indexing settings. | ||||||
|         let sorter = create_sorter( |         let original_sorter = create_sorter( | ||||||
|             merge_function, |             merge_function, | ||||||
|             indexer_settings.chunk_compression_type, |             indexer_settings.chunk_compression_type, | ||||||
|             indexer_settings.chunk_compression_level, |             indexer_settings.chunk_compression_level, | ||||||
|             indexer_settings.max_nb_chunks, |             indexer_settings.max_nb_chunks, | ||||||
|             indexer_settings.max_memory, |             indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         Transform { |         // We initialize the sorter with the user indexing settings. | ||||||
|  |         let flattened_sorter = create_sorter( | ||||||
|  |             merge_function, | ||||||
|  |             indexer_settings.chunk_compression_type, | ||||||
|  |             indexer_settings.chunk_compression_level, | ||||||
|  |             indexer_settings.max_nb_chunks, | ||||||
|  |             indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         Ok(Transform { | ||||||
|             index, |             index, | ||||||
|  |             fields_ids_map: index.fields_ids_map(wtxn)?, | ||||||
|             indexer_settings, |             indexer_settings, | ||||||
|             autogenerate_docids, |             autogenerate_docids, | ||||||
|             sorter, |             original_sorter, | ||||||
|             documents_count: 0, |             flattened_sorter, | ||||||
|             index_documents_method, |             index_documents_method, | ||||||
|         } |             replaced_documents_ids: RoaringBitmap::new(), | ||||||
|  |             new_documents_ids: RoaringBitmap::new(), | ||||||
|  |             new_external_documents_ids_builder: FxHashMap::default(), | ||||||
|  |             documents_count: 0, | ||||||
|  |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn read_documents<R, F>( |     pub fn read_documents<R, F>( | ||||||
| @@ -125,8 +153,11 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         F: Fn(UpdateIndexingStep) + Sync, |         F: Fn(UpdateIndexingStep) + Sync, | ||||||
|     { |     { | ||||||
|         let fields_index = reader.index(); |         let fields_index = reader.index(); | ||||||
|         let mut fields_ids_map = self.index.fields_ids_map(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||||
|         let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?; |         let documents_ids = self.index.documents_ids(wtxn)?; | ||||||
|  |         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); | ||||||
|  |  | ||||||
|  |         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; | ||||||
|  |  | ||||||
|         let alternative_name = self |         let alternative_name = self | ||||||
|             .index |             .index | ||||||
| @@ -136,15 +167,19 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|         let (primary_key_id, primary_key_name) = compute_primary_key_pair( |         let (primary_key_id, primary_key_name) = compute_primary_key_pair( | ||||||
|             self.index.primary_key(wtxn)?, |             self.index.primary_key(wtxn)?, | ||||||
|             &mut fields_ids_map, |             &mut self.fields_ids_map, | ||||||
|             alternative_name, |             alternative_name, | ||||||
|             self.autogenerate_docids, |             self.autogenerate_docids, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|  |         let primary_key_id_nested = primary_key_name.contains('.'); | ||||||
|  |  | ||||||
|  |         let mut flattened_document = None; | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut obkv_buffer = Vec::new(); | ||||||
|  |         let mut flattened_obkv_buffer = Vec::new(); | ||||||
|         let mut documents_count = 0; |         let mut documents_count = 0; | ||||||
|         let mut external_id_buffer = Vec::new(); |         let mut external_id_buffer = Vec::new(); | ||||||
|         let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); |         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); | ||||||
|         while let Some((addition_index, document)) = reader.next_document_with_index()? { |         while let Some((addition_index, document)) = reader.next_document_with_index()? { | ||||||
|             let mut field_buffer_cache = drop_and_reuse(field_buffer); |             let mut field_buffer_cache = drop_and_reuse(field_buffer); | ||||||
|             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { |             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||||
| @@ -154,8 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             for (k, v) in document.iter() { |             for (k, v) in document.iter() { | ||||||
|                 let mapped_id = *mapping.get(&k).unwrap(); |                 let mapped_id = | ||||||
|                 field_buffer_cache.push((mapped_id, v)); |                     *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; | ||||||
|  |                 field_buffer_cache.push((mapped_id, Cow::from(v))); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // We need to make sure that every document has a primary key. After we have remapped |             // We need to make sure that every document has a primary key. After we have remapped | ||||||
| @@ -164,87 +200,125 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             // document. If none is found, and we were told to generate missing document ids, then |             // document. If none is found, and we were told to generate missing document ids, then | ||||||
|             // we create the missing field, and update the new document. |             // we create the missing field, and update the new document. | ||||||
|             let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; |             let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; | ||||||
|             let external_id = |             let external_id = if primary_key_id_nested { | ||||||
|                 match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { |                 let mut field_buffer_cache = field_buffer_cache.clone(); | ||||||
|                     Some((_, bytes)) => { |                 self.flatten_from_field_mapping( | ||||||
|                         let value = match serde_json::from_slice(bytes).unwrap() { |                     &mapping, | ||||||
|                             Value::String(string) => match validate_document_id(&string) { |                     &document, | ||||||
|                                 Some(s) if s.len() == string.len() => string, |                     &mut flattened_obkv_buffer, | ||||||
|                                 Some(s) => s.to_string(), |                     &mut field_buffer_cache, | ||||||
|                                 None => { |                 )?; | ||||||
|                                     return Err(UserError::InvalidDocumentId { |                 flattened_document = Some(&flattened_obkv_buffer); | ||||||
|                                         document_id: Value::String(string), |                 let document = KvReader::new(&flattened_obkv_buffer); | ||||||
|                                     } |  | ||||||
|                                     .into()) |  | ||||||
|                                 } |  | ||||||
|                             }, |  | ||||||
|                             Value::Number(number) => number.to_string(), |  | ||||||
|                             content => { |  | ||||||
|                                 return Err(UserError::InvalidDocumentId { |  | ||||||
|                                     document_id: content.clone(), |  | ||||||
|                                 } |  | ||||||
|                                 .into()) |  | ||||||
|                             } |  | ||||||
|                         }; |  | ||||||
|                         serde_json::to_writer(&mut external_id_buffer, &value).unwrap(); |  | ||||||
|                         Cow::Owned(value) |  | ||||||
|                     } |  | ||||||
|                     None => { |  | ||||||
|                         if !self.autogenerate_docids { |  | ||||||
|                             let mut json = Map::new(); |  | ||||||
|                             for (key, value) in document.iter() { |  | ||||||
|                                 let key = addition_index.name(key).cloned(); |  | ||||||
|                                 let value = serde_json::from_slice::<Value>(&value).ok(); |  | ||||||
|  |  | ||||||
|                                 if let Some((k, v)) = key.zip(value) { |                 update_primary_key( | ||||||
|                                     json.insert(k, v); |                     document, | ||||||
|                                 } |                     &addition_index, | ||||||
|                             } |                     primary_key_id, | ||||||
|  |                     &primary_key_name, | ||||||
|                             return Err(UserError::MissingDocumentId { |                     &mut uuid_buffer, | ||||||
|                                 primary_key: primary_key_name, |                     &mut field_buffer_cache, | ||||||
|                                 document: json, |                     &mut external_id_buffer, | ||||||
|                             } |                     self.autogenerate_docids, | ||||||
|                             .into()); |                 )? | ||||||
|                         } |             } else { | ||||||
|  |                 update_primary_key( | ||||||
|                         let uuid = |                     document, | ||||||
|                             uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); |                     &addition_index, | ||||||
|                         serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap(); |                     primary_key_id, | ||||||
|                         field_buffer_cache.push((primary_key_id, &external_id_buffer)); |                     &primary_key_name, | ||||||
|                         Cow::Borrowed(&*uuid) |                     &mut uuid_buffer, | ||||||
|                     } |                     &mut field_buffer_cache, | ||||||
|                 }; |                     &mut external_id_buffer, | ||||||
|  |                     self.autogenerate_docids, | ||||||
|  |                 )? | ||||||
|  |             }; | ||||||
|  |  | ||||||
|             // Insertion in a obkv need to be done with keys ordered. For now they are ordered |             // Insertion in a obkv need to be done with keys ordered. For now they are ordered | ||||||
|             // according to the document addition key order, so we sort it according to the |             // according to the document addition key order, so we sort it according to the | ||||||
|             // fieldids map keys order. |             // fieldids map keys order. | ||||||
|             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); |             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); | ||||||
|  |  | ||||||
|             // The last step is to build the new obkv document, and insert it in the sorter. |             // Build the new obkv document. | ||||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); |             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||||
|             for (k, v) in field_buffer_cache.iter() { |             for (k, v) in field_buffer_cache.iter() { | ||||||
|                 writer.insert(*k, v)?; |                 writer.insert(*k, v)?; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  |             let (docid, should_insert_original_document) = | ||||||
|  |                 match external_documents_ids.get(&*external_id) { | ||||||
|  |                     // if the document is in the db but has already been inserted | ||||||
|  |                     // (ie: already exists in the list of replaced documents ids), | ||||||
|  |                     // we should not add the original document a second time. | ||||||
|  |                     Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), | ||||||
|  |                     None => { | ||||||
|  |                         // if the document has already been inserted in this | ||||||
|  |                         // batch we need to get its docid | ||||||
|  |                         match self | ||||||
|  |                             .new_external_documents_ids_builder | ||||||
|  |                             .entry(external_id.as_bytes().to_vec()) | ||||||
|  |                         { | ||||||
|  |                             Entry::Occupied(entry) => (*entry.get() as u32, false), | ||||||
|  |                             // if the document has never been encountered we give it a new docid | ||||||
|  |                             // and push this new docid to the external documents ids builder | ||||||
|  |                             Entry::Vacant(entry) => { | ||||||
|  |                                 let new_docid = available_documents_ids | ||||||
|  |                                     .next() | ||||||
|  |                                     .ok_or(UserError::DocumentLimitReached)?; | ||||||
|  |                                 entry.insert(new_docid as u64); | ||||||
|  |                                 (new_docid, false) | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |             if should_insert_original_document { | ||||||
|  |                 self.replaced_documents_ids.insert(docid); | ||||||
|  |  | ||||||
|  |                 let key = BEU32::new(docid); | ||||||
|  |                 let base_obkv = self | ||||||
|  |                     .index | ||||||
|  |                     .documents | ||||||
|  |                     .remap_data_type::<heed::types::ByteSlice>() | ||||||
|  |                     .get(wtxn, &key)? | ||||||
|  |                     .ok_or(InternalError::DatabaseMissingEntry { | ||||||
|  |                         db_name: db_name::DOCUMENTS, | ||||||
|  |                         key: None, | ||||||
|  |                     })?; | ||||||
|  |  | ||||||
|  |                 self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; | ||||||
|  |                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; | ||||||
|  |  | ||||||
|  |                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||||
|  |             } else { | ||||||
|  |                 self.new_documents_ids.insert(docid); | ||||||
|  |             } | ||||||
|  |  | ||||||
|             // We use the extracted/generated user id as the key for this document. |             // We use the extracted/generated user id as the key for this document. | ||||||
|             self.sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; |             self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; | ||||||
|             documents_count += 1; |             documents_count += 1; | ||||||
|  |  | ||||||
|  |             if let Some(flatten) = flattened_document { | ||||||
|  |                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; | ||||||
|  |             } else { | ||||||
|  |                 let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; | ||||||
|  |                 self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; | ||||||
|  |             } | ||||||
|  |  | ||||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { |             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||||
|                 documents_seen: documents_count, |                 documents_seen: documents_count, | ||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             obkv_buffer.clear(); |  | ||||||
|             field_buffer = drop_and_reuse(field_buffer_cache); |             field_buffer = drop_and_reuse(field_buffer_cache); | ||||||
|             external_id_buffer.clear(); |             external_id_buffer.clear(); | ||||||
|  |             obkv_buffer.clear(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         progress_callback(UpdateIndexingStep::RemapDocumentAddition { |         progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||||
|             documents_seen: documents_count, |             documents_seen: documents_count, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         self.index.put_fields_ids_map(wtxn, &fields_ids_map)?; |         self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; | ||||||
|         self.index.put_primary_key(wtxn, &primary_key_name)?; |         self.index.put_primary_key(wtxn, &primary_key_name)?; | ||||||
|         self.documents_count += documents_count; |         self.documents_count += documents_count; | ||||||
|         // Now that we have a valid sorter that contains the user id and the obkv we |         // Now that we have a valid sorter that contains the user id and the obkv we | ||||||
| @@ -252,6 +326,87 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         Ok(documents_count) |         Ok(documents_count) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // Flatten a document from the fields ids map contained in self and insert the new | ||||||
|  |     // created fields. | ||||||
|  |     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Vec<u8>> { | ||||||
|  |         let mut doc = serde_json::Map::new(); | ||||||
|  |  | ||||||
|  |         for (k, v) in obkv.iter() { | ||||||
|  |             let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|  |                 field_id: k, | ||||||
|  |                 process: "Flatten from fields ids map.", | ||||||
|  |             })?; | ||||||
|  |             let value = serde_json::from_slice::<serde_json::Value>(v) | ||||||
|  |                 .map_err(crate::error::InternalError::SerdeJson)?; | ||||||
|  |             doc.insert(key.to_string(), value); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let flattened = flatten_serde_json::flatten(&doc); | ||||||
|  |  | ||||||
|  |         // Once we have the flattened version we can convert it back to obkv and | ||||||
|  |         // insert all the new generated fields_ids (if any) in the fields ids map. | ||||||
|  |         let mut buffer: Vec<u8> = Vec::new(); | ||||||
|  |         let mut writer = KvWriter::new(&mut buffer); | ||||||
|  |         let mut flattened: Vec<_> = flattened.into_iter().collect(); | ||||||
|  |         // we reorder the field to get all the known field first | ||||||
|  |         flattened | ||||||
|  |             .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX)); | ||||||
|  |  | ||||||
|  |         for (key, value) in flattened { | ||||||
|  |             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|  |             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||||
|  |             writer.insert(fid, &value)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(buffer) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // Flatten a document from a field mapping generated by [create_fields_mapping] | ||||||
|  |     fn flatten_from_field_mapping( | ||||||
|  |         &mut self, | ||||||
|  |         mapping: &HashMap<FieldId, FieldId>, | ||||||
|  |         obkv: &KvReader<FieldId>, | ||||||
|  |         output_buffer: &mut Vec<u8>, | ||||||
|  |         field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, | ||||||
|  |     ) -> Result<()> { | ||||||
|  |         // if the primary_key is nested we need to flatten the document before being able to do anything | ||||||
|  |         let mut doc = serde_json::Map::new(); | ||||||
|  |  | ||||||
|  |         for (k, v) in obkv.iter() { | ||||||
|  |             let key = | ||||||
|  |                 mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; | ||||||
|  |             let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|  |                 field_id: *key, | ||||||
|  |                 process: "Flatten from field mapping.", | ||||||
|  |             })?; | ||||||
|  |             let value = | ||||||
|  |                 serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?; | ||||||
|  |             doc.insert(key.to_string(), value); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let flattened = flatten_serde_json::flatten(&doc); | ||||||
|  |  | ||||||
|  |         // Once we have the flattened version we can convert it back to obkv and | ||||||
|  |         // insert all the new generated fields_ids (if any) in the fields ids map. | ||||||
|  |         output_buffer.clear(); | ||||||
|  |         let mut writer = KvWriter::new(output_buffer); | ||||||
|  |         let mut flattened: Vec<_> = flattened.into_iter().collect(); | ||||||
|  |         // we reorder the field to get all the known field first | ||||||
|  |         flattened | ||||||
|  |             .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX)); | ||||||
|  |  | ||||||
|  |         for (key, value) in flattened { | ||||||
|  |             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|  |             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||||
|  |             writer.insert(fid, &value)?; | ||||||
|  |             if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { | ||||||
|  |                 field_buffer_cache.push((fid, value.into())); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /// Generate the `TransformOutput` based on the given sorter that can be generated from any |     /// Generate the `TransformOutput` based on the given sorter that can be generated from any | ||||||
|     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document |     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document | ||||||
|     /// id for the user side and the value must be an obkv where keys are valid fields ids. |     /// id for the user side and the value must be an obkv where keys are valid fields ids. | ||||||
| @@ -268,110 +423,8 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             .primary_key(&wtxn)? |             .primary_key(&wtxn)? | ||||||
|             .ok_or(Error::UserError(UserError::MissingPrimaryKey))? |             .ok_or(Error::UserError(UserError::MissingPrimaryKey))? | ||||||
|             .to_string(); |             .to_string(); | ||||||
|         let fields_ids_map = self.index.fields_ids_map(wtxn)?; |  | ||||||
|         let approximate_number_of_documents = self.documents_count; |  | ||||||
|  |  | ||||||
|         let mut external_documents_ids = self.index.external_documents_ids(wtxn).unwrap(); |         let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||||
|         let documents_ids = self.index.documents_ids(wtxn)?; |  | ||||||
|         let mut field_distribution = self.index.field_distribution(wtxn)?; |  | ||||||
|         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); |  | ||||||
|  |  | ||||||
|         // consume sorter, in order to free the internal allocation, before creating a new one. |  | ||||||
|         let mut iter = self.sorter.into_stream_merger_iter()?; |  | ||||||
|  |  | ||||||
|         // Once we have sort and deduplicated the documents we write them into a final file. |  | ||||||
|         let mut final_sorter = create_sorter( |  | ||||||
|             |_id, obkvs| { |  | ||||||
|                 if obkvs.len() == 1 { |  | ||||||
|                     Ok(obkvs[0].clone()) |  | ||||||
|                 } else { |  | ||||||
|                     Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) |  | ||||||
|                 } |  | ||||||
|             }, |  | ||||||
|             self.indexer_settings.chunk_compression_type, |  | ||||||
|             self.indexer_settings.chunk_compression_level, |  | ||||||
|             self.indexer_settings.max_nb_chunks, |  | ||||||
|             self.indexer_settings.max_memory, |  | ||||||
|         ); |  | ||||||
|         let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); |  | ||||||
|         let mut replaced_documents_ids = RoaringBitmap::new(); |  | ||||||
|         let mut new_documents_ids = RoaringBitmap::new(); |  | ||||||
|         let mut obkv_buffer = Vec::new(); |  | ||||||
|  |  | ||||||
|         // While we write into final file we get or generate the internal documents ids. |  | ||||||
|         let mut documents_count = 0; |  | ||||||
|         while let Some((external_id, update_obkv)) = iter.next()? { |  | ||||||
|             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { |  | ||||||
|                 progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { |  | ||||||
|                     documents_seen: documents_count, |  | ||||||
|                     total_documents: approximate_number_of_documents, |  | ||||||
|                 }); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             let (docid, obkv) = match external_documents_ids.get(external_id) { |  | ||||||
|                 Some(docid) => { |  | ||||||
|                     // If we find the user id in the current external documents ids map |  | ||||||
|                     // we use it and insert it in the list of replaced documents. |  | ||||||
|                     replaced_documents_ids.insert(docid); |  | ||||||
|  |  | ||||||
|                     let key = BEU32::new(docid); |  | ||||||
|                     let base_obkv = self.index.documents.get(wtxn, &key)?.ok_or( |  | ||||||
|                         InternalError::DatabaseMissingEntry { |  | ||||||
|                             db_name: db_name::DOCUMENTS, |  | ||||||
|                             key: None, |  | ||||||
|                         }, |  | ||||||
|                     )?; |  | ||||||
|  |  | ||||||
|                     // we remove all the fields that were already counted |  | ||||||
|                     for (field_id, _) in base_obkv.iter() { |  | ||||||
|                         let field_name = fields_ids_map.name(field_id).unwrap(); |  | ||||||
|                         if let Entry::Occupied(mut entry) = |  | ||||||
|                             field_distribution.entry(field_name.to_string()) |  | ||||||
|                         { |  | ||||||
|                             match entry.get().checked_sub(1) { |  | ||||||
|                                 Some(0) | None => entry.remove(), |  | ||||||
|                                 Some(count) => entry.insert(count), |  | ||||||
|                             }; |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     // Depending on the update indexing method we will merge |  | ||||||
|                     // the document update with the current document or not. |  | ||||||
|                     match self.index_documents_method { |  | ||||||
|                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), |  | ||||||
|                         IndexDocumentsMethod::UpdateDocuments => { |  | ||||||
|                             let update_obkv = obkv::KvReader::new(update_obkv); |  | ||||||
|                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); |  | ||||||
|                             (docid, obkv_buffer.as_slice()) |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 None => { |  | ||||||
|                     // If this user id is new we add it to the external documents ids map |  | ||||||
|                     // for new ids and into the list of new documents. |  | ||||||
|                     let new_docid = |  | ||||||
|                         available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; |  | ||||||
|                     new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; |  | ||||||
|                     new_documents_ids.insert(new_docid); |  | ||||||
|                     (new_docid, update_obkv) |  | ||||||
|                 } |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             // We insert the document under the documents ids map into the final file. |  | ||||||
|             final_sorter.insert(docid.to_be_bytes(), obkv)?; |  | ||||||
|             documents_count += 1; |  | ||||||
|  |  | ||||||
|             let reader = obkv::KvReader::new(obkv); |  | ||||||
|             for (field_id, _) in reader.iter() { |  | ||||||
|                 let field_name = fields_ids_map.name(field_id).unwrap(); |  | ||||||
|                 *field_distribution.entry(field_name.to_string()).or_default() += 1; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { |  | ||||||
|             documents_seen: documents_count, |  | ||||||
|             total_documents: documents_count, |  | ||||||
|         }); |  | ||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|         let mut writer = create_writer( |         let mut writer = create_writer( | ||||||
| @@ -380,28 +433,103 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             tempfile::tempfile()?, |             tempfile::tempfile()?, | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|  |         // Once we have all the documents in the sorter, we write the documents | ||||||
|  |         // in the writer. We also generate the field distribution. | ||||||
|  |         let mut field_distribution = self.index.field_distribution(wtxn)?; | ||||||
|  |         let mut iter = self.original_sorter.into_stream_merger_iter()?; | ||||||
|  |         // used only for the callback | ||||||
|  |         let mut documents_count = 0; | ||||||
|  |  | ||||||
|  |         while let Some((key, val)) = iter.next()? { | ||||||
|  |             // send a callback to show at which step we are | ||||||
|  |             documents_count += 1; | ||||||
|  |             progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { | ||||||
|  |                 documents_seen: documents_count, | ||||||
|  |                 total_documents: self.documents_count, | ||||||
|  |             }); | ||||||
|  |  | ||||||
|  |             let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?; | ||||||
|  |             // if the document was already in the db we remove all of its field | ||||||
|  |             // from the field distribution. | ||||||
|  |             if self.replaced_documents_ids.contains(u32_key) { | ||||||
|  |                 let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or( | ||||||
|  |                     InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||||
|  |                 )?; | ||||||
|  |  | ||||||
|  |                 for (key, _) in obkv.iter() { | ||||||
|  |                     let name = | ||||||
|  |                         self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|  |                             field_id: key, | ||||||
|  |                             process: "Computing field distribution in transform.", | ||||||
|  |                         })?; | ||||||
|  |                     // We checked that the document was in the db earlier. If we can't find it it means | ||||||
|  |                     // there is an inconsistency between the field distribution and the field id map. | ||||||
|  |                     let field = field_distribution.get_mut(name).ok_or( | ||||||
|  |                         FieldIdMapMissingEntry::FieldId { | ||||||
|  |                             field_id: key, | ||||||
|  |                             process: "Accessing field distribution in transform.", | ||||||
|  |                         }, | ||||||
|  |                     )?; | ||||||
|  |                     *field -= 1; | ||||||
|  |                     if *field == 0 { | ||||||
|  |                         // since we were able to get the field right before it's safe to unwrap here | ||||||
|  |                         field_distribution.remove(name).unwrap(); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // We increment all the field of the current document in the field distribution. | ||||||
|  |             let obkv = KvReader::new(val); | ||||||
|  |  | ||||||
|  |             for (key, _) in obkv.iter() { | ||||||
|  |                 let name = | ||||||
|  |                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|  |                         field_id: key, | ||||||
|  |                         process: "Computing field distribution in transform.", | ||||||
|  |                     })?; | ||||||
|  |                 *field_distribution.entry(name.to_string()).or_insert(0) += 1; | ||||||
|  |             } | ||||||
|  |             writer.insert(key, val)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let mut original_documents = writer.into_inner()?; | ||||||
|  |         // We then extract the file and reset the seek to be able to read it again. | ||||||
|  |         original_documents.seek(SeekFrom::Start(0))?; | ||||||
|  |  | ||||||
|  |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|  |         let mut writer = create_writer( | ||||||
|  |             self.indexer_settings.chunk_compression_type, | ||||||
|  |             self.indexer_settings.chunk_compression_level, | ||||||
|  |             tempfile::tempfile()?, | ||||||
|  |         ); | ||||||
|         // Once we have written all the documents into the final sorter, we write the documents |         // Once we have written all the documents into the final sorter, we write the documents | ||||||
|         // into this writer, extract the file and reset the seek to be able to read it again. |         // into this writer, extract the file and reset the seek to be able to read it again. | ||||||
|         final_sorter.write_into_stream_writer(&mut writer)?; |         self.flattened_sorter.write_into_stream_writer(&mut writer)?; | ||||||
|         let mut documents_file = writer.into_inner()?; |         let mut flattened_documents = writer.into_inner()?; | ||||||
|         documents_file.seek(SeekFrom::Start(0))?; |         flattened_documents.seek(SeekFrom::Start(0))?; | ||||||
|  |  | ||||||
|         let before_docids_merging = Instant::now(); |         let mut new_external_documents_ids_builder: Vec<_> = | ||||||
|         // We merge the new external ids with existing external documents ids. |             self.new_external_documents_ids_builder.into_iter().collect(); | ||||||
|         let new_external_documents_ids = new_external_documents_ids_builder.into_map(); |  | ||||||
|  |         new_external_documents_ids_builder | ||||||
|  |             .sort_unstable_by(|(left, _), (right, _)| left.cmp(&right)); | ||||||
|  |         let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory(); | ||||||
|  |         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { | ||||||
|  |             fst_new_external_documents_ids_builder.insert(key, value) | ||||||
|  |         })?; | ||||||
|  |         let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); | ||||||
|         external_documents_ids.insert_ids(&new_external_documents_ids)?; |         external_documents_ids.insert_ids(&new_external_documents_ids)?; | ||||||
|  |  | ||||||
|         info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); |  | ||||||
|  |  | ||||||
|         Ok(TransformOutput { |         Ok(TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map, |             fields_ids_map: self.fields_ids_map, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             external_documents_ids: external_documents_ids.into_static(), |             external_documents_ids: external_documents_ids.into_static(), | ||||||
|             new_documents_ids, |             new_documents_ids: self.new_documents_ids, | ||||||
|             replaced_documents_ids, |             replaced_documents_ids: self.replaced_documents_ids, | ||||||
|             documents_count, |             documents_count: self.documents_count, | ||||||
|             documents_file, |             original_documents, | ||||||
|  |             flattened_documents, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -412,7 +540,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         self, |         self, | ||||||
|         wtxn: &mut heed::RwTxn, |         wtxn: &mut heed::RwTxn, | ||||||
|         old_fields_ids_map: FieldsIdsMap, |         old_fields_ids_map: FieldsIdsMap, | ||||||
|         new_fields_ids_map: FieldsIdsMap, |         mut new_fields_ids_map: FieldsIdsMap, | ||||||
|     ) -> Result<TransformOutput> { |     ) -> Result<TransformOutput> { | ||||||
|         // There already has been a document addition, the primary key should be set by now. |         // There already has been a document addition, the primary key should be set by now. | ||||||
|         let primary_key = |         let primary_key = | ||||||
| @@ -423,7 +551,14 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         let documents_count = documents_ids.len() as usize; |         let documents_count = documents_ids.len() as usize; | ||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|         let mut writer = create_writer( |         let mut original_writer = create_writer( | ||||||
|  |             self.indexer_settings.chunk_compression_type, | ||||||
|  |             self.indexer_settings.chunk_compression_level, | ||||||
|  |             tempfile::tempfile()?, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         // We create a final writer to write the new documents in order from the sorter. | ||||||
|  |         let mut flattened_writer = create_writer( | ||||||
|             self.indexer_settings.chunk_compression_type, |             self.indexer_settings.chunk_compression_type, | ||||||
|             self.indexer_settings.chunk_compression_level, |             self.indexer_settings.chunk_compression_level, | ||||||
|             tempfile::tempfile()?, |             tempfile::tempfile()?, | ||||||
| @@ -445,13 +580,51 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             let buffer = obkv_writer.into_inner()?; |             let buffer = obkv_writer.into_inner()?; | ||||||
|             writer.insert(docid.to_be_bytes(), buffer)?; |             original_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||||
|  |  | ||||||
|  |             // Once we have the document. We're going to flatten it | ||||||
|  |             // and insert it in the flattened sorter. | ||||||
|  |             let mut doc = serde_json::Map::new(); | ||||||
|  |  | ||||||
|  |             let reader = obkv::KvReader::new(buffer); | ||||||
|  |             for (k, v) in reader.iter() { | ||||||
|  |                 let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|  |                     field_id: k, | ||||||
|  |                     process: "Accessing field distribution in transform.", | ||||||
|  |                 })?; | ||||||
|  |                 let value = serde_json::from_slice::<serde_json::Value>(v) | ||||||
|  |                     .map_err(InternalError::SerdeJson)?; | ||||||
|  |                 doc.insert(key.to_string(), value); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let flattened = flatten_serde_json::flatten(&doc); | ||||||
|  |  | ||||||
|  |             // Once we have the flattened version we can convert it back to obkv and | ||||||
|  |             // insert all the new generated fields_ids (if any) in the fields ids map. | ||||||
|  |             let mut buffer: Vec<u8> = Vec::new(); | ||||||
|  |             let mut writer = KvWriter::new(&mut buffer); | ||||||
|  |             let mut flattened: Vec<_> = flattened.into_iter().collect(); | ||||||
|  |             // we reorder the field to get all the known field first | ||||||
|  |             flattened.sort_unstable_by_key(|(key, _)| { | ||||||
|  |                 new_fields_ids_map.id(&key).unwrap_or(FieldId::MAX) | ||||||
|  |             }); | ||||||
|  |  | ||||||
|  |             for (key, value) in flattened { | ||||||
|  |                 let fid = | ||||||
|  |                     new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|  |                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||||
|  |                 writer.insert(fid, &value)?; | ||||||
|  |             } | ||||||
|  |             flattened_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // Once we have written all the documents, we extract |         // Once we have written all the documents, we extract | ||||||
|         // the file and reset the seek to be able to read it again. |         // the file and reset the seek to be able to read it again. | ||||||
|         let mut documents_file = writer.into_inner()?; |         let mut original_documents = original_writer.into_inner()?; | ||||||
|         documents_file.seek(SeekFrom::Start(0))?; |         original_documents.seek(SeekFrom::Start(0))?; | ||||||
|  |  | ||||||
|  |         let mut flattened_documents = flattened_writer.into_inner()?; | ||||||
|  |         flattened_documents.seek(SeekFrom::Start(0))?; | ||||||
|  |  | ||||||
|         Ok(TransformOutput { |         Ok(TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
| @@ -461,7 +634,8 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             new_documents_ids: documents_ids, |             new_documents_ids: documents_ids, | ||||||
|             replaced_documents_ids: RoaringBitmap::default(), |             replaced_documents_ids: RoaringBitmap::default(), | ||||||
|             documents_count, |             documents_count, | ||||||
|             documents_file, |             original_documents, | ||||||
|  |             flattened_documents, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -521,11 +695,84 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> { | |||||||
|     vec.into_iter().map(|_| unreachable!()).collect() |     vec.into_iter().map(|_| unreachable!()).collect() | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn update_primary_key<'a>( | ||||||
|  |     document: KvReader<'a, FieldId>, | ||||||
|  |     addition_index: &DocumentsBatchIndex, | ||||||
|  |     primary_key_id: FieldId, | ||||||
|  |     primary_key_name: &str, | ||||||
|  |     uuid_buffer: &'a mut [u8; uuid::adapter::Hyphenated::LENGTH], | ||||||
|  |     field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, | ||||||
|  |     mut external_id_buffer: &'a mut Vec<u8>, | ||||||
|  |     autogenerate_docids: bool, | ||||||
|  | ) -> Result<Cow<'a, str>> { | ||||||
|  |     match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { | ||||||
|  |         Some((_, bytes)) => { | ||||||
|  |             let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { | ||||||
|  |                 Value::String(string) => match validate_document_id(&string) { | ||||||
|  |                     Some(s) if s.len() == string.len() => string, | ||||||
|  |                     Some(s) => s.to_string(), | ||||||
|  |                     None => { | ||||||
|  |                         return Err(UserError::InvalidDocumentId { | ||||||
|  |                             document_id: Value::String(string), | ||||||
|  |                         } | ||||||
|  |                         .into()) | ||||||
|  |                     } | ||||||
|  |                 }, | ||||||
|  |                 Value::Number(number) => number.to_string(), | ||||||
|  |                 content => { | ||||||
|  |                     return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into()) | ||||||
|  |                 } | ||||||
|  |             }; | ||||||
|  |             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; | ||||||
|  |             Ok(Cow::Owned(value)) | ||||||
|  |         } | ||||||
|  |         None if autogenerate_docids => { | ||||||
|  |             let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); | ||||||
|  |             serde_json::to_writer(&mut external_id_buffer, &uuid) | ||||||
|  |                 .map_err(InternalError::SerdeJson)?; | ||||||
|  |             field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); | ||||||
|  |             Ok(Cow::Borrowed(&*uuid)) | ||||||
|  |         } | ||||||
|  |         None => { | ||||||
|  |             let mut json = Map::new(); | ||||||
|  |             for (key, value) in document.iter() { | ||||||
|  |                 let key = addition_index.name(key).cloned(); | ||||||
|  |                 let value = serde_json::from_slice::<Value>(&value).ok(); | ||||||
|  |  | ||||||
|  |                 if let Some((k, v)) = key.zip(value) { | ||||||
|  |                     json.insert(k, v); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             Err(UserError::MissingDocumentId { | ||||||
|  |                 primary_key: primary_key_name.to_string(), | ||||||
|  |                 document: json, | ||||||
|  |             })? | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl TransformOutput { | ||||||
|  |     /// Collects the names in `fields_ids_map` that `crate::is_faceted` accepts given the index's user-defined faceted fields. | ||||||
|  |     pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> { | ||||||
|  |         let user_defined_facets = index.user_defined_faceted_fields(rtxn)?; | ||||||
|  |  | ||||||
|  |         Ok(self | ||||||
|  |             .fields_ids_map | ||||||
|  |             .names() | ||||||
|  |             .filter(|&field| crate::is_faceted(field, &user_defined_facets)) | ||||||
|  |             .map(|field| field.to_string()) | ||||||
|  |             .collect()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod test { | mod test { | ||||||
|     use super::*; |     use super::*; | ||||||
|  |  | ||||||
|     mod compute_primary_key { |     mod compute_primary_key { | ||||||
|  |         use big_s::S; | ||||||
|  |  | ||||||
|         use super::{compute_primary_key_pair, FieldsIdsMap}; |         use super::{compute_primary_key_pair, FieldsIdsMap}; | ||||||
|  |  | ||||||
|         #[test] |         #[test] | ||||||
| @@ -540,6 +787,18 @@ mod test { | |||||||
|             ); |             ); | ||||||
|             assert_eq!(result.unwrap(), (0, "toto".to_string())); |             assert_eq!(result.unwrap(), (0, "toto".to_string())); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|  |  | ||||||
|  |             // and with nested fields | ||||||
|  |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|  |             fields_map.insert("toto.tata").unwrap(); | ||||||
|  |             let result = compute_primary_key_pair( | ||||||
|  |                 Some("toto.tata"), | ||||||
|  |                 &mut fields_map, | ||||||
|  |                 Some(S("titi")), | ||||||
|  |                 false, | ||||||
|  |             ); | ||||||
|  |             assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); | ||||||
|  |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         #[test] |         #[test] | ||||||
| @@ -547,7 +806,7 @@ mod test { | |||||||
|             let mut fields_map = FieldsIdsMap::new(); |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|             let result = |             let result = | ||||||
|                 compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); |                 compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); | ||||||
|             assert_eq!(result.unwrap(), (0, "tata".to_string())); |             assert_eq!(result.unwrap(), (0, S("tata"))); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -555,7 +814,7 @@ mod test { | |||||||
|         fn should_return_default_if_both_are_none() { |         fn should_return_default_if_both_are_none() { | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |             let mut fields_map = FieldsIdsMap::new(); | ||||||
|             let result = compute_primary_key_pair(None, &mut fields_map, None, true); |             let result = compute_primary_key_pair(None, &mut fields_map, None, true); | ||||||
|             assert_eq!(result.unwrap(), (0, "id".to_string())); |             assert_eq!(result.unwrap(), (0, S("id"))); | ||||||
|             assert_eq!(fields_map.len(), 1); |             assert_eq!(fields_map.len(), 1); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -569,6 +828,7 @@ mod test { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     mod primary_key_inference { |     mod primary_key_inference { | ||||||
|  |         use big_s::S; | ||||||
|         use bimap::BiHashMap; |         use bimap::BiHashMap; | ||||||
|  |  | ||||||
|         use crate::documents::DocumentsBatchIndex; |         use crate::documents::DocumentsBatchIndex; | ||||||
| @@ -579,11 +839,11 @@ mod test { | |||||||
|             // We run the test multiple times to change the order in which the fields are iterated upon. |             // We run the test multiple times to change the order in which the fields are iterated upon. | ||||||
|             for _ in 1..50 { |             for _ in 1..50 { | ||||||
|                 let mut map = BiHashMap::new(); |                 let mut map = BiHashMap::new(); | ||||||
|                 map.insert(1, "fakeId".to_string()); |                 map.insert(1, S("fakeId")); | ||||||
|                 map.insert(2, "fakeId".to_string()); |                 map.insert(2, S("fakeId")); | ||||||
|                 map.insert(3, "fakeId".to_string()); |                 map.insert(3, S("fakeId")); | ||||||
|                 map.insert(4, "fakeId".to_string()); |                 map.insert(4, S("fakeId")); | ||||||
|                 map.insert(0, "realId".to_string()); |                 map.insert(0, S("realId")); | ||||||
|  |  | ||||||
|                 assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); |                 assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); | ||||||
|             } |             } | ||||||
|   | |||||||
| @@ -249,11 +249,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         let transform = Transform::new( |         let transform = Transform::new( | ||||||
|  |             self.wtxn, | ||||||
|             &self.index, |             &self.index, | ||||||
|             &self.indexer_config, |             &self.indexer_config, | ||||||
|             IndexDocumentsMethod::ReplaceDocuments, |             IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             false, |             false, | ||||||
|         ); |         )?; | ||||||
|  |  | ||||||
|         // We remap the documents fields based on the new `FieldsIdsMap`. |         // We remap the documents fields based on the new `FieldsIdsMap`. | ||||||
|         let output = transform.remap_index_documents( |         let output = transform.remap_index_documents( | ||||||
| @@ -262,6 +263,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|             fields_ids_map.clone(), |             fields_ids_map.clone(), | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|  |         let new_facets = output.compute_real_facets(self.wtxn, self.index)?; | ||||||
|  |         self.index.put_faceted_fields(self.wtxn, &new_facets)?; | ||||||
|  |  | ||||||
|         // We clear the full database (words-fst, documents ids and documents content). |         // We clear the full database (words-fst, documents ids and documents content). | ||||||
|         ClearDocuments::new(self.wtxn, self.index).execute()?; |         ClearDocuments::new(self.wtxn, self.index).execute()?; | ||||||
|  |  | ||||||
| @@ -273,7 +277,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|             &self.indexer_config, |             &self.indexer_config, | ||||||
|             IndexDocumentsConfig::default(), |             IndexDocumentsConfig::default(), | ||||||
|             &cb, |             &cb, | ||||||
|         ); |         )?; | ||||||
|         indexing_builder.execute_raw(output)?; |         indexing_builder.execute_raw(output)?; | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -583,7 +587,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     { |     { | ||||||
|         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; |         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; | ||||||
|  |  | ||||||
|         let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; |         let old_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; | ||||||
|         let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; |         let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; | ||||||
|  |  | ||||||
|         self.update_displayed()?; |         self.update_displayed()?; | ||||||
| @@ -599,7 +603,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|         // If there are new faceted fields we indicate that we must reindex as we must |         // If there are new faceted fields we indicate that we must reindex as we must | ||||||
|         // index new fields as facets. It means that the distinct attribute, |         // index new fields as facets. It means that the distinct attribute, | ||||||
|         // an Asc/Desc criterion or a filtered attribute has been added or removed. |         // an Asc/Desc criterion or a filtered attribute has been added or removed. | ||||||
|         let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?; |         let new_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; | ||||||
|         let faceted_updated = old_faceted_fields != new_faceted_fields; |         let faceted_updated = old_faceted_fields != new_faceted_fields; | ||||||
|  |  | ||||||
|         let stop_words_updated = self.update_stop_words()?; |         let stop_words_updated = self.update_stop_words()?; | ||||||
| @@ -651,7 +655,8 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -713,7 +718,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -764,7 +770,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -793,7 +800,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -846,7 +854,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -858,7 +867,6 @@ mod tests { | |||||||
|         // Only count the field_id 0 and level 0 facet values. |         // Only count the field_id 0 and level 0 facet values. | ||||||
|         // TODO we must support typed CSVs for numbers to be understood. |         // TODO we must support typed CSVs for numbers to be understood. | ||||||
|         let fidmap = index.fields_ids_map(&rtxn).unwrap(); |         let fidmap = index.fields_ids_map(&rtxn).unwrap(); | ||||||
|         println!("fidmap: {:?}", fidmap); |  | ||||||
|         for document in index.all_documents(&rtxn).unwrap() { |         for document in index.all_documents(&rtxn).unwrap() { | ||||||
|             let document = document.unwrap(); |             let document = document.unwrap(); | ||||||
|             let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1) |             let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1) | ||||||
| @@ -886,7 +894,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -927,7 +936,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -977,7 +987,51 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         builder.add_documents(content).unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // Run an empty query just to ensure that the search results are ordered. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); | ||||||
|  |  | ||||||
|  |         // Exactly three document ids must be returned. | ||||||
|  |         assert_eq!(documents_ids.len(), 3); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn set_nested_distinct_field() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |  | ||||||
|  |         // Set the distinct field to be the nested `person.age`. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = Settings::new(&mut wtxn, &index, &config); | ||||||
|  |         // Don't display the generated `id` field. | ||||||
|  |         builder.set_displayed_fields(vec![S("person")]); | ||||||
|  |         builder.set_distinct_field(S("person.age")); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |  | ||||||
|  |         // Then index some documents. | ||||||
|  |         let content = documents!([ | ||||||
|  |             { "person": { "name": "kevin", "age": 23 }}, | ||||||
|  |             { "person": { "name": "kevina", "age": 21 }}, | ||||||
|  |             { "person": { "name": "benoit", "age": 34 }}, | ||||||
|  |             { "person": { "name": "bernard", "age": 34 }}, | ||||||
|  |             { "person": { "name": "bertrand", "age": 34 }}, | ||||||
|  |             { "person": { "name": "bernie", "age": 34 }}, | ||||||
|  |             { "person": { "name": "ben", "age": 34 }} | ||||||
|  |         ]); | ||||||
|  |         let indexing_config = | ||||||
|  |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|  |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1008,7 +1062,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1037,7 +1092,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1115,7 +1171,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1252,7 +1309,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1314,7 +1372,8 @@ mod tests { | |||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let mut builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         builder.add_documents(content).unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -59,7 +59,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | |||||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; |     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|  |  | ||||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |     let mut builder = | ||||||
|  |         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|     let mut cursor = Cursor::new(Vec::new()); |     let mut cursor = Cursor::new(Vec::new()); | ||||||
|     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||||
|     let reader = Cursor::new(CONTENT.as_bytes()); |     let reader = Cursor::new(CONTENT.as_bytes()); | ||||||
|   | |||||||
| @@ -390,7 +390,8 @@ fn criteria_ascdesc() { | |||||||
|     // index documents |     // index documents | ||||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; |     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|     let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); |     let mut builder = | ||||||
|  |         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|  |  | ||||||
|     let mut cursor = Cursor::new(Vec::new()); |     let mut cursor = Cursor::new(Vec::new()); | ||||||
|     let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |     let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user