Mirror of https://github.com/meilisearch/meilisearch.git — synced 2025-10-29 23:16:26 +00:00
	Merge #561
561: Enriched documents batch reader r=curquiza a=Kerollmops ~This PR is based on #555 and must be rebased on main after it has been merged to ease the review.~ This PR contains the work in #555 and can be merged on main as soon as reviewed and approved. - [x] Create an `EnrichedDocumentsBatchReader` that contains the external documents id. - [x] Extract the primary key name and make it accessible in the `EnrichedDocumentsBatchReader`. - [x] Use the external id from the `EnrichedDocumentsBatchReader` in the `Transform::read_documents`. - [x] Remove the `update_primary_key` from the _transform.rs_ file. - [x] Really generate the auto-generated documents ids. - [x] Insert the (auto-generated) document ids in the document while processing it in `Transform::read_documents`. Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
		| @@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -169,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -184,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -223,11 +226,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -279,11 +283,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -294,19 +299,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -339,13 +346,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|  |  | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -377,12 +385,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -415,12 +424,13 @@ fn indexing_wiki(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -452,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -468,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -507,11 +519,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { | |||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); |                 let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -564,12 +577,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -581,24 +595,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { | |||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 let indexing_config = |                 let indexing_config = | ||||||
|                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |                     IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = |                 let documents = | ||||||
|                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); |                     utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -631,12 +647,13 @@ fn indexing_movies_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -667,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -682,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -720,11 +739,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -775,12 +795,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | |||||||
|                 // as we don't care about the time it takes. |                 // as we don't care about the time it takes. | ||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -791,21 +812,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); |                 let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -861,12 +884,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -922,11 +946,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -984,12 +1009,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); |                 let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1021,12 +1047,13 @@ fn indexing_geo(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); |                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -1058,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); |                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -1074,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|  |  | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); |                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
| @@ -1113,11 +1142,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { | |||||||
|                 let config = IndexerConfig::default(); |                 let config = IndexerConfig::default(); | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 let mut wtxn = index.write_txn().unwrap(); | ||||||
|                 let indexing_config = IndexDocumentsConfig::default(); |                 let indexing_config = IndexDocumentsConfig::default(); | ||||||
|                 let mut builder = |                 let builder = | ||||||
|                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) |                     IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) | ||||||
|                         .unwrap(); |                         .unwrap(); | ||||||
|                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); |                 let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); | ||||||
|                 builder.add_documents(documents).unwrap(); |                 let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |                 user_error.unwrap(); | ||||||
|                 builder.execute().unwrap(); |                 builder.execute().unwrap(); | ||||||
|                 wtxn.commit().unwrap(); |                 wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -7,12 +7,12 @@ use std::path::Path; | |||||||
|  |  | ||||||
| use criterion::BenchmarkId; | use criterion::BenchmarkId; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use milli::documents::DocumentBatchReader; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
|     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, |     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, | ||||||
| }; | }; | ||||||
| use milli::{Filter, Index}; | use milli::{Filter, Index, Object}; | ||||||
| use serde_json::{Map, Value}; | use serde_json::Value; | ||||||
|  |  | ||||||
| pub struct Conf<'a> { | pub struct Conf<'a> { | ||||||
|     /// where we are going to create our database.mmdb directory |     /// where we are going to create our database.mmdb directory | ||||||
| @@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index { | |||||||
|         update_method: IndexDocumentsMethod::ReplaceDocuments, |         update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|         ..Default::default() |         ..Default::default() | ||||||
|     }; |     }; | ||||||
|     let mut builder = |     let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |  | ||||||
|     let documents = documents_from(conf.dataset, conf.dataset_format); |     let documents = documents_from(conf.dataset, conf.dataset_format); | ||||||
|  |     let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|     builder.add_documents(documents).unwrap(); |     user_error.unwrap(); | ||||||
|  |  | ||||||
|     builder.execute().unwrap(); |     builder.execute().unwrap(); | ||||||
|     wtxn.commit().unwrap(); |     wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -140,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl BufRead + Seek> { | pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> { | ||||||
|     let reader = |     let reader = | ||||||
|         File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); |         File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); | ||||||
|     let reader = BufReader::new(reader); |     let reader = BufReader::new(reader); | ||||||
| @@ -150,39 +148,35 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<imp | |||||||
|         "jsonl" => documents_from_jsonl(reader).unwrap(), |         "jsonl" => documents_from_jsonl(reader).unwrap(), | ||||||
|         otherwise => panic!("invalid update format {:?}", otherwise), |         otherwise => panic!("invalid update format {:?}", otherwise), | ||||||
|     }; |     }; | ||||||
|     DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() |     DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |  | ||||||
|  |  | ||||||
|     let mut buf = String::new(); |     for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() { | ||||||
|  |         let object = result?; | ||||||
|     while reader.read_line(&mut buf)? > 0 { |         documents.append_json_object(&object)?; | ||||||
|         documents.extend_from_json(&mut buf.as_bytes())?; |  | ||||||
|         buf.clear(); |  | ||||||
|     } |     } | ||||||
|     documents.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |  | ||||||
|  |  | ||||||
|     documents.extend_from_json(reader)?; |     documents.append_json_array(reader)?; | ||||||
|     documents.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let csv = csv::Reader::from_reader(reader); | ||||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |     documents.append_csv(csv)?; | ||||||
|  |  | ||||||
|  |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| enum AllowedType { | enum AllowedType { | ||||||
| @@ -222,14 +216,14 @@ impl<R: Read> CSVDocumentDeserializer<R> { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl<R: Read> Iterator for CSVDocumentDeserializer<R> { | impl<R: Read> Iterator for CSVDocumentDeserializer<R> { | ||||||
|     type Item = anyhow::Result<Map<String, Value>>; |     type Item = anyhow::Result<Object>; | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |     fn next(&mut self) -> Option<Self::Item> { | ||||||
|         let csv_document = self.documents.next()?; |         let csv_document = self.documents.next()?; | ||||||
|  |  | ||||||
|         match csv_document { |         match csv_document { | ||||||
|             Ok(csv_document) => { |             Ok(csv_document) => { | ||||||
|                 let mut document = Map::new(); |                 let mut document = Object::new(); | ||||||
|  |  | ||||||
|                 for ((field_name, field_type), value) in |                 for ((field_name, field_type), value) in | ||||||
|                     self.headers.iter().zip(csv_document.into_iter()) |                     self.headers.iter().zip(csv_document.into_iter()) | ||||||
|   | |||||||
| @@ -8,12 +8,12 @@ use std::time::Instant; | |||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use eyre::Result; | use eyre::Result; | ||||||
| use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; | use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; | ||||||
|  | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::UpdateIndexingStep::{ | use milli::update::UpdateIndexingStep::{ | ||||||
|     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, |     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, | ||||||
| }; | }; | ||||||
| use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; | use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; | ||||||
| use milli::Index; | use milli::{Index, Object}; | ||||||
| use serde_json::{Map, Value}; |  | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| @@ -225,9 +225,9 @@ impl Performer for DocumentAddition { | |||||||
|             DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, |             DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; |         let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; | ||||||
|  |  | ||||||
|         println!("Adding {} documents to the index.", reader.len()); |         println!("Adding {} documents to the index.", reader.documents_count()); | ||||||
|  |  | ||||||
|         let mut txn = index.write_txn()?; |         let mut txn = index.write_txn()?; | ||||||
|         let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; |         let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; | ||||||
| @@ -255,7 +255,7 @@ impl Performer for DocumentAddition { | |||||||
|             let bar = progesses.add(bar); |             let bar = progesses.add(bar); | ||||||
|             bars.push(bar); |             bars.push(bar); | ||||||
|         } |         } | ||||||
|         let mut addition = milli::update::IndexDocuments::new( |         let addition = milli::update::IndexDocuments::new( | ||||||
|             &mut txn, |             &mut txn, | ||||||
|             &index, |             &index, | ||||||
|             &config, |             &config, | ||||||
| @@ -263,7 +263,10 @@ impl Performer for DocumentAddition { | |||||||
|             |step| indexing_callback(step, &bars), |             |step| indexing_callback(step, &bars), | ||||||
|         ) |         ) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|         addition.add_documents(reader)?; |         let (addition, user_error) = addition.add_documents(reader)?; | ||||||
|  |         if let Err(error) = user_error { | ||||||
|  |             return Err(error.into()); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         std::thread::spawn(move || { |         std::thread::spawn(move || { | ||||||
|             progesses.join().unwrap(); |             progesses.join().unwrap(); | ||||||
| @@ -321,35 +324,32 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa | |||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> { | fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |     let reader = BufReader::new(reader); | ||||||
|  |  | ||||||
|     let mut buf = String::new(); |     for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() { | ||||||
|     let mut reader = BufReader::new(reader); |         let object = result?; | ||||||
|  |         documents.append_json_object(&object)?; | ||||||
|     while reader.read_line(&mut buf)? > 0 { |  | ||||||
|         documents.extend_from_json(&mut buf.as_bytes())?; |  | ||||||
|     } |     } | ||||||
|     documents.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> { | fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |  | ||||||
|  |  | ||||||
|     documents.extend_from_json(reader)?; |     documents.append_json_array(reader)?; | ||||||
|     documents.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> { | fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let csv = csv::Reader::from_reader(reader); | ||||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |     documents.append_csv(csv)?; | ||||||
|  |  | ||||||
|  |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, StructOpt)] | #[derive(Debug, StructOpt)] | ||||||
| @@ -423,7 +423,7 @@ impl Search { | |||||||
|         filter: &Option<String>, |         filter: &Option<String>, | ||||||
|         offset: &Option<usize>, |         offset: &Option<usize>, | ||||||
|         limit: &Option<usize>, |         limit: &Option<usize>, | ||||||
|     ) -> Result<Vec<Map<String, Value>>> { |     ) -> Result<Vec<Object>> { | ||||||
|         let txn = index.read_txn()?; |         let txn = index.read_txn()?; | ||||||
|         let mut search = index.search(&txn); |         let mut search = index.search(&txn); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ mod update_store; | |||||||
| use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | ||||||
| use std::fmt::Display; | use std::fmt::Display; | ||||||
| use std::fs::{create_dir_all, File}; | use std::fs::{create_dir_all, File}; | ||||||
| use std::io::{BufRead, BufReader, Cursor, Read}; | use std::io::{BufReader, Cursor, Read}; | ||||||
| use std::net::SocketAddr; | use std::net::SocketAddr; | ||||||
| use std::num::{NonZeroU32, NonZeroUsize}; | use std::num::{NonZeroU32, NonZeroUsize}; | ||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
| @@ -18,7 +18,7 @@ use either::Either; | |||||||
| use flate2::read::GzDecoder; | use flate2::read::GzDecoder; | ||||||
| use futures::{stream, FutureExt, StreamExt}; | use futures::{stream, FutureExt, StreamExt}; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use milli::documents::DocumentBatchReader; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::tokenizer::TokenizerBuilder; | use milli::tokenizer::TokenizerBuilder; | ||||||
| use milli::update::UpdateIndexingStep::*; | use milli::update::UpdateIndexingStep::*; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
| @@ -26,11 +26,11 @@ use milli::update::{ | |||||||
| }; | }; | ||||||
| use milli::{ | use milli::{ | ||||||
|     obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, |     obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, | ||||||
|     MatcherBuilder, SearchResult, SortError, |     MatcherBuilder, Object, SearchResult, SortError, | ||||||
| }; | }; | ||||||
| use once_cell::sync::OnceCell; | use once_cell::sync::OnceCell; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use serde_json::{Map, Value}; | use serde_json::Value; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
| use tokio::fs::File as TFile; | use tokio::fs::File as TFile; | ||||||
| use tokio::io::AsyncWriteExt; | use tokio::io::AsyncWriteExt; | ||||||
| @@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn highlight_record( |     fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet<String>) { | ||||||
|         &self, |  | ||||||
|         object: &mut Map<String, Value>, |  | ||||||
|         attributes_to_highlight: &HashSet<String>, |  | ||||||
|     ) { |  | ||||||
|         // TODO do we need to create a string for element that are not and needs to be highlight? |         // TODO do we need to create a string for element that are not and needs to be highlight? | ||||||
|         for (key, value) in object.iter_mut() { |         for (key, value) in object.iter_mut() { | ||||||
|             if attributes_to_highlight.contains(key) { |             if attributes_to_highlight.contains(key) { | ||||||
| @@ -378,7 +374,7 @@ async fn main() -> anyhow::Result<()> { | |||||||
|                         }); |                         }); | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|                     let mut builder = milli::update::IndexDocuments::new( |                     let builder = milli::update::IndexDocuments::new( | ||||||
|                         &mut wtxn, |                         &mut wtxn, | ||||||
|                         &index_cloned, |                         &index_cloned, | ||||||
|                         GLOBAL_CONFIG.get().unwrap(), |                         GLOBAL_CONFIG.get().unwrap(), | ||||||
| @@ -399,10 +395,10 @@ async fn main() -> anyhow::Result<()> { | |||||||
|                         otherwise => panic!("invalid update format {:?}", otherwise), |                         otherwise => panic!("invalid update format {:?}", otherwise), | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|                     let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; |                     let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; | ||||||
|  |  | ||||||
|                     builder.add_documents(documents)?; |  | ||||||
|  |  | ||||||
|  |                     let (builder, user_error) = builder.add_documents(documents)?; | ||||||
|  |                     let _count = user_error?; | ||||||
|                     let result = builder.execute(); |                     let result = builder.execute(); | ||||||
|  |  | ||||||
|                     match result { |                     match result { | ||||||
| @@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> { | |||||||
|     #[derive(Debug, Serialize)] |     #[derive(Debug, Serialize)] | ||||||
|     #[serde(rename_all = "camelCase")] |     #[serde(rename_all = "camelCase")] | ||||||
|     struct Answer { |     struct Answer { | ||||||
|         documents: Vec<Map<String, Value>>, |         documents: Vec<Object>, | ||||||
|         number_of_candidates: u64, |         number_of_candidates: u64, | ||||||
|         facets: BTreeMap<String, BTreeMap<String, u64>>, |         facets: BTreeMap<String, BTreeMap<String, u64>>, | ||||||
|     } |     } | ||||||
| @@ -1032,35 +1028,33 @@ async fn main() -> anyhow::Result<()> { | |||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |     let reader = BufReader::new(reader); | ||||||
|  |  | ||||||
|     for result in BufReader::new(reader).lines() { |     for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() { | ||||||
|         let line = result?; |         let object = result?; | ||||||
|         documents.extend_from_json(Cursor::new(line))?; |         documents.append_json_object(&object)?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     documents.finish()?; |     documents.into_inner().map_err(Into::into) | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |  | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; |  | ||||||
|  |  | ||||||
|     documents.extend_from_json(reader)?; |     documents.append_json_array(reader)?; | ||||||
|     documents.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> { | fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> { | ||||||
|     let mut writer = Cursor::new(Vec::new()); |     let csv = csv::Reader::from_reader(reader); | ||||||
|     milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; |  | ||||||
|  |  | ||||||
|     Ok(writer.into_inner()) |     let mut documents = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |     documents.append_csv(csv)?; | ||||||
|  |  | ||||||
|  |     documents.into_inner().map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
|   | |||||||
| @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } | |||||||
| fst = "0.4.7" | fst = "0.4.7" | ||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
| geoutils = "0.4.1" | geoutils = "0.4.1" | ||||||
| grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } | grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
| json-depth-checker = { path = "../json-depth-checker" } | json-depth-checker = { path = "../json-depth-checker" } | ||||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								milli/fuzz/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								milli/fuzz/.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,2 +1,5 @@ | |||||||
|  | Cargo.lock | ||||||
|  | target/ | ||||||
|  |  | ||||||
| /corpus/ | /corpus/ | ||||||
| /artifacts/ | /artifacts/ | ||||||
|   | |||||||
| @@ -7,10 +7,10 @@ use anyhow::{bail, Result}; | |||||||
| use arbitrary_json::ArbitraryValue; | use arbitrary_json::ArbitraryValue; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use libfuzzer_sys::fuzz_target; | use libfuzzer_sys::fuzz_target; | ||||||
| use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | ||||||
| use milli::Index; | use milli::Index; | ||||||
| use serde_json::Value; | use serde_json::{Map, Value}; | ||||||
|  |  | ||||||
| #[cfg(target_os = "linux")] | #[cfg(target_os = "linux")] | ||||||
| #[global_allocator] | #[global_allocator] | ||||||
| @@ -19,21 +19,26 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; | |||||||
| /// reads json from input and write an obkv batch to writer. | /// reads json from input and write an obkv batch to writer. | ||||||
| pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> { | pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> { | ||||||
|     let writer = BufWriter::new(writer); |     let writer = BufWriter::new(writer); | ||||||
|     let mut builder = DocumentBatchBuilder::new(writer)?; |     let mut builder = DocumentsBatchBuilder::new(writer); | ||||||
|     builder.extend_from_json(input)?; |  | ||||||
|  |  | ||||||
|     if builder.len() == 0 { |     let values: Vec<Object> = serde_json::from_reader(input)?; | ||||||
|  |     if builder.documents_count() == 0 { | ||||||
|         bail!("Empty payload"); |         bail!("Empty payload"); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let count = builder.finish()?; |     for object in values { | ||||||
|  |         builder.append_json_object(&object)?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     Ok(count) |     let count = builder.documents_count(); | ||||||
|  |     let vector = builder.into_inner()?; | ||||||
|  |  | ||||||
|  |     Ok(count as usize) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn index_documents( | fn index_documents( | ||||||
|     index: &mut milli::Index, |     index: &mut milli::Index, | ||||||
|     documents: DocumentBatchReader<Cursor<Vec<u8>>>, |     documents: DocumentsBatchReader<Cursor<Vec<u8>>>, | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let config = IndexerConfig::default(); |     let config = IndexerConfig::default(); | ||||||
|     let mut wtxn = index.write_txn()?; |     let mut wtxn = index.write_txn()?; | ||||||
| @@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| { | |||||||
|             // We ignore all malformed documents |             // We ignore all malformed documents | ||||||
|             if let Ok(_) = read_json(json.as_bytes(), &mut documents) { |             if let Ok(_) = read_json(json.as_bytes(), &mut documents) { | ||||||
|                 documents.rewind().unwrap(); |                 documents.rewind().unwrap(); | ||||||
|                 let documents = DocumentBatchReader::from_reader(documents).unwrap(); |                 let documents = DocumentsBatchReader::from_reader(documents).unwrap(); | ||||||
|                 // A lot of errors can come out of milli and we don't know which ones are normal or not |                 // A lot of errors can come out of milli and we don't know which ones are normal or not | ||||||
|                 // so we are only going to look for the unexpected panics. |                 // so we are only going to look for the unexpected panics. | ||||||
|                 let _ = index_documents(&mut index, documents); |                 let _ = index_documents(&mut index, documents); | ||||||
|   | |||||||
| @@ -1,157 +1,170 @@ | |||||||
| use std::collections::BTreeMap; | use std::io::{self, Write}; | ||||||
| use std::io; |  | ||||||
| use std::io::{Cursor, Write}; |  | ||||||
|  |  | ||||||
| use byteorder::{BigEndian, WriteBytesExt}; | use grenad::{CompressionType, WriterBuilder}; | ||||||
| use serde::Deserializer; | use serde::de::Deserializer; | ||||||
| use serde_json::Value; | use serde_json::{to_writer, Value}; | ||||||
|  |  | ||||||
| use super::serde_impl::DocumentVisitor; | use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; | ||||||
| use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; | use crate::documents::serde_impl::DocumentVisitor; | ||||||
| use crate::FieldId; | use crate::Object; | ||||||
|  |  | ||||||
| /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary | /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary | ||||||
| /// format used by milli. | /// format used by milli. | ||||||
| /// | /// | ||||||
| /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to | /// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader` | ||||||
| /// iterate over the documents. | /// to iterate over the documents. | ||||||
| /// | /// | ||||||
| /// ## example: | /// ## example: | ||||||
| /// ``` | /// ``` | ||||||
| /// use milli::documents::DocumentBatchBuilder; |  | ||||||
| /// use serde_json::json; | /// use serde_json::json; | ||||||
| /// use std::io::Cursor; | /// use milli::documents::DocumentsBatchBuilder; | ||||||
| /// | /// | ||||||
| /// let json = r##"{"id": 1, "name": "foo"}"##; | /// let json = json!({ "id": 1, "name": "foo" }); | ||||||
| /// let mut writer = Cursor::new(Vec::new()); | /// | ||||||
| /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); | /// let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
| /// builder.extend_from_json(&mut json.as_bytes()).unwrap(); | /// builder.append_json_object(json.as_object().unwrap()).unwrap(); | ||||||
| /// builder.finish().unwrap(); | /// let _vector = builder.into_inner().unwrap(); | ||||||
| /// ``` | /// ``` | ||||||
| pub struct DocumentBatchBuilder<W> { | pub struct DocumentsBatchBuilder<W> { | ||||||
|     inner: ByteCounter<W>, |     /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`. | ||||||
|     index: DocumentsBatchIndex, |     writer: grenad::Writer<W>, | ||||||
|  |     /// A map that creates the relation between field ids and field names. | ||||||
|  |     fields_index: DocumentsBatchIndex, | ||||||
|  |     /// The number of documents that were added to this builder, | ||||||
|  |     /// it doesn't take the primary key of the documents into account at this point. | ||||||
|  |     documents_count: u32, | ||||||
|  |  | ||||||
|  |     /// A buffer to store a temporary obkv buffer and avoid reallocating. | ||||||
|     obkv_buffer: Vec<u8>, |     obkv_buffer: Vec<u8>, | ||||||
|  |     /// A buffer to serialize the values and avoid reallocating, | ||||||
|  |     /// serialized values are stored in an obkv. | ||||||
|     value_buffer: Vec<u8>, |     value_buffer: Vec<u8>, | ||||||
|     values: BTreeMap<FieldId, Value>, |  | ||||||
|     count: usize, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> { | impl<W: Write> DocumentsBatchBuilder<W> { | ||||||
|     pub fn new(writer: W) -> Result<Self, Error> { |     pub fn new(writer: W) -> DocumentsBatchBuilder<W> { | ||||||
|         let index = DocumentsBatchIndex::default(); |         DocumentsBatchBuilder { | ||||||
|         let mut writer = ByteCounter::new(writer); |             writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer), | ||||||
|         // add space to write the offset of the metadata at the end of the writer |             fields_index: DocumentsBatchIndex::default(), | ||||||
|         writer.write_u64::<BigEndian>(0)?; |             documents_count: 0, | ||||||
|  |  | ||||||
|         Ok(Self { |  | ||||||
|             inner: writer, |  | ||||||
|             index, |  | ||||||
|             obkv_buffer: Vec::new(), |             obkv_buffer: Vec::new(), | ||||||
|             value_buffer: Vec::new(), |             value_buffer: Vec::new(), | ||||||
|             values: BTreeMap::new(), |         } | ||||||
|             count: 0, |  | ||||||
|         }) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the number of documents that have been written to the builder. |     /// Returns the number of documents inserted into this builder. | ||||||
|     pub fn len(&self) -> usize { |     pub fn documents_count(&self) -> u32 { | ||||||
|         self.count |         self.documents_count | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// This method must be called after the document addition is terminated. It will put the |     /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. | ||||||
|     /// metadata at the end of the file, and write the metadata offset at the beginning on the |     pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { | ||||||
|     /// file. |         // Make sure that we insert the fields ids in order as the obkv writer has this requirement. | ||||||
|     pub fn finish(self) -> Result<usize, Error> { |         let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); | ||||||
|         let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; |         fields_ids.sort_unstable(); | ||||||
|  |  | ||||||
|         let meta = DocumentsMetadata { count, index }; |         self.obkv_buffer.clear(); | ||||||
|  |         let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); | ||||||
|         bincode::serialize_into(&mut writer, &meta)?; |         for field_id in fields_ids { | ||||||
|  |             let key = self.fields_index.name(field_id).unwrap(); | ||||||
|         writer.seek(io::SeekFrom::Start(0))?; |             self.value_buffer.clear(); | ||||||
|         writer.write_u64::<BigEndian>(offset as u64)?; |             to_writer(&mut self.value_buffer, &object[key])?; | ||||||
|  |             writer.insert(field_id, &self.value_buffer)?; | ||||||
|         writer.flush()?; |  | ||||||
|  |  | ||||||
|         Ok(count) |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     /// Extends the builder with json documents from a reader. |         let internal_id = self.documents_count.to_be_bytes(); | ||||||
|     pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> { |         let document_bytes = writer.into_inner()?; | ||||||
|  |         self.writer.insert(internal_id, &document_bytes)?; | ||||||
|  |         self.documents_count += 1; | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly. | ||||||
|  |     pub fn append_json_array<R: io::Read>(&mut self, reader: R) -> Result<(), Error> { | ||||||
|         let mut de = serde_json::Deserializer::from_reader(reader); |         let mut de = serde_json::Deserializer::from_reader(reader); | ||||||
|  |         let mut visitor = DocumentVisitor::new(self); | ||||||
|         let mut visitor = DocumentVisitor { |         de.deserialize_any(&mut visitor)? | ||||||
|             inner: &mut self.inner, |  | ||||||
|             index: &mut self.index, |  | ||||||
|             obkv_buffer: &mut self.obkv_buffer, |  | ||||||
|             value_buffer: &mut self.value_buffer, |  | ||||||
|             values: &mut self.values, |  | ||||||
|             count: &mut self.count, |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         de.deserialize_any(&mut visitor).map_err(Error::JsonError)? |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Creates a builder from a reader of CSV documents. |     /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. | ||||||
|     /// |     pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> { | ||||||
|     /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform |         // Make sure that we insert the fields ids in order as the obkv writer has this requirement. | ||||||
|     /// optimisations, and extending from another CSV is not allowed. |         let mut typed_fields_ids: Vec<_> = reader | ||||||
|     pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> { |  | ||||||
|         let mut this = Self::new(writer)?; |  | ||||||
|         // Ensure that this is the first and only addition made with this builder |  | ||||||
|         debug_assert!(this.index.is_empty()); |  | ||||||
|  |  | ||||||
|         let mut records = csv::Reader::from_reader(reader); |  | ||||||
|  |  | ||||||
|         let headers = records |  | ||||||
|             .headers()? |             .headers()? | ||||||
|             .into_iter() |             .into_iter() | ||||||
|             .map(parse_csv_header) |             .map(parse_csv_header) | ||||||
|             .map(|(k, t)| (this.index.insert(&k), t)) |             .map(|(k, t)| (self.fields_index.insert(k), t)) | ||||||
|             .collect::<BTreeMap<_, _>>(); |             .enumerate() | ||||||
|  |             .collect(); | ||||||
|  |         // Make sure that we insert the fields ids in order as the obkv writer has this requirement. | ||||||
|  |         typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); | ||||||
|  |  | ||||||
|         for (i, record) in records.into_records().enumerate() { |         let mut record = csv::StringRecord::new(); | ||||||
|             let record = record?; |         let mut line = 0; | ||||||
|             this.obkv_buffer.clear(); |         while reader.read_record(&mut record)? { | ||||||
|             let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); |             // We increment here and not at the end of the while loop to take | ||||||
|             for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { |             // the header offset into account. | ||||||
|                 let value = match ty { |             line += 1; | ||||||
|  |  | ||||||
|  |             self.obkv_buffer.clear(); | ||||||
|  |             let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); | ||||||
|  |  | ||||||
|  |             for (i, (field_id, type_)) in typed_fields_ids.iter() { | ||||||
|  |                 self.value_buffer.clear(); | ||||||
|  |  | ||||||
|  |                 let value = &record[*i]; | ||||||
|  |                 match type_ { | ||||||
|                     AllowedType::Number => { |                     AllowedType::Number => { | ||||||
|                         if value.trim().is_empty() { |                         if value.trim().is_empty() { | ||||||
|                             Value::Null |                             to_writer(&mut self.value_buffer, &Value::Null)?; | ||||||
|                         } else { |                         } else { | ||||||
|                             value.trim().parse::<f64>().map(Value::from).map_err(|error| { |                             match value.trim().parse::<f64>() { | ||||||
|                                 Error::ParseFloat { |                                 Ok(float) => { | ||||||
|                                     error, |                                     to_writer(&mut self.value_buffer, &float)?; | ||||||
|                                     // +1 for the header offset. |                                 } | ||||||
|                                     line: i + 1, |                                 Err(error) => { | ||||||
|                                     value: value.to_string(), |                                     return Err(Error::ParseFloat { | ||||||
|  |                                         error, | ||||||
|  |                                         line, | ||||||
|  |                                         value: value.to_string(), | ||||||
|  |                                     }); | ||||||
|  |                                 } | ||||||
|                             } |                             } | ||||||
|                             })? |  | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                     AllowedType::String => { |                     AllowedType::String => { | ||||||
|                         if value.is_empty() { |                         if value.is_empty() { | ||||||
|                             Value::Null |                             to_writer(&mut self.value_buffer, &Value::Null)?; | ||||||
|                         } else { |                         } else { | ||||||
|                             Value::String(value.to_string()) |                             to_writer(&mut self.value_buffer, value)?; | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 }; |  | ||||||
|  |  | ||||||
|                 this.value_buffer.clear(); |  | ||||||
|                 serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; |  | ||||||
|                 writer.insert(*fid, &this.value_buffer)?; |  | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|             this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?; |                 // We insert into the obkv writer the value buffer that has been filled just above. | ||||||
|             this.inner.write_all(&this.obkv_buffer)?; |                 writer.insert(*field_id, &self.value_buffer)?; | ||||||
|  |  | ||||||
|             this.count += 1; |  | ||||||
|             } |             } | ||||||
|  |  | ||||||
|         Ok(this) |             let internal_id = self.documents_count.to_be_bytes(); | ||||||
|  |             let document_bytes = writer.into_inner()?; | ||||||
|  |             self.writer.insert(internal_id, &document_bytes)?; | ||||||
|  |             self.documents_count += 1; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`. | ||||||
|  |     pub fn into_inner(mut self) -> io::Result<W> { | ||||||
|  |         let DocumentsBatchBuilder { mut writer, fields_index, .. } = self; | ||||||
|  |  | ||||||
|  |         // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer. | ||||||
|  |         self.value_buffer.clear(); | ||||||
|  |         to_writer(&mut self.value_buffer, &fields_index)?; | ||||||
|  |         writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?; | ||||||
|  |  | ||||||
|  |         writer.into_inner() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -161,16 +174,16 @@ enum AllowedType { | |||||||
|     Number, |     Number, | ||||||
| } | } | ||||||
|  |  | ||||||
| fn parse_csv_header(header: &str) -> (String, AllowedType) { | fn parse_csv_header(header: &str) -> (&str, AllowedType) { | ||||||
|     // if there are several separators we only split on the last one. |     // if there are several separators we only split on the last one. | ||||||
|     match header.rsplit_once(':') { |     match header.rsplit_once(':') { | ||||||
|         Some((field_name, field_type)) => match field_type { |         Some((field_name, field_type)) => match field_type { | ||||||
|             "string" => (field_name.to_string(), AllowedType::String), |             "string" => (field_name, AllowedType::String), | ||||||
|             "number" => (field_name.to_string(), AllowedType::Number), |             "number" => (field_name, AllowedType::Number), | ||||||
|             // if the pattern isn't reconized, we keep the whole field. |             // if the pattern isn't reconized, we keep the whole field. | ||||||
|             _otherwise => (header.to_string(), AllowedType::String), |             _otherwise => (header, AllowedType::String), | ||||||
|         }, |         }, | ||||||
|         None => (header.to_string(), AllowedType::String), |         None => (header, AllowedType::String), | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -178,35 +191,20 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { | |||||||
| mod test { | mod test { | ||||||
|     use std::io::Cursor; |     use std::io::Cursor; | ||||||
|  |  | ||||||
|     use serde_json::{json, Map}; |     use serde_json::json; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::documents::DocumentBatchReader; |     use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||||
|  |  | ||||||
|     fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value { |  | ||||||
|         let mut map = Map::new(); |  | ||||||
|  |  | ||||||
|         for (fid, value) in obkv.iter() { |  | ||||||
|             let field_name = index.name(fid).unwrap().clone(); |  | ||||||
|             let value: Value = serde_json::from_slice(value).unwrap(); |  | ||||||
|  |  | ||||||
|             map.insert(field_name, value); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Value::Object(map) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn add_single_documents_json() { |     fn add_single_documents_json() { | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |  | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let json = serde_json::json!({ |         let json = serde_json::json!({ | ||||||
|             "id": 1, |             "id": 1, | ||||||
|             "field": "hello!", |             "field": "hello!", | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_json_object(json.as_object().unwrap()).unwrap(); | ||||||
|  |  | ||||||
|         let json = serde_json::json!({ |         let json = serde_json::json!({ | ||||||
|             "blabla": false, |             "blabla": false, | ||||||
| @@ -214,100 +212,64 @@ mod test { | |||||||
|             "id": 1, |             "id": 1, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); |         builder.append_json_object(json.as_object().unwrap()).unwrap(); | ||||||
|  |  | ||||||
|         assert_eq!(builder.len(), 2); |         assert_eq!(builder.documents_count(), 2); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|         builder.finish().unwrap(); |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|  |             .unwrap() | ||||||
|         cursor.set_position(0); |             .into_cursor_and_fields_index(); | ||||||
|  |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); |  | ||||||
|         assert_eq!(index.len(), 3); |         assert_eq!(index.len(), 3); | ||||||
|  |  | ||||||
|  |         let document = cursor.next_document().unwrap().unwrap(); | ||||||
|         assert_eq!(document.iter().count(), 2); |         assert_eq!(document.iter().count(), 2); | ||||||
|  |  | ||||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); |         let document = cursor.next_document().unwrap().unwrap(); | ||||||
|         assert_eq!(index.len(), 3); |  | ||||||
|         assert_eq!(document.iter().count(), 3); |         assert_eq!(document.iter().count(), 3); | ||||||
|  |  | ||||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); |         assert!(cursor.next_document().unwrap().is_none()); | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn add_documents_seq_json() { |  | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |  | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let json = serde_json::json!([{ |  | ||||||
|             "id": 1, |  | ||||||
|             "field": "hello!", |  | ||||||
|         },{ |  | ||||||
|             "blabla": false, |  | ||||||
|             "field": "hello!", |  | ||||||
|             "id": 1, |  | ||||||
|         } |  | ||||||
|         ]); |  | ||||||
|  |  | ||||||
|         builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(builder.len(), 2); |  | ||||||
|  |  | ||||||
|         builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|         cursor.set_position(0); |  | ||||||
|  |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); |  | ||||||
|         assert_eq!(index.len(), 3); |  | ||||||
|         assert_eq!(document.iter().count(), 2); |  | ||||||
|  |  | ||||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); |  | ||||||
|         assert_eq!(index.len(), 3); |  | ||||||
|         assert_eq!(document.iter().count(), 3); |  | ||||||
|  |  | ||||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn add_documents_csv() { |     fn add_documents_csv() { | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |         let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let csv = "id:number,field:string\n1,hello!\n2,blabla"; |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_csv(csv).unwrap(); | ||||||
|  |         assert_eq!(builder.documents_count(), 2); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|         let builder = |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); |             .unwrap() | ||||||
|         builder.finish().unwrap(); |             .into_cursor_and_fields_index(); | ||||||
|  |  | ||||||
|         cursor.set_position(0); |  | ||||||
|  |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let (index, document) = reader.next_document_with_index().unwrap().unwrap(); |  | ||||||
|         assert_eq!(index.len(), 2); |         assert_eq!(index.len(), 2); | ||||||
|  |  | ||||||
|  |         let document = cursor.next_document().unwrap().unwrap(); | ||||||
|         assert_eq!(document.iter().count(), 2); |         assert_eq!(document.iter().count(), 2); | ||||||
|  |  | ||||||
|         let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); |         let document = cursor.next_document().unwrap().unwrap(); | ||||||
|         assert_eq!(document.iter().count(), 2); |         assert_eq!(document.iter().count(), 2); | ||||||
|  |  | ||||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); |         assert!(cursor.next_document().unwrap().is_none()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple_csv_document() { |     fn simple_csv_document() { | ||||||
|         let documents = r#"city,country,pop |         let csv_content = r#"city,country,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |  | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -318,22 +280,25 @@ mod test { | |||||||
|             }) |             }) | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         assert!(reader.next_document_with_index().unwrap().is_none()); |         assert!(cursor.next_document().unwrap().is_none()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn coma_in_field() { |     fn coma_in_field() { | ||||||
|         let documents = r#"city,country,pop |         let csv_content = r#"city,country,pop | ||||||
| "Boston","United, States","4628910""#; | "Boston","United, States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -347,17 +312,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn quote_in_field() { |     fn quote_in_field() { | ||||||
|         let documents = r#"city,country,pop |         let csv_content = r#"city,country,pop | ||||||
| "Boston","United"" States","4628910""#; | "Boston","United"" States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -371,17 +339,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn integer_in_field() { |     fn integer_in_field() { | ||||||
|         let documents = r#"city,country,pop:number |         let csv_content = r#"city,country,pop:number | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -395,17 +366,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn float_in_field() { |     fn float_in_field() { | ||||||
|         let documents = r#"city,country,pop:number |         let csv_content = r#"city,country,pop:number | ||||||
| "Boston","United States","4628910.01""#; | "Boston","United States","4628910.01""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -419,17 +393,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn several_colon_in_header() { |     fn several_colon_in_header() { | ||||||
|         let documents = r#"city:love:string,country:state,pop |         let csv_content = r#"city:love:string,country:state,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -443,17 +420,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn ending_by_colon_in_header() { |     fn ending_by_colon_in_header() { | ||||||
|         let documents = r#"city:,country,pop |         let csv_content = r#"city:,country,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -467,17 +447,20 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn starting_by_colon_in_header() { |     fn starting_by_colon_in_header() { | ||||||
|         let documents = r#":city,country,pop |         let csv_content = r#":city,country,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -492,32 +475,37 @@ mod test { | |||||||
|     #[ignore] |     #[ignore] | ||||||
|     #[test] |     #[test] | ||||||
|     fn starting_by_colon_in_header2() { |     fn starting_by_colon_in_header2() { | ||||||
|         let documents = r#":string,country,pop |         let csv_content = r#":string,country,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(reader.next_document_with_index().is_err()); |         assert!(cursor.next_document().is_err()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn double_colon_in_header() { |     fn double_colon_in_header() { | ||||||
|         let documents = r#"city::string,country,pop |         let csv_content = r#"city::string,country,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) |         builder.append_csv(csv).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .finish() |             .into_cursor_and_fields_index(); | ||||||
|             .unwrap(); |  | ||||||
|         let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); |         let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); | ||||||
|         let val = obkv_to_value(&doc, index); |  | ||||||
|  |  | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             val, |             val, | ||||||
| @@ -531,34 +519,32 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn bad_type_in_header() { |     fn bad_type_in_header() { | ||||||
|         let documents = r#"city,country:number,pop |         let csv_content = r#"city,country:number,pop | ||||||
| "Boston","United States","4628910""#; | "Boston","United States","4628910""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         assert!( |         assert!(builder.append_csv(csv).is_err()); | ||||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() |  | ||||||
|         ); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn bad_column_count1() { |     fn bad_column_count1() { | ||||||
|         let documents = r#"city,country,pop |         let csv_content = r#"city,country,pop | ||||||
| "Boston","United States","4628910", "too much""#; | "Boston","United States","4628910", "too much | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content"#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         assert!( |         assert!(builder.append_csv(csv).is_err()); | ||||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() |  | ||||||
|         ); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn bad_column_count2() { |     fn bad_column_count2() { | ||||||
|         let documents = r#"city,country,pop |         let csv_content = r#"city,country,pop | ||||||
| "Boston","United States""#; | "Boston","United States""#; | ||||||
|  |         let csv = csv::Reader::from_reader(Cursor::new(csv_content)); | ||||||
|  |  | ||||||
|         let mut buf = Vec::new(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         assert!( |         assert!(builder.append_csv(csv).is_err()); | ||||||
|             DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() |  | ||||||
|         ); |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										109
									
								
								milli/src/documents/enriched.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								milli/src/documents/enriched.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,109 @@ | |||||||
|  | use std::fs::File; | ||||||
|  | use std::{io, str}; | ||||||
|  |  | ||||||
|  | use obkv::KvReader; | ||||||
|  |  | ||||||
|  | use super::{ | ||||||
|  |     DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, | ||||||
|  |     Error, | ||||||
|  | }; | ||||||
|  | use crate::update::DocumentId; | ||||||
|  | use crate::FieldId; | ||||||
|  |  | ||||||
|  | /// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have | ||||||
|  | /// been created with a `DocumentsBatchWriter` and, for the enriched data, | ||||||
|  | /// a simple `grenad::Reader<File>`. | ||||||
|  | /// | ||||||
|  | /// The documents are returned in the form of `obkv::Reader` where each field is identified with a | ||||||
|  | /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. | ||||||
|  | pub struct EnrichedDocumentsBatchReader<R> { | ||||||
|  |     documents: DocumentsBatchReader<R>, | ||||||
|  |     primary_key: String, | ||||||
|  |     external_ids: grenad::ReaderCursor<File>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> { | ||||||
|  |     pub fn new( | ||||||
|  |         documents: DocumentsBatchReader<R>, | ||||||
|  |         primary_key: String, | ||||||
|  |         external_ids: grenad::Reader<File>, | ||||||
|  |     ) -> Result<Self, Error> { | ||||||
|  |         if documents.documents_count() as u64 == external_ids.len() { | ||||||
|  |             Ok(EnrichedDocumentsBatchReader { | ||||||
|  |                 documents, | ||||||
|  |                 primary_key, | ||||||
|  |                 external_ids: external_ids.into_cursor()?, | ||||||
|  |             }) | ||||||
|  |         } else { | ||||||
|  |             Err(Error::InvalidEnrichedData) | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn documents_count(&self) -> u32 { | ||||||
|  |         self.documents.documents_count() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn primary_key(&self) -> &str { | ||||||
|  |         &self.primary_key | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn is_empty(&self) -> bool { | ||||||
|  |         self.documents.is_empty() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { | ||||||
|  |         self.documents.documents_batch_index() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// This method returns a forward cursor over the enriched documents. | ||||||
|  |     pub fn into_cursor_and_fields_index( | ||||||
|  |         self, | ||||||
|  |     ) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) { | ||||||
|  |         let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; | ||||||
|  |         let (documents, fields_index) = documents.into_cursor_and_fields_index(); | ||||||
|  |         external_ids.reset(); | ||||||
|  |         (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone)] | ||||||
|  | pub struct EnrichedDocument<'a> { | ||||||
|  |     pub document: KvReader<'a, FieldId>, | ||||||
|  |     pub document_id: DocumentId, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub struct EnrichedDocumentsBatchCursor<R> { | ||||||
|  |     documents: DocumentsBatchCursor<R>, | ||||||
|  |     primary_key: String, | ||||||
|  |     external_ids: grenad::ReaderCursor<File>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<R> EnrichedDocumentsBatchCursor<R> { | ||||||
|  |     pub fn primary_key(&self) -> &str { | ||||||
|  |         &self.primary_key | ||||||
|  |     } | ||||||
|  |     /// Resets the cursor to be able to read from the start again. | ||||||
|  |     pub fn reset(&mut self) { | ||||||
|  |         self.documents.reset(); | ||||||
|  |         self.external_ids.reset(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> { | ||||||
|  |     /// Returns the next document, starting from the first one. Subsequent calls to | ||||||
|  |     /// `next_document` advance the document reader until all the documents have been read. | ||||||
|  |     pub fn next_enriched_document( | ||||||
|  |         &mut self, | ||||||
|  |     ) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> { | ||||||
|  |         let document = self.documents.next_document()?; | ||||||
|  |         let document_id = match self.external_ids.move_on_next()? { | ||||||
|  |             Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?, | ||||||
|  |             None => None, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         match document.zip(document_id) { | ||||||
|  |             Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })), | ||||||
|  |             None => Ok(None), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,24 +1,41 @@ | |||||||
| mod builder; | mod builder; | ||||||
| /// The documents module defines an intermediary document format that milli uses for indexation, and | mod enriched; | ||||||
| /// provides an API to easily build and read such documents. |  | ||||||
| /// |  | ||||||
| /// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can |  | ||||||
| /// later be read by milli using the `DocumentBatchReader` interface. |  | ||||||
| mod reader; | mod reader; | ||||||
| mod serde_impl; | mod serde_impl; | ||||||
|  |  | ||||||
| use std::fmt::{self, Debug}; | use std::fmt::{self, Debug}; | ||||||
| use std::io; | use std::io; | ||||||
|  | use std::str::Utf8Error; | ||||||
|  |  | ||||||
| use bimap::BiHashMap; | use bimap::BiHashMap; | ||||||
| pub use builder::DocumentBatchBuilder; | pub use builder::DocumentsBatchBuilder; | ||||||
| pub use reader::DocumentBatchReader; | pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; | ||||||
|  | use obkv::KvReader; | ||||||
|  | pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| use crate::FieldId; | use crate::error::{FieldIdMapMissingEntry, InternalError}; | ||||||
|  | use crate::{FieldId, Object, Result}; | ||||||
|  |  | ||||||
|  | /// The key that is used to store the `DocumentsBatchIndex` datastructure, | ||||||
|  | /// it is the absolute last key of the list. | ||||||
|  | const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); | ||||||
|  |  | ||||||
|  | /// Helper function to convert an obkv reader into a JSON object. | ||||||
|  | pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> { | ||||||
|  |     obkv.iter() | ||||||
|  |         .map(|(field_id, value)| { | ||||||
|  |             let field_name = index.name(field_id).ok_or_else(|| { | ||||||
|  |                 FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } | ||||||
|  |             })?; | ||||||
|  |             let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; | ||||||
|  |             Ok((field_name.to_string(), value)) | ||||||
|  |         }) | ||||||
|  |         .collect() | ||||||
|  | } | ||||||
|  |  | ||||||
| /// A bidirectional map that links field ids to their name in a document batch. | /// A bidirectional map that links field ids to their name in a document batch. | ||||||
| #[derive(Default, Debug, Serialize, Deserialize)] | #[derive(Default, Clone, Debug, Serialize, Deserialize)] | ||||||
| pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>); | pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>); | ||||||
|  |  | ||||||
| impl DocumentsBatchIndex { | impl DocumentsBatchIndex { | ||||||
| @@ -46,15 +63,16 @@ impl DocumentsBatchIndex { | |||||||
|         self.0.iter() |         self.0.iter() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn name(&self, id: FieldId) -> Option<&String> { |     pub fn name(&self, id: FieldId) -> Option<&str> { | ||||||
|         self.0.get_by_left(&id) |         self.0.get_by_left(&id).map(AsRef::as_ref) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn recreate_json( |     pub fn id(&self, name: &str) -> Option<FieldId> { | ||||||
|         &self, |         self.0.get_by_right(name).cloned() | ||||||
|         document: &obkv::KvReaderU16, |     } | ||||||
|     ) -> Result<serde_json::Map<String, serde_json::Value>, crate::Error> { |  | ||||||
|         let mut map = serde_json::Map::new(); |     pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> { | ||||||
|  |         let mut map = Object::new(); | ||||||
|  |  | ||||||
|         for (k, v) in document.iter() { |         for (k, v) in document.iter() { | ||||||
|             // TODO: TAMO: update the error type |             // TODO: TAMO: update the error type | ||||||
| @@ -69,50 +87,22 @@ impl DocumentsBatchIndex { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Serialize, Deserialize)] |  | ||||||
| struct DocumentsMetadata { |  | ||||||
|     count: usize, |  | ||||||
|     index: DocumentsBatchIndex, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct ByteCounter<W> { |  | ||||||
|     count: usize, |  | ||||||
|     writer: W, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<W> ByteCounter<W> { |  | ||||||
|     fn new(writer: W) -> Self { |  | ||||||
|         Self { count: 0, writer } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<W: io::Write> io::Write for ByteCounter<W> { |  | ||||||
|     fn write(&mut self, buf: &[u8]) -> io::Result<usize> { |  | ||||||
|         let count = self.writer.write(buf)?; |  | ||||||
|         self.count += count; |  | ||||||
|         Ok(count) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn flush(&mut self) -> io::Result<()> { |  | ||||||
|         self.writer.flush() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub enum Error { | pub enum Error { | ||||||
|     ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, |     ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, | ||||||
|     InvalidDocumentFormat, |     InvalidDocumentFormat, | ||||||
|     Custom(String), |     InvalidEnrichedData, | ||||||
|     JsonError(serde_json::Error), |     InvalidUtf8(Utf8Error), | ||||||
|     CsvError(csv::Error), |     Csv(csv::Error), | ||||||
|     Serialize(bincode::Error), |     Json(serde_json::Error), | ||||||
|  |     Serialize(serde_json::Error), | ||||||
|  |     Grenad(grenad::Error), | ||||||
|     Io(io::Error), |     Io(io::Error), | ||||||
|     DocumentTooLarge, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl From<csv::Error> for Error { | impl From<csv::Error> for Error { | ||||||
|     fn from(e: csv::Error) -> Self { |     fn from(e: csv::Error) -> Self { | ||||||
|         Self::CsvError(e) |         Self::Csv(e) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -122,15 +112,21 @@ impl From<io::Error> for Error { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl From<bincode::Error> for Error { | impl From<serde_json::Error> for Error { | ||||||
|     fn from(other: bincode::Error) -> Self { |     fn from(other: serde_json::Error) -> Self { | ||||||
|         Self::Serialize(other) |         Self::Json(other) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl From<serde_json::Error> for Error { | impl From<grenad::Error> for Error { | ||||||
|     fn from(other: serde_json::Error) -> Self { |     fn from(other: grenad::Error) -> Self { | ||||||
|         Self::JsonError(other) |         Self::Grenad(other) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl From<Utf8Error> for Error { | ||||||
|  |     fn from(other: Utf8Error) -> Self { | ||||||
|  |         Self::InvalidUtf8(other) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -140,13 +136,16 @@ impl fmt::Display for Error { | |||||||
|             Error::ParseFloat { error, line, value } => { |             Error::ParseFloat { error, line, value } => { | ||||||
|                 write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) |                 write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) | ||||||
|             } |             } | ||||||
|             Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), |             Error::InvalidDocumentFormat => { | ||||||
|             Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), |                 f.write_str("Invalid document addition format, missing the documents batch index.") | ||||||
|             Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), |             } | ||||||
|  |             Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), | ||||||
|  |             Error::InvalidUtf8(e) => write!(f, "{}", e), | ||||||
|             Error::Io(e) => write!(f, "{}", e), |             Error::Io(e) => write!(f, "{}", e), | ||||||
|             Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), |  | ||||||
|             Error::Serialize(e) => write!(f, "{}", e), |             Error::Serialize(e) => write!(f, "{}", e), | ||||||
|             Error::CsvError(e) => write!(f, "{}", e), |             Error::Grenad(e) => write!(f, "{}", e), | ||||||
|  |             Error::Csv(e) => write!(f, "{}", e), | ||||||
|  |             Error::Json(e) => write!(f, "{}", e), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -158,15 +157,25 @@ impl std::error::Error for Error {} | |||||||
| macro_rules! documents { | macro_rules! documents { | ||||||
|     ($data:tt) => {{ |     ($data:tt) => {{ | ||||||
|         let documents = serde_json::json!($data); |         let documents = serde_json::json!($data); | ||||||
|         let mut writer = std::io::Cursor::new(Vec::new()); |         let documents = match documents { | ||||||
|         let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); |             object @ serde_json::Value::Object(_) => vec![object], | ||||||
|         let documents = serde_json::to_vec(&documents).unwrap(); |             serde_json::Value::Array(objects) => objects, | ||||||
|         builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); |             invalid => { | ||||||
|         builder.finish().unwrap(); |                 panic!("an array of objects must be specified, {:#?} is not an array", invalid) | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         writer.set_position(0); |         let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         for document in documents { | ||||||
|  |             let object = match document { | ||||||
|  |                 serde_json::Value::Object(object) => object, | ||||||
|  |                 invalid => panic!("an object must be specified, {:#?} is not an object", invalid), | ||||||
|  |             }; | ||||||
|  |             builder.append_json_object(&object).unwrap(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         crate::documents::DocumentBatchReader::from_reader(writer).unwrap() |         let vector = builder.into_inner().unwrap(); | ||||||
|  |         crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap() | ||||||
|     }}; |     }}; | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -180,7 +189,7 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn create_documents_no_errors() { |     fn create_documents_no_errors() { | ||||||
|         let json = json!({ |         let value = json!({ | ||||||
|             "number": 1, |             "number": 1, | ||||||
|             "string": "this is a field", |             "string": "this is a field", | ||||||
|             "array": ["an", "array"], |             "array": ["an", "array"], | ||||||
| @@ -190,26 +199,18 @@ mod test { | |||||||
|             "bool": true |             "bool": true | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         let json = serde_json::to_vec(&json).unwrap(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_json_object(value.as_object().unwrap()).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|         let mut v = Vec::new(); |         let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) | ||||||
|         let mut cursor = io::Cursor::new(&mut v); |             .unwrap() | ||||||
|  |             .into_cursor_and_fields_index(); | ||||||
|  |  | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |         assert_eq!(index.iter().count(), 5); | ||||||
|  |         let reader = documents.next_document().unwrap().unwrap(); | ||||||
|         builder.extend_from_json(Cursor::new(json)).unwrap(); |         assert_eq!(reader.iter().count(), 5); | ||||||
|  |         assert!(documents.next_document().unwrap().is_none()); | ||||||
|         builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = |  | ||||||
|             DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(documents.index().iter().count(), 5); |  | ||||||
|  |  | ||||||
|         let reader = documents.next_document_with_index().unwrap().unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(reader.1.iter().count(), 5); |  | ||||||
|         assert!(documents.next_document_with_index().unwrap().is_none()); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
| @@ -221,101 +222,56 @@ mod test { | |||||||
|             "toto": false, |             "toto": false, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         let doc1 = serde_json::to_vec(&doc1).unwrap(); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         let doc2 = serde_json::to_vec(&doc2).unwrap(); |         builder.append_json_object(doc1.as_object().unwrap()).unwrap(); | ||||||
|  |         builder.append_json_object(doc2.as_object().unwrap()).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|         let mut v = Vec::new(); |         let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector)) | ||||||
|         let mut cursor = io::Cursor::new(&mut v); |             .unwrap() | ||||||
|  |             .into_cursor_and_fields_index(); | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |         assert_eq!(index.iter().count(), 2); | ||||||
|  |         let reader = documents.next_document().unwrap().unwrap(); | ||||||
|         builder.extend_from_json(Cursor::new(doc1)).unwrap(); |         assert_eq!(reader.iter().count(), 1); | ||||||
|         builder.extend_from_json(Cursor::new(doc2)).unwrap(); |         assert!(documents.next_document().unwrap().is_some()); | ||||||
|  |         assert!(documents.next_document().unwrap().is_none()); | ||||||
|         builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = |  | ||||||
|             DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(documents.index().iter().count(), 2); |  | ||||||
|  |  | ||||||
|         let reader = documents.next_document_with_index().unwrap().unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(reader.1.iter().count(), 1); |  | ||||||
|         assert!(documents.next_document_with_index().unwrap().is_some()); |  | ||||||
|         assert!(documents.next_document_with_index().unwrap().is_none()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn add_documents_array() { |  | ||||||
|         let docs = json!([ |  | ||||||
|             { "toto": false }, |  | ||||||
|             { "tata": "hello" }, |  | ||||||
|         ]); |  | ||||||
|  |  | ||||||
|         let docs = serde_json::to_vec(&docs).unwrap(); |  | ||||||
|  |  | ||||||
|         let mut v = Vec::new(); |  | ||||||
|         let mut cursor = io::Cursor::new(&mut v); |  | ||||||
|  |  | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         builder.extend_from_json(Cursor::new(docs)).unwrap(); |  | ||||||
|  |  | ||||||
|         builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = |  | ||||||
|             DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(documents.index().iter().count(), 2); |  | ||||||
|  |  | ||||||
|         let reader = documents.next_document_with_index().unwrap().unwrap(); |  | ||||||
|  |  | ||||||
|         assert_eq!(reader.1.iter().count(), 1); |  | ||||||
|         assert!(documents.next_document_with_index().unwrap().is_some()); |  | ||||||
|         assert!(documents.next_document_with_index().unwrap().is_none()); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn add_invalid_document_format() { |  | ||||||
|         let mut v = Vec::new(); |  | ||||||
|         let mut cursor = io::Cursor::new(&mut v); |  | ||||||
|  |  | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let docs = json!([[ |  | ||||||
|             { "toto": false }, |  | ||||||
|             { "tata": "hello" }, |  | ||||||
|         ]]); |  | ||||||
|  |  | ||||||
|         let docs = serde_json::to_vec(&docs).unwrap(); |  | ||||||
|         assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); |  | ||||||
|  |  | ||||||
|         let docs = json!("hello"); |  | ||||||
|         let docs = serde_json::to_vec(&docs).unwrap(); |  | ||||||
|  |  | ||||||
|         assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn test_nested() { |     fn test_nested() { | ||||||
|         let mut docs = documents!([{ |         let docs_reader = documents!([{ | ||||||
|             "hello": { |             "hello": { | ||||||
|                 "toto": ["hello"] |                 "toto": ["hello"] | ||||||
|             } |             } | ||||||
|         }]); |         }]); | ||||||
|  |  | ||||||
|         let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); |         let (mut cursor, _) = docs_reader.into_cursor_and_fields_index(); | ||||||
|  |         let doc = cursor.next_document().unwrap().unwrap(); | ||||||
|         let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); |         let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); | ||||||
|         assert_eq!(nested, json!({ "toto": ["hello"] })); |         assert_eq!(nested, json!({ "toto": ["hello"] })); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn out_of_order_fields() { |     fn out_of_order_json_fields() { | ||||||
|         let _documents = documents!([ |         let _documents = documents!([ | ||||||
|             {"id": 1,"b": 0}, |             {"id": 1,"b": 0}, | ||||||
|             {"id": 2,"a": 0,"b": 0}, |             {"id": 2,"a": 0,"b": 0}, | ||||||
|         ]); |         ]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn out_of_order_csv_fields() { | ||||||
|  |         let csv1_content = "id:number,b\n1,0"; | ||||||
|  |         let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); | ||||||
|  |  | ||||||
|  |         let csv2_content = "id:number,a,b\n2,0,0"; | ||||||
|  |         let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content)); | ||||||
|  |  | ||||||
|  |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_csv(csv1).unwrap(); | ||||||
|  |         builder.append_csv(csv2).unwrap(); | ||||||
|  |         let vector = builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|  |         DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,11 +1,9 @@ | |||||||
| use std::io; | use std::convert::TryInto; | ||||||
| use std::io::{BufReader, Read}; | use std::{error, fmt, io}; | ||||||
| use std::mem::size_of; |  | ||||||
|  |  | ||||||
| use byteorder::{BigEndian, ReadBytesExt}; |  | ||||||
| use obkv::KvReader; | use obkv::KvReader; | ||||||
|  |  | ||||||
| use super::{DocumentsBatchIndex, DocumentsMetadata, Error}; | use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; | ||||||
| use crate::FieldId; | use crate::FieldId; | ||||||
|  |  | ||||||
| /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with | /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with | ||||||
| @@ -13,63 +11,106 @@ use crate::FieldId; | |||||||
| /// | /// | ||||||
| /// The documents are returned in the form of `obkv::Reader` where each field is identified with a | /// The documents are returned in the form of `obkv::Reader` where each field is identified with a | ||||||
| /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. | /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. | ||||||
| pub struct DocumentBatchReader<R> { | pub struct DocumentsBatchReader<R> { | ||||||
|     reader: BufReader<R>, |     cursor: grenad::ReaderCursor<R>, | ||||||
|     metadata: DocumentsMetadata, |     fields_index: DocumentsBatchIndex, | ||||||
|     buffer: Vec<u8>, |  | ||||||
|     seen_documents: usize, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<R: io::Read + io::Seek> DocumentBatchReader<R> { | impl<R: io::Read + io::Seek> DocumentsBatchReader<R> { | ||||||
|  |     pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self { | ||||||
|  |         Self { cursor: cursor.cursor, fields_index } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /// Construct a `DocumentsReader` from a reader. |     /// Construct a `DocumentsReader` from a reader. | ||||||
|     /// |     /// | ||||||
|     /// It first retrieves the index, then moves to the first document. Subsequent calls to |     /// It first retrieves the index, then moves to the first document. Use the `into_cursor` | ||||||
|     /// `next_document` advance the document reader until all the documents have been read. |     /// method to iterator over the documents, from the first to the last. | ||||||
|     pub fn from_reader(mut reader: R) -> Result<Self, Error> { |     pub fn from_reader(reader: R) -> Result<Self, Error> { | ||||||
|         let mut buffer = Vec::new(); |         let reader = grenad::Reader::new(reader)?; | ||||||
|  |         let mut cursor = reader.into_cursor()?; | ||||||
|  |  | ||||||
|         let meta_offset = reader.read_u64::<BigEndian>()?; |         let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? { | ||||||
|         reader.seek(io::SeekFrom::Start(meta_offset))?; |             Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?, | ||||||
|         reader.read_to_end(&mut buffer)?; |             None => return Err(Error::InvalidDocumentFormat), | ||||||
|         let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?; |         }; | ||||||
|  |  | ||||||
|         reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?; |         Ok(DocumentsBatchReader { cursor, fields_index }) | ||||||
|         buffer.clear(); |  | ||||||
|  |  | ||||||
|         let reader = BufReader::new(reader); |  | ||||||
|  |  | ||||||
|         Ok(Self { reader, metadata, buffer, seen_documents: 0 }) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a |     pub fn documents_count(&self) -> u32 { | ||||||
|     /// reference to the addition index. |         self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents") | ||||||
|     pub fn next_document_with_index<'a>( |  | ||||||
|         &'a mut self, |  | ||||||
|     ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> { |  | ||||||
|         if self.seen_documents < self.metadata.count { |  | ||||||
|             let doc_len = self.reader.read_u32::<BigEndian>()?; |  | ||||||
|             self.buffer.resize(doc_len as usize, 0); |  | ||||||
|             self.reader.read_exact(&mut self.buffer)?; |  | ||||||
|             self.seen_documents += 1; |  | ||||||
|  |  | ||||||
|             let reader = KvReader::new(&self.buffer); |  | ||||||
|             Ok(Some((&self.metadata.index, reader))) |  | ||||||
|         } else { |  | ||||||
|             Ok(None) |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Return the fields index for the documents batch. |  | ||||||
|     pub fn index(&self) -> &DocumentsBatchIndex { |  | ||||||
|         &self.metadata.index |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Returns the number of documents in the reader. |  | ||||||
|     pub fn len(&self) -> usize { |  | ||||||
|         self.metadata.count |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn is_empty(&self) -> bool { |     pub fn is_empty(&self) -> bool { | ||||||
|         self.len() == 0 |         self.cursor.len().saturating_sub(1) == 0 | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { | ||||||
|  |         &self.fields_index | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// This method returns a forward cursor over the documents. | ||||||
|  |     pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) { | ||||||
|  |         let DocumentsBatchReader { cursor, fields_index } = self; | ||||||
|  |         let mut cursor = DocumentsBatchCursor { cursor }; | ||||||
|  |         cursor.reset(); | ||||||
|  |         (cursor, fields_index) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A forward cursor over the documents in a `DocumentsBatchReader`. | ||||||
|  | pub struct DocumentsBatchCursor<R> { | ||||||
|  |     cursor: grenad::ReaderCursor<R>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<R> DocumentsBatchCursor<R> { | ||||||
|  |     /// Resets the cursor to be able to read from the start again. | ||||||
|  |     pub fn reset(&mut self) { | ||||||
|  |         self.cursor.reset(); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> { | ||||||
|  |     /// Returns the next document, starting from the first one. Subsequent calls to | ||||||
|  |     /// `next_document` advance the document reader until all the documents have been read. | ||||||
|  |     pub fn next_document( | ||||||
|  |         &mut self, | ||||||
|  |     ) -> Result<Option<KvReader<FieldId>>, DocumentsBatchCursorError> { | ||||||
|  |         match self.cursor.move_on_next()? { | ||||||
|  |             Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { | ||||||
|  |                 Ok(Some(KvReader::new(value))) | ||||||
|  |             } | ||||||
|  |             _otherwise => Ok(None), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. | ||||||
|  | #[derive(Debug)] | ||||||
|  | pub enum DocumentsBatchCursorError { | ||||||
|  |     Grenad(grenad::Error), | ||||||
|  |     SerdeJson(serde_json::Error), | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl From<grenad::Error> for DocumentsBatchCursorError { | ||||||
|  |     fn from(error: grenad::Error) -> DocumentsBatchCursorError { | ||||||
|  |         DocumentsBatchCursorError::Grenad(error) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl From<serde_json::Error> for DocumentsBatchCursorError { | ||||||
|  |     fn from(error: serde_json::Error) -> DocumentsBatchCursorError { | ||||||
|  |         DocumentsBatchCursorError::SerdeJson(error) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl error::Error for DocumentsBatchCursorError {} | ||||||
|  |  | ||||||
|  | impl fmt::Display for DocumentsBatchCursorError { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         match self { | ||||||
|  |             DocumentsBatchCursorError::Grenad(e) => e.fmt(f), | ||||||
|  |             DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f), | ||||||
|  |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,14 +1,11 @@ | |||||||
| use std::collections::BTreeMap; |  | ||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::io::{Cursor, Write}; | use std::io::Write; | ||||||
|  |  | ||||||
| use byteorder::WriteBytesExt; |  | ||||||
| use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; | use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; | ||||||
| use serde::Deserialize; |  | ||||||
| use serde_json::Value; |  | ||||||
|  |  | ||||||
| use super::{ByteCounter, DocumentsBatchIndex, Error}; | use super::Error; | ||||||
| use crate::FieldId; | use crate::documents::DocumentsBatchBuilder; | ||||||
|  | use crate::Object; | ||||||
|  |  | ||||||
| macro_rules! tri { | macro_rules! tri { | ||||||
|     ($e:expr) => { |     ($e:expr) => { | ||||||
| @@ -19,54 +16,15 @@ macro_rules! tri { | |||||||
|     }; |     }; | ||||||
| } | } | ||||||
|  |  | ||||||
| struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); |  | ||||||
|  |  | ||||||
| impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { |  | ||||||
|     type Value = FieldId; |  | ||||||
|  |  | ||||||
|     fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error> |  | ||||||
|     where |  | ||||||
|         D: serde::Deserializer<'de>, |  | ||||||
|     { |  | ||||||
|         deserializer.deserialize_str(self) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { |  | ||||||
|     type Value = FieldId; |  | ||||||
|  |  | ||||||
|     fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> |  | ||||||
|     where |  | ||||||
|         E: serde::de::Error, |  | ||||||
|     { |  | ||||||
|         Ok(self.0.insert(v)) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { |  | ||||||
|         write!(f, "a string") |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| struct ValueDeserializer; |  | ||||||
|  |  | ||||||
| impl<'de> DeserializeSeed<'de> for ValueDeserializer { |  | ||||||
|     type Value = serde_json::Value; |  | ||||||
|  |  | ||||||
|     fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error> |  | ||||||
|     where |  | ||||||
|         D: serde::Deserializer<'de>, |  | ||||||
|     { |  | ||||||
|         serde_json::Value::deserialize(deserializer) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub struct DocumentVisitor<'a, W> { | pub struct DocumentVisitor<'a, W> { | ||||||
|     pub inner: &'a mut ByteCounter<W>, |     inner: &'a mut DocumentsBatchBuilder<W>, | ||||||
|     pub index: &'a mut DocumentsBatchIndex, |     object: Object, | ||||||
|     pub obkv_buffer: &'a mut Vec<u8>, | } | ||||||
|     pub value_buffer: &'a mut Vec<u8>, |  | ||||||
|     pub values: &'a mut BTreeMap<FieldId, Value>, | impl<'a, W> DocumentVisitor<'a, W> { | ||||||
|     pub count: &'a mut usize, |     pub fn new(inner: &'a mut DocumentsBatchBuilder<W>) -> Self { | ||||||
|  |         DocumentVisitor { inner, object: Object::new() } | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { | impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { | ||||||
| @@ -88,28 +46,12 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { | |||||||
|     where |     where | ||||||
|         A: MapAccess<'de>, |         A: MapAccess<'de>, | ||||||
|     { |     { | ||||||
|         while let Some((key, value)) = |         self.object.clear(); | ||||||
|             map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? |         while let Some((key, value)) = map.next_entry()? { | ||||||
|         { |             self.object.insert(key, value); | ||||||
|             self.values.insert(key, value); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         self.obkv_buffer.clear(); |         tri!(self.inner.append_json_object(&self.object)); | ||||||
|         let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); |  | ||||||
|         for (key, value) in self.values.iter() { |  | ||||||
|             self.value_buffer.clear(); |  | ||||||
|             // This is guaranteed to work |  | ||||||
|             tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); |  | ||||||
|             tri!(obkv.insert(*key, &self.value_buffer)); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let reader = tri!(obkv.into_inner()).into_inner(); |  | ||||||
|  |  | ||||||
|         tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32)); |  | ||||||
|         tri!(self.inner.write_all(reader)); |  | ||||||
|  |  | ||||||
|         *self.count += 1; |  | ||||||
|         self.values.clear(); |  | ||||||
|  |  | ||||||
|         Ok(Ok(())) |         Ok(Ok(())) | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -4,12 +4,11 @@ use std::{io, str}; | |||||||
|  |  | ||||||
| use heed::{Error as HeedError, MdbError}; | use heed::{Error as HeedError, MdbError}; | ||||||
| use rayon::ThreadPoolBuildError; | use rayon::ThreadPoolBuildError; | ||||||
| use serde_json::{Map, Value}; | use serde_json::Value; | ||||||
| use thiserror::Error; | use thiserror::Error; | ||||||
|  |  | ||||||
| use crate::{CriterionError, DocumentId, FieldId, SortError}; | use crate::documents::{self, DocumentsBatchCursorError}; | ||||||
|  | use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; | ||||||
| pub type Object = Map<String, Value>; |  | ||||||
|  |  | ||||||
| pub fn is_reserved_keyword(keyword: &str) -> bool { | pub fn is_reserved_keyword(keyword: &str) -> bool { | ||||||
|     ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) |     ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) | ||||||
| @@ -37,6 +36,8 @@ pub enum InternalError { | |||||||
|     FieldIdMappingMissingEntry { key: FieldId }, |     FieldIdMappingMissingEntry { key: FieldId }, | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     Fst(#[from] fst::Error), |     Fst(#[from] fst::Error), | ||||||
|  |     #[error(transparent)] | ||||||
|  |     DocumentsError(#[from] documents::Error), | ||||||
|     #[error("Invalid compression type have been specified to grenad.")] |     #[error("Invalid compression type have been specified to grenad.")] | ||||||
|     GrenadInvalidCompressionType, |     GrenadInvalidCompressionType, | ||||||
|     #[error("Invalid grenad file with an invalid version format.")] |     #[error("Invalid grenad file with an invalid version format.")] | ||||||
| @@ -123,6 +124,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco | |||||||
|     MaxDatabaseSizeReached, |     MaxDatabaseSizeReached, | ||||||
|     #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] |     #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] | ||||||
|     MissingDocumentId { primary_key: String, document: Object }, |     MissingDocumentId { primary_key: String, document: Object }, | ||||||
|  |     #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] | ||||||
|  |     TooManyDocumentIds { primary_key: String, document: Object }, | ||||||
|     #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] |     #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] | ||||||
|     MissingPrimaryKey, |     MissingPrimaryKey, | ||||||
|     #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] |     #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] | ||||||
| @@ -141,13 +144,19 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco | |||||||
|  |  | ||||||
| #[derive(Error, Debug)] | #[derive(Error, Debug)] | ||||||
| pub enum GeoError { | pub enum GeoError { | ||||||
|  |     #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] | ||||||
|  |     NotAnObject { document_id: Value, value: Value }, | ||||||
|  |     #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] | ||||||
|  |     MissingLatitudeAndLongitude { document_id: Value }, | ||||||
|     #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] |     #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] | ||||||
|     MissingLatitude { document_id: Value }, |     MissingLatitude { document_id: Value }, | ||||||
|     #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] |     #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] | ||||||
|     MissingLongitude { document_id: Value }, |     MissingLongitude { document_id: Value }, | ||||||
|     #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] |     #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] | ||||||
|  |     BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, | ||||||
|  |     #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] | ||||||
|     BadLatitude { document_id: Value, value: Value }, |     BadLatitude { document_id: Value, value: Value }, | ||||||
|     #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] |     #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] | ||||||
|     BadLongitude { document_id: Value, value: Value }, |     BadLongitude { document_id: Value, value: Value }, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -178,6 +187,7 @@ macro_rules! error_from_sub_error { | |||||||
| error_from_sub_error! { | error_from_sub_error! { | ||||||
|     FieldIdMapMissingEntry => InternalError, |     FieldIdMapMissingEntry => InternalError, | ||||||
|     fst::Error => InternalError, |     fst::Error => InternalError, | ||||||
|  |     documents::Error => InternalError, | ||||||
|     str::Utf8Error => InternalError, |     str::Utf8Error => InternalError, | ||||||
|     ThreadPoolBuildError => InternalError, |     ThreadPoolBuildError => InternalError, | ||||||
|     SerializationError => InternalError, |     SerializationError => InternalError, | ||||||
| @@ -203,6 +213,15 @@ where | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl From<DocumentsBatchCursorError> for Error { | ||||||
|  |     fn from(error: DocumentsBatchCursorError) -> Error { | ||||||
|  |         match error { | ||||||
|  |             DocumentsBatchCursorError::Grenad(e) => Error::from(e), | ||||||
|  |             DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl From<Infallible> for Error { | impl From<Infallible> for Error { | ||||||
|     fn from(_error: Infallible) -> Error { |     fn from(_error: Infallible) -> Error { | ||||||
|         unreachable!() |         unreachable!() | ||||||
|   | |||||||
| @@ -1212,10 +1212,11 @@ pub(crate) mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1234,7 +1235,7 @@ pub(crate) mod tests { | |||||||
|         // we add all the documents a second time. we are supposed to get the same |         // we add all the documents a second time. we are supposed to get the same | ||||||
|         // field_distribution in the end |         // field_distribution in the end | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         let content = documents!([ |         let content = documents!([ | ||||||
| @@ -1242,7 +1243,8 @@ pub(crate) mod tests { | |||||||
|             { "id": 2, "name": "bob", "age": 20 }, |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|             { "id": 2, "name": "bob", "age": 20 }, |             { "id": 2, "name": "bob", "age": 20 }, | ||||||
|         ]); |         ]); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1265,10 +1267,11 @@ pub(crate) mod tests { | |||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1333,10 +1336,11 @@ pub(crate) mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1390,10 +1394,11 @@ pub(crate) mod tests { | |||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault; | |||||||
| pub use filter_parser::{Condition, FilterCondition}; | pub use filter_parser::{Condition, FilterCondition}; | ||||||
| use fxhash::{FxHasher32, FxHasher64}; | use fxhash::{FxHasher32, FxHasher64}; | ||||||
| pub use grenad::CompressionType; | pub use grenad::CompressionType; | ||||||
| use serde_json::{Map, Value}; | use serde_json::Value; | ||||||
| pub use {charabia as tokenizer, heed}; | pub use {charabia as tokenizer, heed}; | ||||||
|  |  | ||||||
| pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | ||||||
| @@ -43,20 +43,21 @@ pub use self::search::{ | |||||||
|  |  | ||||||
| pub type Result<T> = std::result::Result<T, error::Error>; | pub type Result<T> = std::result::Result<T, error::Error>; | ||||||
|  |  | ||||||
|  | pub type Attribute = u32; | ||||||
|  | pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>; | ||||||
|  | pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | ||||||
|  | pub type DocumentId = u32; | ||||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||||
| pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>; | pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>; | ||||||
|  | pub type FieldDistribution = BTreeMap<String, u64>; | ||||||
|  | pub type FieldId = u16; | ||||||
|  | pub type Object = serde_json::Map<String, serde_json::Value>; | ||||||
|  | pub type Position = u32; | ||||||
|  | pub type RelativePosition = u16; | ||||||
| pub type SmallString32 = smallstr::SmallString<[u8; 32]>; | pub type SmallString32 = smallstr::SmallString<[u8; 32]>; | ||||||
| pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>; | pub type SmallVec16<T> = smallvec::SmallVec<[T; 16]>; | ||||||
| pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>; | pub type SmallVec32<T> = smallvec::SmallVec<[T; 32]>; | ||||||
| pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>; | pub type SmallVec8<T> = smallvec::SmallVec<[T; 8]>; | ||||||
| pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>; |  | ||||||
| pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; |  | ||||||
| pub type Attribute = u32; |  | ||||||
| pub type DocumentId = u32; |  | ||||||
| pub type FieldId = u16; |  | ||||||
| pub type Position = u32; |  | ||||||
| pub type RelativePosition = u16; |  | ||||||
| pub type FieldDistribution = BTreeMap<String, u64>; |  | ||||||
|  |  | ||||||
| /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata | /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata | ||||||
| /// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point | /// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point | ||||||
| @@ -82,7 +83,7 @@ pub fn obkv_to_json( | |||||||
|     displayed_fields: &[FieldId], |     displayed_fields: &[FieldId], | ||||||
|     fields_ids_map: &FieldsIdsMap, |     fields_ids_map: &FieldsIdsMap, | ||||||
|     obkv: obkv::KvReaderU16, |     obkv: obkv::KvReaderU16, | ||||||
| ) -> Result<Map<String, Value>> { | ) -> Result<Object> { | ||||||
|     displayed_fields |     displayed_fields | ||||||
|         .iter() |         .iter() | ||||||
|         .copied() |         .copied() | ||||||
|   | |||||||
| @@ -35,7 +35,7 @@ mod test { | |||||||
|     use roaring::RoaringBitmap; |     use roaring::RoaringBitmap; | ||||||
|     use serde_json::{json, Value}; |     use serde_json::{json, Value}; | ||||||
|  |  | ||||||
|     use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; |     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
|     use crate::index::tests::TempIndex; |     use crate::index::tests::TempIndex; | ||||||
|     use crate::index::Index; |     use crate::index::Index; | ||||||
|     use crate::update::{ |     use crate::update::{ | ||||||
| @@ -43,14 +43,11 @@ mod test { | |||||||
|     }; |     }; | ||||||
|     use crate::{DocumentId, FieldId, BEU32}; |     use crate::{DocumentId, FieldId, BEU32}; | ||||||
|  |  | ||||||
|     static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents); |     static JSON: Lazy<Vec<u8>> = Lazy::new(|| { | ||||||
|  |  | ||||||
|     fn generate_documents() -> Vec<u8> { |  | ||||||
|         let mut rng = rand::thread_rng(); |         let mut rng = rand::thread_rng(); | ||||||
|         let num_docs = rng.gen_range(10..30); |         let num_docs = rng.gen_range(10..30); | ||||||
|  |  | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|         let txts = ["Toto", "Titi", "Tata"]; |         let txts = ["Toto", "Titi", "Tata"]; | ||||||
|         let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>(); |         let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>(); | ||||||
|         let cat_ints = (1..10).collect::<Vec<_>>(); |         let cat_ints = (1..10).collect::<Vec<_>>(); | ||||||
| @@ -63,7 +60,7 @@ mod test { | |||||||
|             let mut sample_ints = cat_ints.clone(); |             let mut sample_ints = cat_ints.clone(); | ||||||
|             sample_ints.shuffle(&mut rng); |             sample_ints.shuffle(&mut rng); | ||||||
|  |  | ||||||
|             let doc = json!({ |             let json = json!({ | ||||||
|                 "id": i, |                 "id": i, | ||||||
|                 "txt": txt, |                 "txt": txt, | ||||||
|                 "cat-int": rng.gen_range(0..3), |                 "cat-int": rng.gen_range(0..3), | ||||||
| @@ -71,13 +68,16 @@ mod test { | |||||||
|                 "cat-ints": sample_ints[..(rng.gen_range(0..3))], |                 "cat-ints": sample_ints[..(rng.gen_range(0..3))], | ||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); |             let object = match json { | ||||||
|             builder.extend_from_json(doc).unwrap(); |                 Value::Object(object) => object, | ||||||
|  |                 _ => panic!(), | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             builder.append_json_object(&object).unwrap(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         builder.finish().unwrap(); |         builder.into_inner().unwrap() | ||||||
|         cursor.into_inner() |     }); | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Returns a temporary index populated with random test documents, the FieldId for the |     /// Returns a temporary index populated with random test documents, the FieldId for the | ||||||
|     /// distinct attribute, and the RoaringBitmap with the document ids. |     /// distinct attribute, and the RoaringBitmap with the document ids. | ||||||
| @@ -97,20 +97,22 @@ mod test { | |||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut addition = |         let addition = | ||||||
|             IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|  |  | ||||||
|         let reader = |         let reader = | ||||||
|             crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); |             crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) | ||||||
|  |                 .unwrap(); | ||||||
|  |  | ||||||
|         addition.add_documents(reader).unwrap(); |         let (addition, user_error) = addition.add_documents(reader).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         addition.execute().unwrap(); |         addition.execute().unwrap(); | ||||||
|  |  | ||||||
|         let fields_map = index.fields_ids_map(&txn).unwrap(); |         let fields_map = index.fields_ids_map(&txn).unwrap(); | ||||||
|         let fid = fields_map.id(&distinct).unwrap(); |         let fid = fields_map.id(&distinct).unwrap(); | ||||||
|  |  | ||||||
|         let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); |         let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); | ||||||
|         let map = (0..documents.len() as u32).collect(); |         let map = (0..documents.documents_count() as u32).collect(); | ||||||
|  |  | ||||||
|         txn.commit().unwrap(); |         txn.commit().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -648,10 +648,11 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -100,9 +100,10 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // Clear all documents from the database. |         // Clear all documents from the database. | ||||||
|   | |||||||
| @@ -657,13 +657,13 @@ mod tests { | |||||||
|     fn insert_documents<'t, R: std::io::Read + std::io::Seek>( |     fn insert_documents<'t, R: std::io::Read + std::io::Seek>( | ||||||
|         wtxn: &mut RwTxn<'t, '_>, |         wtxn: &mut RwTxn<'t, '_>, | ||||||
|         index: &'t Index, |         index: &'t Index, | ||||||
|         documents: crate::documents::DocumentBatchReader<R>, |         documents: crate::documents::DocumentsBatchReader<R>, | ||||||
|     ) { |     ) { | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|             IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -701,9 +701,10 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // delete those documents, ids are synchronous therefore 0, 1, and 2. |         // delete those documents, ids are synchronous therefore 0, 1, and 2. | ||||||
| @@ -736,9 +737,10 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // Delete not all of the documents but some of them. |         // Delete not all of the documents but some of them. | ||||||
|   | |||||||
							
								
								
									
										365
									
								
								milli/src/update/index_documents/enrich.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										365
									
								
								milli/src/update/index_documents/enrich.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,365 @@ | |||||||
|  | use std::io::{Read, Seek}; | ||||||
|  | use std::result::Result as StdResult; | ||||||
|  | use std::{fmt, iter}; | ||||||
|  |  | ||||||
|  | use serde::{Deserialize, Serialize}; | ||||||
|  | use serde_json::Value; | ||||||
|  |  | ||||||
|  | use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; | ||||||
|  | use crate::error::{GeoError, InternalError, UserError}; | ||||||
|  | use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | ||||||
|  | use crate::{FieldId, Index, Object, Result}; | ||||||
|  |  | ||||||
|  | /// The symbol used to define levels in a nested primary key. | ||||||
|  | const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; | ||||||
|  |  | ||||||
|  | /// The default primary that is used when not specified. | ||||||
|  | const DEFAULT_PRIMARY_KEY: &str = "id"; | ||||||
|  |  | ||||||
|  | /// This function validates and enrich the documents by checking that: | ||||||
|  | ///  - we can infer a primary key, | ||||||
|  | ///  - all the documents id exist and are extracted, | ||||||
|  | ///  - the validity of them but also, | ||||||
|  | ///  - the validity of the `_geo` field depending on the settings. | ||||||
|  | pub fn enrich_documents_batch<R: Read + Seek>( | ||||||
|  |     rtxn: &heed::RoTxn, | ||||||
|  |     index: &Index, | ||||||
|  |     autogenerate_docids: bool, | ||||||
|  |     reader: DocumentsBatchReader<R>, | ||||||
|  | ) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> { | ||||||
|  |     let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); | ||||||
|  |  | ||||||
|  |     let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; | ||||||
|  |     let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; | ||||||
|  |  | ||||||
|  |     // The primary key *field id* that has already been set for this index or the one | ||||||
|  |     // we will guess by searching for the first key that contains "id" as a substring. | ||||||
|  |     let primary_key = match index.primary_key(rtxn)? { | ||||||
|  |         Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { | ||||||
|  |             PrimaryKey::nested(primary_key) | ||||||
|  |         } | ||||||
|  |         Some(primary_key) => match documents_batch_index.id(primary_key) { | ||||||
|  |             Some(id) => PrimaryKey::flat(primary_key, id), | ||||||
|  |             None if autogenerate_docids => { | ||||||
|  |                 PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) | ||||||
|  |             } | ||||||
|  |             None => { | ||||||
|  |                 return match cursor.next_document()? { | ||||||
|  |                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { | ||||||
|  |                         primary_key: primary_key.to_string(), | ||||||
|  |                         document: obkv_to_object(&first_document, &documents_batch_index)?, | ||||||
|  |                     })), | ||||||
|  |                     None => Ok(Err(UserError::MissingPrimaryKey)), | ||||||
|  |                 }; | ||||||
|  |             } | ||||||
|  |         }, | ||||||
|  |         None => { | ||||||
|  |             let guessed = documents_batch_index | ||||||
|  |                 .iter() | ||||||
|  |                 .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) | ||||||
|  |                 .min_by_key(|(fid, _)| *fid); | ||||||
|  |             match guessed { | ||||||
|  |                 Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), | ||||||
|  |                 None if autogenerate_docids => PrimaryKey::flat( | ||||||
|  |                     DEFAULT_PRIMARY_KEY, | ||||||
|  |                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY), | ||||||
|  |                 ), | ||||||
|  |                 None => return Ok(Err(UserError::MissingPrimaryKey)), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     // If the settings specifies that a _geo field must be used therefore we must check the | ||||||
|  |     // validity of it in all the documents of this batch and this is when we return `Some`. | ||||||
|  |     let geo_field_id = match documents_batch_index.id("_geo") { | ||||||
|  |         Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id), | ||||||
|  |         _otherwise => None, | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     let mut count = 0; | ||||||
|  |     while let Some(document) = cursor.next_document()? { | ||||||
|  |         let document_id = match fetch_or_generate_document_id( | ||||||
|  |             &document, | ||||||
|  |             &documents_batch_index, | ||||||
|  |             primary_key, | ||||||
|  |             autogenerate_docids, | ||||||
|  |             &mut uuid_buffer, | ||||||
|  |             count, | ||||||
|  |         )? { | ||||||
|  |             Ok(document_id) => document_id, | ||||||
|  |             Err(user_error) => return Ok(Err(user_error)), | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { | ||||||
|  |             if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { | ||||||
|  |                 return Ok(Err(UserError::from(user_error))); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; | ||||||
|  |         external_ids.insert(count.to_be_bytes(), document_id)?; | ||||||
|  |  | ||||||
|  |         count += 1; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     let external_ids = writer_into_reader(external_ids)?; | ||||||
|  |     let primary_key_name = primary_key.name().to_string(); | ||||||
|  |     let reader = EnrichedDocumentsBatchReader::new( | ||||||
|  |         DocumentsBatchReader::new(cursor, documents_batch_index), | ||||||
|  |         primary_key_name, | ||||||
|  |         external_ids, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|  |     Ok(Ok(reader)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Retrieve the document id after validating it, returning a `UserError` | ||||||
|  | /// if the id is invalid or can't be guessed. | ||||||
|  | fn fetch_or_generate_document_id( | ||||||
|  |     document: &obkv::KvReader<FieldId>, | ||||||
|  |     documents_batch_index: &DocumentsBatchIndex, | ||||||
|  |     primary_key: PrimaryKey, | ||||||
|  |     autogenerate_docids: bool, | ||||||
|  |     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], | ||||||
|  |     count: u32, | ||||||
|  | ) -> Result<StdResult<DocumentId, UserError>> { | ||||||
|  |     match primary_key { | ||||||
|  |         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { | ||||||
|  |             match document.get(primary_key_id) { | ||||||
|  |                 Some(document_id_bytes) => { | ||||||
|  |                     let document_id = serde_json::from_slice(document_id_bytes) | ||||||
|  |                         .map_err(InternalError::SerdeJson)?; | ||||||
|  |                     match validate_document_id_value(document_id)? { | ||||||
|  |                         Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), | ||||||
|  |                         Err(user_error) => Ok(Err(user_error)), | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 None if autogenerate_docids => { | ||||||
|  |                     let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); | ||||||
|  |                     Ok(Ok(DocumentId::generated(uuid.to_string(), count))) | ||||||
|  |                 } | ||||||
|  |                 None => Ok(Err(UserError::MissingDocumentId { | ||||||
|  |                     primary_key: primary_key.to_string(), | ||||||
|  |                     document: obkv_to_object(&document, &documents_batch_index)?, | ||||||
|  |                 })), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         nested @ PrimaryKey::Nested { .. } => { | ||||||
|  |             let mut matching_documents_ids = Vec::new(); | ||||||
|  |             for (first_level_name, right) in nested.possible_level_names() { | ||||||
|  |                 if let Some(field_id) = documents_batch_index.id(first_level_name) { | ||||||
|  |                     if let Some(value_bytes) = document.get(field_id) { | ||||||
|  |                         let object = serde_json::from_slice(value_bytes) | ||||||
|  |                             .map_err(InternalError::SerdeJson)?; | ||||||
|  |                         fetch_matching_values(object, right, &mut matching_documents_ids); | ||||||
|  |  | ||||||
|  |                         if matching_documents_ids.len() >= 2 { | ||||||
|  |                             return Ok(Err(UserError::TooManyDocumentIds { | ||||||
|  |                                 primary_key: nested.name().to_string(), | ||||||
|  |                                 document: obkv_to_object(&document, &documents_batch_index)?, | ||||||
|  |                             })); | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             match matching_documents_ids.pop() { | ||||||
|  |                 Some(document_id) => match validate_document_id_value(document_id)? { | ||||||
|  |                     Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), | ||||||
|  |                     Err(user_error) => Ok(Err(user_error)), | ||||||
|  |                 }, | ||||||
|  |                 None => Ok(Err(UserError::MissingDocumentId { | ||||||
|  |                     primary_key: nested.name().to_string(), | ||||||
|  |                     document: obkv_to_object(&document, &documents_batch_index)?, | ||||||
|  |                 })), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A type that represent the type of primary key that has been set | ||||||
|  | /// for this index, a classic flat one or a nested one. | ||||||
|  | #[derive(Debug, Clone, Copy)] | ||||||
|  | enum PrimaryKey<'a> { | ||||||
|  |     Flat { name: &'a str, field_id: FieldId }, | ||||||
|  |     Nested { name: &'a str }, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl PrimaryKey<'_> { | ||||||
|  |     fn flat(name: &str, field_id: FieldId) -> PrimaryKey { | ||||||
|  |         PrimaryKey::Flat { name, field_id } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn nested(name: &str) -> PrimaryKey { | ||||||
|  |         PrimaryKey::Nested { name } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn name(&self) -> &str { | ||||||
|  |         match self { | ||||||
|  |             PrimaryKey::Flat { name, .. } => name, | ||||||
|  |             PrimaryKey::Nested { name } => name, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Returns an `Iterator` that gives all the possible fields names the primary key | ||||||
|  |     /// can have depending of the first level name and deepnes of the objects. | ||||||
|  |     fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { | ||||||
|  |         let name = self.name(); | ||||||
|  |         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) | ||||||
|  |             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) | ||||||
|  |             .chain(iter::once((name, ""))) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A type that represents a document id that has been retrieved from a document or auto-generated. | ||||||
|  | /// | ||||||
|  | /// In case the document id has been auto-generated, the document nth is kept to help | ||||||
|  | /// users debug if there is an issue with the document itself. | ||||||
|  | #[derive(Serialize, Deserialize, Clone)] | ||||||
|  | pub enum DocumentId { | ||||||
|  |     Retrieved { value: String }, | ||||||
|  |     Generated { value: String, document_nth: u32 }, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl DocumentId { | ||||||
|  |     fn retrieved(value: String) -> DocumentId { | ||||||
|  |         DocumentId::Retrieved { value } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn generated(value: String, document_nth: u32) -> DocumentId { | ||||||
|  |         DocumentId::Generated { value, document_nth } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn debug(&self) -> String { | ||||||
|  |         format!("{:?}", self) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn is_generated(&self) -> bool { | ||||||
|  |         matches!(self, DocumentId::Generated { .. }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn value(&self) -> &str { | ||||||
|  |         match self { | ||||||
|  |             DocumentId::Retrieved { value } => value, | ||||||
|  |             DocumentId::Generated { value, .. } => value, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for DocumentId { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|  |         match self { | ||||||
|  |             DocumentId::Retrieved { value } => write!(f, "{:?}", value), | ||||||
|  |             DocumentId::Generated { value, document_nth } => { | ||||||
|  |                 write!(f, "{{{:?}}} of the {}nth document", value, document_nth) | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn starts_with(selector: &str, key: &str) -> bool { | ||||||
|  |     selector.strip_prefix(key).map_or(false, |tail| { | ||||||
|  |         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) | ||||||
|  |     }) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { | ||||||
|  |     match value { | ||||||
|  |         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), | ||||||
|  |         otherwise => output.push(otherwise), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub fn fetch_matching_values_in_object( | ||||||
|  |     object: Object, | ||||||
|  |     selector: &str, | ||||||
|  |     base_key: &str, | ||||||
|  |     output: &mut Vec<Value>, | ||||||
|  | ) { | ||||||
|  |     for (key, value) in object { | ||||||
|  |         let base_key = if base_key.is_empty() { | ||||||
|  |             key.to_string() | ||||||
|  |         } else { | ||||||
|  |             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         if starts_with(selector, &base_key) { | ||||||
|  |             match value { | ||||||
|  |                 Value::Object(object) => { | ||||||
|  |                     fetch_matching_values_in_object(object, selector, &base_key, output) | ||||||
|  |                 } | ||||||
|  |                 value => output.push(value), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Returns a trimmed version of the document id or `None` if it is invalid. | ||||||
|  | pub fn validate_document_id(document_id: &str) -> Option<&str> { | ||||||
|  |     let document_id = document_id.trim(); | ||||||
|  |     if !document_id.is_empty() | ||||||
|  |         && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) | ||||||
|  |     { | ||||||
|  |         Some(document_id) | ||||||
|  |     } else { | ||||||
|  |         None | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Parses a Json encoded document id and validate it, returning a user error when it is one. | ||||||
|  | pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { | ||||||
|  |     match document_id { | ||||||
|  |         Value::String(string) => match validate_document_id(&string) { | ||||||
|  |             Some(s) if s.len() == string.len() => Ok(Ok(string)), | ||||||
|  |             Some(s) => Ok(Ok(s.to_string())), | ||||||
|  |             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), | ||||||
|  |         }, | ||||||
|  |         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), | ||||||
|  |         content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Try to extract an `f64` from a JSON `Value` and return the `Value` | ||||||
|  | /// in the `Err` variant if it failed. | ||||||
|  | pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | ||||||
|  |     let number = match value { | ||||||
|  |         Value::Number(ref n) => match n.as_f64() { | ||||||
|  |             Some(number) => number, | ||||||
|  |             None => return Err(value), | ||||||
|  |         }, | ||||||
|  |         Value::String(ref s) => match s.parse::<f64>() { | ||||||
|  |             Ok(number) => number, | ||||||
|  |             Err(_) => return Err(value), | ||||||
|  |         }, | ||||||
|  |         value => return Err(value), | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     if number.is_finite() { | ||||||
|  |         Ok(number) | ||||||
|  |     } else { | ||||||
|  |         Err(value) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> { | ||||||
|  |     use GeoError::*; | ||||||
|  |     let debug_id = || Value::from(id.debug()); | ||||||
|  |     match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { | ||||||
|  |         Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { | ||||||
|  |             (Some(lat), Some(lng)) => { | ||||||
|  |                 match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { | ||||||
|  |                     (Ok(_), Ok(_)) => Ok(Ok(())), | ||||||
|  |                     (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), | ||||||
|  |                     (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), | ||||||
|  |                     (Err(lat), Err(lng)) => { | ||||||
|  |                         Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), | ||||||
|  |             (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), | ||||||
|  |             (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), | ||||||
|  |         }, | ||||||
|  |         value => Ok(Err(NotAnObject { document_id: debug_id(), value })), | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,12 +1,12 @@ | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io; | use std::io; | ||||||
| use std::result::Result as StdResult; |  | ||||||
|  |  | ||||||
| use concat_arrays::concat_arrays; | use concat_arrays::concat_arrays; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||||
| use crate::error::GeoError; | use crate::error::GeoError; | ||||||
|  | use crate::update::index_documents::extract_finite_float_from_value; | ||||||
| use crate::{FieldId, InternalError, Result}; | use crate::{FieldId, InternalError, Result}; | ||||||
|  |  | ||||||
| /// Extracts the geographical coordinates contained in each document under the `_geo` field. | /// Extracts the geographical coordinates contained in each document under the `_geo` field. | ||||||
| @@ -29,9 +29,9 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | |||||||
|         let obkv = obkv::KvReader::new(value); |         let obkv = obkv::KvReader::new(value); | ||||||
|         // since we only needs the primary key when we throw an error we create this getter to |         // since we only needs the primary key when we throw an error we create this getter to | ||||||
|         // lazily get it when needed |         // lazily get it when needed | ||||||
|         let primary_key = || -> Value { |         let document_id = || -> Value { | ||||||
|             let primary_key = obkv.get(primary_key_id).unwrap(); |             let document_id = obkv.get(primary_key_id).unwrap(); | ||||||
|             serde_json::from_slice(primary_key).unwrap() |             serde_json::from_slice(document_id).unwrap() | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // first we get the two fields |         // first we get the two fields | ||||||
| @@ -40,32 +40,24 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|         if let Some((lat, lng)) = lat.zip(lng) { |         if let Some((lat, lng)) = lat.zip(lng) { | ||||||
|             // then we extract the values |             // then we extract the values | ||||||
|             let lat = extract_float_from_value( |             let lat = extract_finite_float_from_value( | ||||||
|                 serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, |                 serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||||
|             ) |             ) | ||||||
|             .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; |             .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; | ||||||
|  |  | ||||||
|             let lng = extract_float_from_value( |             let lng = extract_finite_float_from_value( | ||||||
|                 serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, |                 serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||||
|             ) |             ) | ||||||
|             .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; |             .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; | ||||||
|  |  | ||||||
|             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; |             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||||
|             writer.insert(docid_bytes, bytes)?; |             writer.insert(docid_bytes, bytes)?; | ||||||
|         } else if lat.is_none() && lng.is_some() { |         } else if lat.is_none() && lng.is_some() { | ||||||
|             return Err(GeoError::MissingLatitude { document_id: primary_key() })?; |             return Err(GeoError::MissingLatitude { document_id: document_id() })?; | ||||||
|         } else if lat.is_some() && lng.is_none() { |         } else if lat.is_some() && lng.is_none() { | ||||||
|             return Err(GeoError::MissingLongitude { document_id: primary_key() })?; |             return Err(GeoError::MissingLongitude { document_id: document_id() })?; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(writer_into_reader(writer)?) |     Ok(writer_into_reader(writer)?) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn extract_float_from_value(value: Value) -> StdResult<f64, Value> { |  | ||||||
|     match value { |  | ||||||
|         Value::Number(ref n) => n.as_f64().ok_or(value), |  | ||||||
|         Value::String(ref s) => s.parse::<f64>().map_err(|_| value), |  | ||||||
|         value => Err(value), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | mod enrich; | ||||||
| mod extract; | mod extract; | ||||||
| mod helpers; | mod helpers; | ||||||
| mod transform; | mod transform; | ||||||
| @@ -7,6 +8,7 @@ use std::collections::HashSet; | |||||||
| use std::io::{Cursor, Read, Seek}; | use std::io::{Cursor, Read, Seek}; | ||||||
| use std::iter::FromIterator; | use std::iter::FromIterator; | ||||||
| use std::num::{NonZeroU32, NonZeroUsize}; | use std::num::{NonZeroU32, NonZeroUsize}; | ||||||
|  | use std::result::Result as StdResult; | ||||||
|  |  | ||||||
| use crossbeam_channel::{Receiver, Sender}; | use crossbeam_channel::{Receiver, Sender}; | ||||||
| use heed::types::Str; | use heed::types::Str; | ||||||
| @@ -17,6 +19,11 @@ use serde::{Deserialize, Serialize}; | |||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
| use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | ||||||
|  |  | ||||||
|  | use self::enrich::enrich_documents_batch; | ||||||
|  | pub use self::enrich::{ | ||||||
|  |     extract_finite_float_from_value, validate_document_id, validate_document_id_value, | ||||||
|  |     validate_geo_from_json, DocumentId, | ||||||
|  | }; | ||||||
| pub use self::helpers::{ | pub use self::helpers::{ | ||||||
|     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, |     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, | ||||||
|     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, |     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, | ||||||
| @@ -25,13 +32,14 @@ pub use self::helpers::{ | |||||||
| }; | }; | ||||||
| use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | ||||||
| pub use self::transform::{Transform, TransformOutput}; | pub use self::transform::{Transform, TransformOutput}; | ||||||
| use crate::documents::DocumentBatchReader; | use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||||
|  | use crate::error::UserError; | ||||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||||
| use crate::update::{ | use crate::update::{ | ||||||
|     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, |     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, | ||||||
|     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, |     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, | ||||||
| }; | }; | ||||||
| use crate::{Index, Result, RoaringBitmapCodec, UserError}; | use crate::{Index, Result, RoaringBitmapCodec}; | ||||||
|  |  | ||||||
| static MERGED_DATABASE_COUNT: usize = 7; | static MERGED_DATABASE_COUNT: usize = 7; | ||||||
| static PREFIX_DATABASE_COUNT: usize = 5; | static PREFIX_DATABASE_COUNT: usize = 5; | ||||||
| @@ -117,29 +125,42 @@ where | |||||||
|  |  | ||||||
|     /// Adds a batch of documents to the current builder. |     /// Adds a batch of documents to the current builder. | ||||||
|     /// |     /// | ||||||
|     /// Since the documents are progressively added to the writer, a failure will cause a stale |     /// Since the documents are progressively added to the writer, a failure will cause only | ||||||
|     /// builder, and the builder must be discarded. |     /// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward. | ||||||
|     /// |     /// | ||||||
|     /// Returns the number of documents added to the builder. |     /// Returns the number of documents added to the builder. | ||||||
|     pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64> |     pub fn add_documents<R: Read + Seek>( | ||||||
|     where |         mut self, | ||||||
|         R: Read + Seek, |         reader: DocumentsBatchReader<R>, | ||||||
|     { |     ) -> Result<(Self, StdResult<u64, UserError>)> { | ||||||
|         // Early return when there is no document to add |         // Early return when there is no document to add | ||||||
|         if reader.is_empty() { |         if reader.is_empty() { | ||||||
|             return Ok(0); |             return Ok((self, Ok(0))); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         // We check for user errors in this validator and if there is one, we can return | ||||||
|  |         // the `IndexDocument` struct as it is valid to send more documents into it. | ||||||
|  |         // However, if there is an internal error we throw it away! | ||||||
|  |         let enriched_documents_reader = match enrich_documents_batch( | ||||||
|  |             self.wtxn, | ||||||
|  |             self.index, | ||||||
|  |             self.config.autogenerate_docids, | ||||||
|  |             reader, | ||||||
|  |         )? { | ||||||
|  |             Ok(reader) => reader, | ||||||
|  |             Err(user_error) => return Ok((self, Err(user_error))), | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         let indexed_documents = self |         let indexed_documents = self | ||||||
|             .transform |             .transform | ||||||
|             .as_mut() |             .as_mut() | ||||||
|             .expect("Invalid document addition state") |             .expect("Invalid document addition state") | ||||||
|             .read_documents(reader, self.wtxn, &self.progress)? |             .read_documents(enriched_documents_reader, self.wtxn, &self.progress)? | ||||||
|             as u64; |             as u64; | ||||||
|  |  | ||||||
|         self.added_documents += indexed_documents; |         self.added_documents += indexed_documents; | ||||||
|  |  | ||||||
|         Ok(indexed_documents) |         Ok((self, Ok(indexed_documents))) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[logging_timer::time("IndexDocuments::{}")] |     #[logging_timer::time("IndexDocuments::{}")] | ||||||
| @@ -590,9 +611,8 @@ mod tests { | |||||||
|     use maplit::hashset; |     use maplit::hashset; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::documents::DocumentBatchBuilder; |     use crate::documents::DocumentsBatchBuilder; | ||||||
|     use crate::update::DeleteDocuments; |     use crate::update::DeleteDocuments; | ||||||
|     use crate::HashMap; |  | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple_document_replacement() { |     fn simple_document_replacement() { | ||||||
| @@ -611,10 +631,11 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -627,10 +648,11 @@ mod tests { | |||||||
|         // Second we send 1 document with id 1, to erase the previous ones. |         // Second we send 1 document with id 1, to erase the previous ones. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "id": 1, "name": "updated kevin" } ]); |         let content = documents!([ { "id": 1, "name": "updated kevin" } ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -647,9 +669,11 @@ mod tests { | |||||||
|             { "id": 2, "name": "updated kevina" }, |             { "id": 2, "name": "updated kevina" }, | ||||||
|             { "id": 3, "name": "updated benoit" } |             { "id": 3, "name": "updated benoit" } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         // Check that there is **always** 3 documents. |         // Check that there is **always** 3 documents. | ||||||
| @@ -679,10 +703,11 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::UpdateDocuments, |             update_method: IndexDocumentsMethod::UpdateDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -707,9 +732,10 @@ mod tests { | |||||||
|         // Second we send 1 document with id 1, to force it to be merged with the previous one. |         // Second we send 1 document with id 1, to force it to be merged with the previous one. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "id": 1, "age": 25 } ]); |         let content = documents!([ { "id": 1, "age": 25 } ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -750,9 +776,10 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         assert!(builder.add_documents(content).is_err()); |         let (_builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         assert!(user_error.is_err()); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         // Check that there is no document. |         // Check that there is no document. | ||||||
| @@ -779,10 +806,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -799,9 +827,10 @@ mod tests { | |||||||
|         // Second we send 1 document with the generated uuid, to erase the previous ones. |         // Second we send 1 document with the generated uuid, to erase the previous ones. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); |         let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -841,9 +870,10 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -858,9 +888,10 @@ mod tests { | |||||||
|         let content = documents!([ { "name": "new kevin" } ]); |         let content = documents!([ { "name": "new kevin" } ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -883,9 +914,10 @@ mod tests { | |||||||
|         let content = documents!([]); |         let content = documents!([]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -909,19 +941,21 @@ mod tests { | |||||||
|         let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); |         let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         assert!(builder.add_documents(content).is_err()); |         let (_builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         assert!(user_error.is_err()); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         // First we send 1 document with a valid id. |         // First we send 1 document with a valid id. | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         // There is a space in the document id. |         // There is a space in the document id. | ||||||
|         let content = documents!([ { "id": 32, "name": "kevin" } ]); |         let content = documents!([ { "id": 32, "name": "kevin" } ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -948,9 +982,10 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -994,9 +1029,10 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1005,7 +1041,7 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::UpdateDocuments, |             update_method: IndexDocumentsMethod::UpdateDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { |           { | ||||||
| @@ -1015,7 +1051,8 @@ mod tests { | |||||||
|           } |           } | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|     } |     } | ||||||
| @@ -1042,9 +1079,10 @@ mod tests { | |||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1084,10 +1122,11 @@ mod tests { | |||||||
|           { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, |           { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, | ||||||
|           { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, |           { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1123,10 +1162,11 @@ mod tests { | |||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { "id": 0, "_geo": { "lng": 42 } } |           { "id": 0, "_geo": { "lng": 42 } } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         let error = builder.execute().unwrap_err(); |         let error = builder.execute().unwrap_err(); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &error.to_string(), |             &error.to_string(), | ||||||
| @@ -1136,10 +1176,11 @@ mod tests { | |||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { "id": 0, "_geo": { "lat": 42 } } |           { "id": 0, "_geo": { "lat": 42 } } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         let error = builder.execute().unwrap_err(); |         let error = builder.execute().unwrap_err(); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &error.to_string(), |             &error.to_string(), | ||||||
| @@ -1149,40 +1190,43 @@ mod tests { | |||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } |           { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         let error = builder.execute().unwrap_err(); |         let error = builder.execute().unwrap_err(); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &error.to_string(), |             &error.to_string(), | ||||||
|             r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."# |             r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } |           { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         let error = builder.execute().unwrap_err(); |         let error = builder.execute().unwrap_err(); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &error.to_string(), |             &error.to_string(), | ||||||
|             r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."# |             r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."# | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let documents = documents!([ |         let documents = documents!([ | ||||||
|           { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } |           { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } | ||||||
|         ]); |         ]); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(documents).unwrap(); |         let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         let error = builder.execute().unwrap_err(); |         let error = builder.execute().unwrap_err(); | ||||||
|         assert_eq!( |         assert_eq!( | ||||||
|             &error.to_string(), |             &error.to_string(), | ||||||
|             r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."# |             r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."# | ||||||
|         ); |         ); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1202,10 +1246,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); |         assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); | ||||||
| @@ -1222,10 +1267,11 @@ mod tests { | |||||||
|             { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } |             { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); |         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); | ||||||
|         assert!(external_documents_ids.get("30").is_some()); |         assert!(external_documents_ids.get("30").is_some()); | ||||||
| @@ -1234,10 +1280,11 @@ mod tests { | |||||||
|             { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } |             { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } | ||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1252,28 +1299,25 @@ mod tests { | |||||||
|  |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|         let mut big_object = HashMap::new(); |         let mut big_object = serde_json::Map::new(); | ||||||
|         big_object.insert(S("id"), "wow"); |         big_object.insert(S("id"), serde_json::Value::from("wow")); | ||||||
|         for i in 0..1000 { |         for i in 0..1000 { | ||||||
|             let key = i.to_string(); |             let key = i.to_string(); | ||||||
|             big_object.insert(key, "I am a text!"); |             big_object.insert(key, serde_json::Value::from("I am a text!")); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_json_object(&big_object).unwrap(); | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |         let vector = builder.into_inner().unwrap(); | ||||||
|         let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); |         let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|         builder.extend_from_json(big_object).unwrap(); |  | ||||||
|         builder.finish().unwrap(); |  | ||||||
|         cursor.set_position(0); |  | ||||||
|         let content = DocumentBatchReader::from_reader(cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1288,30 +1332,27 @@ mod tests { | |||||||
|  |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|         let mut big_object = HashMap::new(); |         let mut big_object = serde_json::Map::new(); | ||||||
|         big_object.insert(S("id"), "wow"); |         big_object.insert(S("id"), serde_json::Value::from("wow")); | ||||||
|         let content: String = (0..=u16::MAX) |         let content: String = (0..=u16::MAX) | ||||||
|             .into_iter() |             .into_iter() | ||||||
|             .map(|p| p.to_string()) |             .map(|p| p.to_string()) | ||||||
|             .reduce(|a, b| a + " " + b.as_ref()) |             .reduce(|a, b| a + " " + b.as_ref()) | ||||||
|             .unwrap(); |             .unwrap(); | ||||||
|         big_object.insert("content".to_string(), &content); |         big_object.insert("content".to_string(), serde_json::Value::from(content)); | ||||||
|  |  | ||||||
|         let mut cursor = Cursor::new(Vec::new()); |         let mut builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|  |         builder.append_json_object(&big_object).unwrap(); | ||||||
|         let big_object = serde_json::to_string(&big_object).unwrap(); |         let vector = builder.into_inner().unwrap(); | ||||||
|         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |         let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|         builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); |  | ||||||
|         builder.finish().unwrap(); |  | ||||||
|         cursor.set_position(0); |  | ||||||
|         let content = DocumentBatchReader::from_reader(cursor).unwrap(); |  | ||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1366,10 +1407,11 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1419,10 +1461,11 @@ mod tests { | |||||||
|  |  | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1551,10 +1594,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|  |  | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1583,6 +1627,58 @@ mod tests { | |||||||
|         assert_eq!(documents_ids, vec![3]); |         assert_eq!(documents_ids, vec![3]); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn retrieve_a_b_nested_document_id() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |         builder.set_primary_key("a.b".to_owned()); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |  | ||||||
|  |         let content = documents!({ "a" : { "b" : { "c" :  1 }}}); | ||||||
|  |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |         let builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         let (_builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |  | ||||||
|  |         // There must be an issue with the primary key no present in the given document | ||||||
|  |         user_error.unwrap_err(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn retrieve_a_b_c_nested_document_id() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |         let config = IndexerConfig::default(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = update::Settings::new(&mut wtxn, &index, &config); | ||||||
|  |         builder.set_primary_key("a.b.c".to_owned()); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |  | ||||||
|  |         let content = documents!({ "a" : { "b" : { "c" :  1 }}}); | ||||||
|  |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|  |         let builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|  |                 .unwrap(); | ||||||
|  |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); | ||||||
|  |         assert!(external_documents_ids.get("1").is_some()); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn test_facets_generation() { |     fn test_facets_generation() { | ||||||
|         let path = tempfile::tempdir().unwrap(); |         let path = tempfile::tempdir().unwrap(); | ||||||
| @@ -1621,10 +1717,11 @@ mod tests { | |||||||
|         // index the documents |         // index the documents | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1713,10 +1810,11 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1730,10 +1828,11 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1752,10 +1851,11 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1780,10 +1880,11 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1825,10 +1926,11 @@ mod tests { | |||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|     } |     } | ||||||
| @@ -1843,28 +1945,31 @@ mod tests { | |||||||
|  |  | ||||||
|         // Create 200 documents with a long text |         // Create 200 documents with a long text | ||||||
|         let content = { |         let content = { | ||||||
|             let documents: Vec<_> = (0..200i32) |             let documents_iter = (0..200i32) | ||||||
|                 .into_iter() |                 .into_iter() | ||||||
|                 .map(|i| serde_json::json!({ "id": i, "script": script })) |                 .map(|i| serde_json::json!({ "id": i, "script": script })) | ||||||
|                 .collect(); |                 .filter_map(|json| match json { | ||||||
|  |                     serde_json::Value::Object(object) => Some(object), | ||||||
|  |                     _ => None, | ||||||
|  |                 }); | ||||||
|  |  | ||||||
|             let mut writer = std::io::Cursor::new(Vec::new()); |             let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new()); | ||||||
|             let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); |             for object in documents_iter { | ||||||
|             let documents = serde_json::to_vec(&documents).unwrap(); |                 builder.append_json_object(&object).unwrap(); | ||||||
|             builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); |             } | ||||||
|             builder.finish().unwrap(); |             let vector = builder.into_inner().unwrap(); | ||||||
|             writer.set_position(0); |             crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() | ||||||
|             crate::documents::DocumentBatchReader::from_reader(writer).unwrap() |  | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // Index those 200 long documents |         // Index those 200 long documents | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // Create one long document |         // Create one long document | ||||||
| @@ -1875,10 +1980,11 @@ mod tests { | |||||||
|         // Index this one long document |         // Index this one long document | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
| @@ -1892,7 +1998,7 @@ mod tests { | |||||||
|         let index = Index::new(options, tmp).unwrap(); |         let index = Index::new(options, tmp).unwrap(); | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|         let indexer_config = IndexerConfig::default(); |         let indexer_config = IndexerConfig::default(); | ||||||
|         let mut builder = IndexDocuments::new( |         let builder = IndexDocuments::new( | ||||||
|             &mut wtxn, |             &mut wtxn, | ||||||
|             &index, |             &index, | ||||||
|             &indexer_config, |             &indexer_config, | ||||||
| @@ -1921,8 +2027,10 @@ mod tests { | |||||||
|             "branch_id_number": 0 |             "branch_id_number": 0 | ||||||
|         }]}; |         }]}; | ||||||
|  |  | ||||||
|         builder.add_documents(doc1).unwrap(); |         let (builder, user_error) = builder.add_documents(doc1).unwrap(); | ||||||
|         builder.add_documents(doc2).unwrap(); |         user_error.unwrap(); | ||||||
|  |         let (builder, user_error) = builder.add_documents(doc2).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|  |  | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -1931,4 +2039,51 @@ mod tests { | |||||||
|  |  | ||||||
|         assert_eq!(ids.len(), map.len()); |         assert_eq!(ids.len(), map.len()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn primary_key_must_not_contain_floats() { | ||||||
|  |         let tmp = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(4096 * 100); | ||||||
|  |         let index = Index::new(options, tmp).unwrap(); | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let indexer_config = IndexerConfig::default(); | ||||||
|  |         let builder = IndexDocuments::new( | ||||||
|  |             &mut wtxn, | ||||||
|  |             &index, | ||||||
|  |             &indexer_config, | ||||||
|  |             IndexDocumentsConfig::default(), | ||||||
|  |             |_| (), | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |  | ||||||
|  |         let doc1 = documents! {[{ | ||||||
|  |             "id": -228142, | ||||||
|  |             "title": "asdsad", | ||||||
|  |         }]}; | ||||||
|  |  | ||||||
|  |         let doc2 = documents! {[{ | ||||||
|  |             "id": 228143.56, | ||||||
|  |             "title": "something", | ||||||
|  |         }]}; | ||||||
|  |  | ||||||
|  |         let doc3 = documents! {[{ | ||||||
|  |             "id": -228143.56, | ||||||
|  |             "title": "something", | ||||||
|  |         }]}; | ||||||
|  |  | ||||||
|  |         let doc4 = documents! {[{ | ||||||
|  |             "id": 2.0, | ||||||
|  |             "title": "something", | ||||||
|  |         }]}; | ||||||
|  |  | ||||||
|  |         let (builder, user_error) = builder.add_documents(doc1).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|  |         let (builder, user_error) = builder.add_documents(doc2).unwrap(); | ||||||
|  |         assert!(user_error.is_err()); | ||||||
|  |         let (builder, user_error) = builder.add_documents(doc3).unwrap(); | ||||||
|  |         assert!(user_error.is_err()); | ||||||
|  |         let (_builder, user_error) = builder.add_documents(doc4).unwrap(); | ||||||
|  |         assert!(user_error.is_err()); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -9,12 +9,12 @@ use heed::RoTxn; | |||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use obkv::{KvReader, KvWriter}; | use obkv::{KvReader, KvWriter}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::{Map, Value}; | use serde_json::Value; | ||||||
| use smartstring::SmartString; | use smartstring::SmartString; | ||||||
|  |  | ||||||
| use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; | use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; | ||||||
| use super::{IndexDocumentsMethod, IndexerConfig}; | use super::{IndexDocumentsMethod, IndexerConfig}; | ||||||
| use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; | use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | ||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| use crate::index::db_name; | use crate::index::db_name; | ||||||
| use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||||
| @@ -23,8 +23,6 @@ use crate::{ | |||||||
|     Result, BEU32, |     Result, BEU32, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; |  | ||||||
|  |  | ||||||
| pub struct TransformOutput { | pub struct TransformOutput { | ||||||
|     pub primary_key: String, |     pub primary_key: String, | ||||||
|     pub fields_ids_map: FieldsIdsMap, |     pub fields_ids_map: FieldsIdsMap, | ||||||
| @@ -84,18 +82,6 @@ fn create_fields_mapping( | |||||||
|         .collect() |         .collect() | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. |  | ||||||
| /// It doesn't look in the subfield because we don't want to enable the |  | ||||||
| /// primary key inference on nested objects. |  | ||||||
| fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { |  | ||||||
|     index |  | ||||||
|         .iter() |  | ||||||
|         .sorted_by_key(|(k, _)| *k) |  | ||||||
|         .map(|(_, v)| v) |  | ||||||
|         .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)) |  | ||||||
|         .map(String::as_str) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a, 'i> Transform<'a, 'i> { | impl<'a, 'i> Transform<'a, 'i> { | ||||||
|     pub fn new( |     pub fn new( | ||||||
|         wtxn: &mut heed::RwTxn, |         wtxn: &mut heed::RwTxn, | ||||||
| @@ -152,7 +138,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|     pub fn read_documents<R, F>( |     pub fn read_documents<R, F>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         mut reader: DocumentBatchReader<R>, |         reader: EnrichedDocumentsBatchReader<R>, | ||||||
|         wtxn: &mut heed::RwTxn, |         wtxn: &mut heed::RwTxn, | ||||||
|         progress_callback: F, |         progress_callback: F, | ||||||
|     ) -> Result<usize> |     ) -> Result<usize> | ||||||
| @@ -160,33 +146,25 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         R: Read + Seek, |         R: Read + Seek, | ||||||
|         F: Fn(UpdateIndexingStep) + Sync, |         F: Fn(UpdateIndexingStep) + Sync, | ||||||
|     { |     { | ||||||
|         let fields_index = reader.index(); |         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); | ||||||
|  |  | ||||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||||
|  |  | ||||||
|         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; |         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; | ||||||
|  |  | ||||||
|         let alternative_name = self |         let primary_key = cursor.primary_key().to_string(); | ||||||
|             .index |         let primary_key_id = | ||||||
|             .primary_key(wtxn)? |             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|             .or_else(|| find_primary_key(fields_index)) |  | ||||||
|             .map(String::from); |  | ||||||
|  |  | ||||||
|         let (primary_key_id, primary_key_name) = compute_primary_key_pair( |  | ||||||
|             self.index.primary_key(wtxn)?, |  | ||||||
|             &mut self.fields_ids_map, |  | ||||||
|             alternative_name, |  | ||||||
|             self.autogenerate_docids, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         let primary_key_id_nested = primary_key_name.contains('.'); |  | ||||||
|  |  | ||||||
|         let mut flattened_document = None; |  | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut obkv_buffer = Vec::new(); | ||||||
|         let mut flattened_obkv_buffer = Vec::new(); |  | ||||||
|         let mut documents_count = 0; |         let mut documents_count = 0; | ||||||
|         let mut external_id_buffer = Vec::new(); |         let mut docid_buffer: Vec<u8> = Vec::new(); | ||||||
|         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); |         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); | ||||||
|         while let Some((addition_index, document)) = reader.next_document_with_index()? { |         while let Some(enriched_document) = cursor.next_enriched_document()? { | ||||||
|  |             let EnrichedDocument { document, document_id } = enriched_document; | ||||||
|  |  | ||||||
|  |             // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer | ||||||
|  |             // does not keep references from the cursor between loop iterations | ||||||
|             let mut field_buffer_cache = drop_and_reuse(field_buffer); |             let mut field_buffer_cache = drop_and_reuse(field_buffer); | ||||||
|             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { |             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { | ||||||
|                 progress_callback(UpdateIndexingStep::RemapDocumentAddition { |                 progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||||
| @@ -194,52 +172,21 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                 }); |                 }); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|  |             // When the document id has been auto-generated by the `enrich_documents_batch` | ||||||
|  |             // we must insert this document id into the remaped document. | ||||||
|  |             let external_id = document_id.value(); | ||||||
|  |             if document_id.is_generated() { | ||||||
|  |                 serde_json::to_writer(&mut docid_buffer, external_id) | ||||||
|  |                     .map_err(InternalError::SerdeJson)?; | ||||||
|  |                 field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); | ||||||
|  |             } | ||||||
|  |  | ||||||
|             for (k, v) in document.iter() { |             for (k, v) in document.iter() { | ||||||
|                 let mapped_id = |                 let mapped_id = | ||||||
|                     *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; |                     *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; | ||||||
|                 field_buffer_cache.push((mapped_id, Cow::from(v))); |                 field_buffer_cache.push((mapped_id, Cow::from(v))); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // We need to make sure that every document has a primary key. After we have remapped |  | ||||||
|             // all the fields in the document, we try to find the primary key value. If we can find |  | ||||||
|             // it, transform it into a string and validate it, and then update it in the |  | ||||||
|             // document. If none is found, and we were told to generate missing document ids, then |  | ||||||
|             // we create the missing field, and update the new document. |  | ||||||
|             let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; |  | ||||||
|             let external_id = if primary_key_id_nested { |  | ||||||
|                 let mut field_buffer_cache = field_buffer_cache.clone(); |  | ||||||
|                 self.flatten_from_field_mapping( |  | ||||||
|                     &mapping, |  | ||||||
|                     &document, |  | ||||||
|                     &mut flattened_obkv_buffer, |  | ||||||
|                     &mut field_buffer_cache, |  | ||||||
|                 )?; |  | ||||||
|                 flattened_document = Some(&flattened_obkv_buffer); |  | ||||||
|                 let document = KvReader::new(&flattened_obkv_buffer); |  | ||||||
|  |  | ||||||
|                 update_primary_key( |  | ||||||
|                     document, |  | ||||||
|                     &addition_index, |  | ||||||
|                     primary_key_id, |  | ||||||
|                     &primary_key_name, |  | ||||||
|                     &mut uuid_buffer, |  | ||||||
|                     &mut field_buffer_cache, |  | ||||||
|                     &mut external_id_buffer, |  | ||||||
|                     self.autogenerate_docids, |  | ||||||
|                 )? |  | ||||||
|             } else { |  | ||||||
|                 update_primary_key( |  | ||||||
|                     document, |  | ||||||
|                     &addition_index, |  | ||||||
|                     primary_key_id, |  | ||||||
|                     &primary_key_name, |  | ||||||
|                     &mut uuid_buffer, |  | ||||||
|                     &mut field_buffer_cache, |  | ||||||
|                     &mut external_id_buffer, |  | ||||||
|                     self.autogenerate_docids, |  | ||||||
|                 )? |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             // Insertion in a obkv need to be done with keys ordered. For now they are ordered |             // Insertion in a obkv need to be done with keys ordered. For now they are ordered | ||||||
|             // according to the document addition key order, so we sort it according to the |             // according to the document addition key order, so we sort it according to the | ||||||
|             // fieldids map keys order. |             // fieldids map keys order. | ||||||
| @@ -294,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             // We use the extracted/generated user id as the key for this document. |             // We use the extracted/generated user id as the key for this document. | ||||||
|             self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; |             self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; | ||||||
|             documents_count += 1; |             documents_count += 1; | ||||||
|  |  | ||||||
|             if let Some(flatten) = flattened_document { |  | ||||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; |  | ||||||
|             } else { |  | ||||||
|             match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { |             match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { | ||||||
|                 Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, |                 Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, | ||||||
|                     None => { |                 None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, | ||||||
|                         self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             progress_callback(UpdateIndexingStep::RemapDocumentAddition { |             progress_callback(UpdateIndexingStep::RemapDocumentAddition { | ||||||
| @@ -313,7 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             field_buffer = drop_and_reuse(field_buffer_cache); |             field_buffer = drop_and_reuse(field_buffer_cache); | ||||||
|             external_id_buffer.clear(); |             docid_buffer.clear(); | ||||||
|             obkv_buffer.clear(); |             obkv_buffer.clear(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -322,7 +263,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; |         self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; | ||||||
|         self.index.put_primary_key(wtxn, &primary_key_name)?; |         self.index.put_primary_key(wtxn, &primary_key)?; | ||||||
|         self.documents_count += documents_count; |         self.documents_count += documents_count; | ||||||
|         // Now that we have a valid sorter that contains the user id and the obkv we |         // Now that we have a valid sorter that contains the user id and the obkv we | ||||||
|         // give it to the last transforming function which returns the TransformOutput. |         // give it to the last transforming function which returns the TransformOutput. | ||||||
| @@ -384,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         Ok(Some(buffer)) |         Ok(Some(buffer)) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Flatten a document from a field mapping generated by [create_fields_mapping] |  | ||||||
|     fn flatten_from_field_mapping( |  | ||||||
|         &mut self, |  | ||||||
|         mapping: &HashMap<FieldId, FieldId>, |  | ||||||
|         obkv: &KvReader<FieldId>, |  | ||||||
|         output_buffer: &mut Vec<u8>, |  | ||||||
|         field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         // store the keys and values of the json + the original obkv |  | ||||||
|         let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); |  | ||||||
|  |  | ||||||
|         // if the primary_key is nested we need to flatten the document before being able to do anything |  | ||||||
|         let mut doc = serde_json::Map::new(); |  | ||||||
|  |  | ||||||
|         // we recreate a json containing only the fields that needs to be flattened. |  | ||||||
|         // all the raw values get inserted directly in the `key_value` vec. |  | ||||||
|         for (key, value) in obkv.iter() { |  | ||||||
|             if json_depth_checker::should_flatten_from_unchecked_slice(value) { |  | ||||||
|                 let key = |  | ||||||
|                     mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?; |  | ||||||
|                 let key = |  | ||||||
|                     self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { |  | ||||||
|                         field_id: *key, |  | ||||||
|                         process: "Flatten from field mapping.", |  | ||||||
|                     })?; |  | ||||||
|                 let value = serde_json::from_slice::<serde_json::Value>(value) |  | ||||||
|                     .map_err(InternalError::SerdeJson)?; |  | ||||||
|                 doc.insert(key.to_string(), value); |  | ||||||
|             } else { |  | ||||||
|                 key_value.push((key, value.into())); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let flattened = flatten_serde_json::flatten(&doc); |  | ||||||
|  |  | ||||||
|         // Once we have the flattened version we insert all the new generated fields_ids |  | ||||||
|         // (if any) in the fields ids map and serialize the value. |  | ||||||
|         for (key, value) in flattened.into_iter() { |  | ||||||
|             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; |  | ||||||
|             key_value.push((fid, value.clone().into())); |  | ||||||
|  |  | ||||||
|             if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { |  | ||||||
|                 field_buffer_cache.push((fid, value.into())); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // we sort the key. If there was a conflict between the obkv and the new generated value the |  | ||||||
|         // keys will be consecutive. |  | ||||||
|         key_value.sort_unstable_by_key(|(key, _)| *key); |  | ||||||
|  |  | ||||||
|         Self::create_obkv_from_key_value(&mut key_value, output_buffer)?; |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Generate an obkv from a slice of key / value sorted by key. |     /// Generate an obkv from a slice of key / value sorted by key. | ||||||
|     fn create_obkv_from_key_value( |     fn create_obkv_from_key_value( | ||||||
|         key_value: &mut [(FieldId, Cow<[u8]>)], |         key_value: &mut [(FieldId, Cow<[u8]>)], | ||||||
| @@ -744,50 +630,6 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name) |  | ||||||
| /// for the primary key according to the following rules: |  | ||||||
| /// - if primary_key is `Some`, returns the id and the name, else |  | ||||||
| /// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else |  | ||||||
| /// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and |  | ||||||
| /// returns the pair, else |  | ||||||
| /// - returns an error. |  | ||||||
| fn compute_primary_key_pair( |  | ||||||
|     primary_key: Option<&str>, |  | ||||||
|     fields_ids_map: &mut FieldsIdsMap, |  | ||||||
|     alternative_name: Option<String>, |  | ||||||
|     autogenerate_docids: bool, |  | ||||||
| ) -> Result<(FieldId, String)> { |  | ||||||
|     match primary_key { |  | ||||||
|         Some(primary_key) => { |  | ||||||
|             let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|             Ok((id, primary_key.to_string())) |  | ||||||
|         } |  | ||||||
|         None => { |  | ||||||
|             let name = match alternative_name { |  | ||||||
|                 Some(key) => key, |  | ||||||
|                 None => { |  | ||||||
|                     if !autogenerate_docids { |  | ||||||
|                         // If there is no primary key in the current document batch, we must |  | ||||||
|                         // return an error and not automatically generate any document id. |  | ||||||
|                         return Err(UserError::MissingPrimaryKey.into()); |  | ||||||
|                     } |  | ||||||
|                     DEFAULT_PRIMARY_KEY_NAME.to_string() |  | ||||||
|                 } |  | ||||||
|             }; |  | ||||||
|             let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|             Ok((id, name)) |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn validate_document_id(document_id: &str) -> Option<&str> { |  | ||||||
|     let document_id = document_id.trim(); |  | ||||||
|     Some(document_id).filter(|id| { |  | ||||||
|         !id.is_empty() |  | ||||||
|             && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) |  | ||||||
|     }) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`. | /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`. | ||||||
| /// | /// | ||||||
| /// The size and alignment of T and U must match. | /// The size and alignment of T and U must match. | ||||||
| @@ -799,63 +641,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> { | |||||||
|     vec.into_iter().map(|_| unreachable!()).collect() |     vec.into_iter().map(|_| unreachable!()).collect() | ||||||
| } | } | ||||||
|  |  | ||||||
| fn update_primary_key<'a>( |  | ||||||
|     document: KvReader<'a, FieldId>, |  | ||||||
|     addition_index: &DocumentsBatchIndex, |  | ||||||
|     primary_key_id: FieldId, |  | ||||||
|     primary_key_name: &str, |  | ||||||
|     uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], |  | ||||||
|     field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, |  | ||||||
|     mut external_id_buffer: &'a mut Vec<u8>, |  | ||||||
|     autogenerate_docids: bool, |  | ||||||
| ) -> Result<Cow<'a, str>> { |  | ||||||
|     match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { |  | ||||||
|         Some((_, bytes)) => { |  | ||||||
|             let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { |  | ||||||
|                 Value::String(string) => match validate_document_id(&string) { |  | ||||||
|                     Some(s) if s.len() == string.len() => string, |  | ||||||
|                     Some(s) => s.to_string(), |  | ||||||
|                     None => { |  | ||||||
|                         return Err(UserError::InvalidDocumentId { |  | ||||||
|                             document_id: Value::String(string), |  | ||||||
|                         } |  | ||||||
|                         .into()) |  | ||||||
|                     } |  | ||||||
|                 }, |  | ||||||
|                 Value::Number(number) => number.to_string(), |  | ||||||
|                 content => { |  | ||||||
|                     return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into()) |  | ||||||
|                 } |  | ||||||
|             }; |  | ||||||
|             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; |  | ||||||
|             Ok(Cow::Owned(value)) |  | ||||||
|         } |  | ||||||
|         None if autogenerate_docids => { |  | ||||||
|             let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); |  | ||||||
|             serde_json::to_writer(&mut external_id_buffer, &uuid) |  | ||||||
|                 .map_err(InternalError::SerdeJson)?; |  | ||||||
|             field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); |  | ||||||
|             Ok(Cow::Borrowed(&*uuid)) |  | ||||||
|         } |  | ||||||
|         None => { |  | ||||||
|             let mut json = Map::new(); |  | ||||||
|             for (key, value) in document.iter() { |  | ||||||
|                 let key = addition_index.name(key).cloned(); |  | ||||||
|                 let value = serde_json::from_slice::<Value>(&value).ok(); |  | ||||||
|  |  | ||||||
|                 if let Some((k, v)) = key.zip(value) { |  | ||||||
|                     json.insert(k, v); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             Err(UserError::MissingDocumentId { |  | ||||||
|                 primary_key: primary_key_name.to_string(), |  | ||||||
|                 document: json, |  | ||||||
|             })? |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl TransformOutput { | impl TransformOutput { | ||||||
|     // find and insert the new field ids |     // find and insert the new field ids | ||||||
|     pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> { |     pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> { | ||||||
| @@ -869,88 +654,3 @@ impl TransformOutput { | |||||||
|             .collect()) |             .collect()) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod test { |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     mod compute_primary_key { |  | ||||||
|         use big_s::S; |  | ||||||
|  |  | ||||||
|         use super::{compute_primary_key_pair, FieldsIdsMap}; |  | ||||||
|  |  | ||||||
|         #[test] |  | ||||||
|         fn should_return_primary_key_if_is_some() { |  | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |  | ||||||
|             fields_map.insert("toto").unwrap(); |  | ||||||
|             let result = compute_primary_key_pair( |  | ||||||
|                 Some("toto"), |  | ||||||
|                 &mut fields_map, |  | ||||||
|                 Some("tata".to_string()), |  | ||||||
|                 false, |  | ||||||
|             ); |  | ||||||
|             assert_eq!(result.unwrap(), (0, "toto".to_string())); |  | ||||||
|             assert_eq!(fields_map.len(), 1); |  | ||||||
|  |  | ||||||
|             // and with nested fields |  | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |  | ||||||
|             fields_map.insert("toto.tata").unwrap(); |  | ||||||
|             let result = compute_primary_key_pair( |  | ||||||
|                 Some("toto.tata"), |  | ||||||
|                 &mut fields_map, |  | ||||||
|                 Some(S("titi")), |  | ||||||
|                 false, |  | ||||||
|             ); |  | ||||||
|             assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); |  | ||||||
|             assert_eq!(fields_map.len(), 1); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         #[test] |  | ||||||
|         fn should_return_alternative_if_primary_is_none() { |  | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |  | ||||||
|             let result = |  | ||||||
|                 compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); |  | ||||||
|             assert_eq!(result.unwrap(), (0, S("tata"))); |  | ||||||
|             assert_eq!(fields_map.len(), 1); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         #[test] |  | ||||||
|         fn should_return_default_if_both_are_none() { |  | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |  | ||||||
|             let result = compute_primary_key_pair(None, &mut fields_map, None, true); |  | ||||||
|             assert_eq!(result.unwrap(), (0, S("id"))); |  | ||||||
|             assert_eq!(fields_map.len(), 1); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         #[test] |  | ||||||
|         fn should_return_err_if_both_are_none_and_recompute_is_false() { |  | ||||||
|             let mut fields_map = FieldsIdsMap::new(); |  | ||||||
|             let result = compute_primary_key_pair(None, &mut fields_map, None, false); |  | ||||||
|             assert!(result.is_err()); |  | ||||||
|             assert_eq!(fields_map.len(), 0); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     mod primary_key_inference { |  | ||||||
|         use big_s::S; |  | ||||||
|         use bimap::BiHashMap; |  | ||||||
|  |  | ||||||
|         use crate::documents::DocumentsBatchIndex; |  | ||||||
|         use crate::update::index_documents::transform::find_primary_key; |  | ||||||
|  |  | ||||||
|         #[test] |  | ||||||
|         fn primary_key_infered_on_first_field() { |  | ||||||
|             // We run the test multiple times to change the order in which the fields are iterated upon. |  | ||||||
|             for _ in 1..50 { |  | ||||||
|                 let mut map = BiHashMap::new(); |  | ||||||
|                 map.insert(1, S("fakeId")); |  | ||||||
|                 map.insert(2, S("fakeId")); |  | ||||||
|                 map.insert(3, S("fakeId")); |  | ||||||
|                 map.insert(4, S("fakeId")); |  | ||||||
|                 map.insert(0, S("realId")); |  | ||||||
|  |  | ||||||
|                 assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|   | |||||||
| @@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments; | |||||||
| pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; | pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; | ||||||
| pub use self::facets::Facets; | pub use self::facets::Facets; | ||||||
| pub use self::index_documents::{ | pub use self::index_documents::{ | ||||||
|     DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, |     DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, | ||||||
| }; | }; | ||||||
| pub use self::indexer_config::IndexerConfig; | pub use self::indexer_config::IndexerConfig; | ||||||
| pub use self::settings::{Setting, Settings}; | pub use self::settings::{Setting, Settings}; | ||||||
|   | |||||||
| @@ -735,10 +735,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = IndexDocumentsConfig::default(); |         let indexing_config = IndexDocumentsConfig::default(); | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -798,10 +799,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -850,10 +852,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -880,10 +883,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // In the same transaction we change the displayed fields to be only the age. |         // In the same transaction we change the displayed fields to be only the age. | ||||||
| @@ -934,10 +938,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -974,10 +979,11 @@ mod tests { | |||||||
|  |  | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1016,10 +1022,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1067,10 +1074,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1110,10 +1118,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1142,10 +1151,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1172,10 +1182,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // In the same transaction we provide some stop_words |         // In the same transaction we provide some stop_words | ||||||
| @@ -1251,10 +1262,11 @@ mod tests { | |||||||
|         let config = IndexerConfig::default(); |         let config = IndexerConfig::default(); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|  |  | ||||||
|         // In the same transaction provide some synonyms |         // In the same transaction provide some synonyms | ||||||
| @@ -1389,10 +1401,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
| @@ -1452,10 +1465,11 @@ mod tests { | |||||||
|         ]); |         ]); | ||||||
|         let indexing_config = |         let indexing_config = | ||||||
|             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |             IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|         let mut builder = |         let builder = | ||||||
|             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) |             IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) | ||||||
|                 .unwrap(); |                 .unwrap(); | ||||||
|         builder.add_documents(content).unwrap(); |         let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |         user_error.unwrap(); | ||||||
|         builder.execute().unwrap(); |         builder.execute().unwrap(); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,9 +3,10 @@ use std::io::Cursor; | |||||||
| use big_s::S; | use big_s::S; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use maplit::hashset; | use maplit::hashset; | ||||||
| use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | ||||||
| use milli::{FacetDistribution, Index}; | use milli::{FacetDistribution, Index, Object}; | ||||||
|  | use serde_json::Deserializer; | ||||||
|  |  | ||||||
| #[test] | #[test] | ||||||
| fn test_facet_distribution_with_no_facet_values() { | fn test_facet_distribution_with_no_facet_values() { | ||||||
| @@ -28,38 +29,33 @@ fn test_facet_distribution_with_no_facet_values() { | |||||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; |     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|  |  | ||||||
|     let mut builder = |     let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |     let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut cursor = Cursor::new(Vec::new()); |  | ||||||
|     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|     let reader = Cursor::new( |     let reader = Cursor::new( | ||||||
|         r#"[ |         r#"{ | ||||||
|         { |  | ||||||
|             "id": 123, |             "id": 123, | ||||||
|             "title": "What a week, hu...", |             "title": "What a week, hu...", | ||||||
|             "genres": [], |             "genres": [], | ||||||
|             "tags": ["blue"] |             "tags": ["blue"] | ||||||
|         }, |         } | ||||||
|         { |         { | ||||||
|             "id": 345, |             "id": 345, | ||||||
|             "title": "I am the pig!", |             "title": "I am the pig!", | ||||||
|             "tags": ["red"] |             "tags": ["red"] | ||||||
|         } |         }"#, | ||||||
|     ]"#, |  | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() { |     for result in Deserializer::from_reader(reader).into_iter::<Object>() { | ||||||
|         let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); |         let object = result.unwrap(); | ||||||
|         documents_builder.extend_from_json(doc).unwrap(); |         documents_builder.append_json_object(&object).unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     documents_builder.finish().unwrap(); |     let vector = documents_builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|     cursor.set_position(0); |  | ||||||
|  |  | ||||||
|     // index documents |     // index documents | ||||||
|     let content = DocumentBatchReader::from_reader(cursor).unwrap(); |     let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|     builder.add_documents(content).unwrap(); |     let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |     user_error.unwrap(); | ||||||
|     builder.execute().unwrap(); |     builder.execute().unwrap(); | ||||||
|  |  | ||||||
|     wtxn.commit().unwrap(); |     wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -6,10 +6,11 @@ use big_s::S; | |||||||
| use either::{Either, Left, Right}; | use either::{Either, Left, Right}; | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use maplit::{hashmap, hashset}; | use maplit::{hashmap, hashset}; | ||||||
| use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | ||||||
| use milli::{AscDesc, Criterion, DocumentId, Index, Member}; | use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; | ||||||
| use serde::Deserialize; | use serde::Deserialize; | ||||||
|  | use serde_json::Deserializer; | ||||||
| use slice_group_by::GroupBy; | use slice_group_by::GroupBy; | ||||||
|  |  | ||||||
| mod distinct; | mod distinct; | ||||||
| @@ -60,24 +61,21 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | |||||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; |     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|  |  | ||||||
|     let mut builder = |     let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |     let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut cursor = Cursor::new(Vec::new()); |  | ||||||
|     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|     let reader = Cursor::new(CONTENT.as_bytes()); |     let reader = Cursor::new(CONTENT.as_bytes()); | ||||||
|  |  | ||||||
|     for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() { |     for result in Deserializer::from_reader(reader).into_iter::<Object>() { | ||||||
|         let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); |         let object = result.unwrap(); | ||||||
|         documents_builder.extend_from_json(doc).unwrap(); |         documents_builder.append_json_object(&object).unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     documents_builder.finish().unwrap(); |     let vector = documents_builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|     cursor.set_position(0); |  | ||||||
|  |  | ||||||
|     // index documents |     // index documents | ||||||
|     let content = DocumentBatchReader::from_reader(cursor).unwrap(); |     let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|     builder.add_documents(content).unwrap(); |     let (builder, user_error) = builder.add_documents(content).unwrap(); | ||||||
|  |     user_error.unwrap(); | ||||||
|     builder.execute().unwrap(); |     builder.execute().unwrap(); | ||||||
|  |  | ||||||
|     wtxn.commit().unwrap(); |     wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -5,7 +5,7 @@ use big_s::S; | |||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use maplit::hashset; | use maplit::hashset; | ||||||
| use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
| use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | ||||||
| use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; | use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; | ||||||
| use rand::Rng; | use rand::Rng; | ||||||
| @@ -390,11 +390,9 @@ fn criteria_ascdesc() { | |||||||
|     // index documents |     // index documents | ||||||
|     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; |     let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; | ||||||
|     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; |     let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; | ||||||
|     let mut builder = |     let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); |  | ||||||
|  |  | ||||||
|     let mut cursor = Cursor::new(Vec::new()); |     let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); | ||||||
|     let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); |  | ||||||
|  |  | ||||||
|     (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { |     (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { | ||||||
|         let mut rng = rand::thread_rng(); |         let mut rng = rand::thread_rng(); | ||||||
| @@ -412,17 +410,19 @@ fn criteria_ascdesc() { | |||||||
|             "age": age, |             "age": age, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         let json = Cursor::new(serde_json::to_vec(&json).unwrap()); |         let object = match json { | ||||||
|         batch_builder.extend_from_json(json).unwrap(); |             serde_json::Value::Object(object) => object, | ||||||
|  |             _ => panic!(), | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         batch_builder.append_json_object(&object).unwrap(); | ||||||
|     }); |     }); | ||||||
|  |  | ||||||
|     batch_builder.finish().unwrap(); |     let vector = batch_builder.into_inner().unwrap(); | ||||||
|  |  | ||||||
|     cursor.set_position(0); |     let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); | ||||||
|  |     let (builder, user_error) = builder.add_documents(reader).unwrap(); | ||||||
|     let reader = DocumentBatchReader::from_reader(cursor).unwrap(); |     user_error.unwrap(); | ||||||
|  |  | ||||||
|     builder.add_documents(reader).unwrap(); |  | ||||||
|     builder.execute().unwrap(); |     builder.execute().unwrap(); | ||||||
|  |  | ||||||
|     wtxn.commit().unwrap(); |     wtxn.commit().unwrap(); | ||||||
|   | |||||||
| @@ -106,35 +106,31 @@ fn test_typo_disabled_on_word() { | |||||||
|     options.map_size(4096 * 100); |     options.map_size(4096 * 100); | ||||||
|     let index = Index::new(options, tmp.path()).unwrap(); |     let index = Index::new(options, tmp.path()).unwrap(); | ||||||
|  |  | ||||||
|     let documents = json!([ |     let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); | ||||||
|         { |     let doc1 = json!({ | ||||||
|         "id": 1usize, |         "id": 1usize, | ||||||
|         "data": "zealand", |         "data": "zealand", | ||||||
|         }, |     }); | ||||||
|         { |  | ||||||
|  |     let doc2 = json!({ | ||||||
|         "id": 2usize, |         "id": 2usize, | ||||||
|         "data": "zearand", |         "data": "zearand", | ||||||
|         }, |     }); | ||||||
|     ]); |  | ||||||
|  |  | ||||||
|     let mut writer = std::io::Cursor::new(Vec::new()); |     builder.append_json_object(doc1.as_object().unwrap()).unwrap(); | ||||||
|     let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); |     builder.append_json_object(doc2.as_object().unwrap()).unwrap(); | ||||||
|     let documents = serde_json::to_vec(&documents).unwrap(); |     let vector = builder.into_inner().unwrap(); | ||||||
|     builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); |  | ||||||
|     builder.finish().unwrap(); |  | ||||||
|  |  | ||||||
|     writer.set_position(0); |     let documents = | ||||||
|  |         milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap(); | ||||||
|     let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); |  | ||||||
|  |  | ||||||
|     let mut txn = index.write_txn().unwrap(); |     let mut txn = index.write_txn().unwrap(); | ||||||
|     let config = IndexerConfig::default(); |     let config = IndexerConfig::default(); | ||||||
|     let indexing_config = IndexDocumentsConfig::default(); |     let indexing_config = IndexDocumentsConfig::default(); | ||||||
|     let mut builder = |     let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); | ||||||
|         IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); |  | ||||||
|  |  | ||||||
|     builder.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|  |     let (builder, user_error) = builder.add_documents(documents).unwrap(); | ||||||
|  |     user_error.unwrap(); | ||||||
|     builder.execute().unwrap(); |     builder.execute().unwrap(); | ||||||
|     txn.commit().unwrap(); |     txn.commit().unwrap(); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user