mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge branch 'main' into tmp-release-v1.5.0
This commit is contained in:
		
							
								
								
									
										3
									
								
								.github/workflows/benchmarks-pr.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/benchmarks-pr.yml
									
									
									
									
										vendored
									
									
								
							| @@ -90,7 +90,8 @@ jobs: | |||||||
|           set -x |           set -x | ||||||
|           export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8) |           export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8) | ||||||
|           export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json) |           export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json) | ||||||
|           echo 'Here are your benchmarks diff 👊' >> body.txt |           export bench_name=$(echo ${{ steps.command.outputs.command-arguments }}) | ||||||
|  |           echo "Here are your $bench_name benchmarks diff 👊" >> body.txt | ||||||
|           echo '```' >> body.txt |           echo '```' >> body.txt | ||||||
|           ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt |           ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt | ||||||
|           echo '```' >> body.txt |           echo '```' >> body.txt | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								.github/workflows/publish-apt-brew-pkg.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/publish-apt-brew-pkg.yml
									
									
									
									
										vendored
									
									
								
							| @@ -50,7 +50,7 @@ jobs: | |||||||
|     needs: check-version |     needs: check-version | ||||||
|     steps: |     steps: | ||||||
|       - name: Create PR to Homebrew |       - name: Create PR to Homebrew | ||||||
|         uses: mislav/bump-homebrew-formula-action@v2 |         uses: mislav/bump-homebrew-formula-action@v3 | ||||||
|         with: |         with: | ||||||
|           formula-name: meilisearch |           formula-name: meilisearch | ||||||
|           formula-path: Formula/m/meilisearch.rb |           formula-path: Formula/m/meilisearch.rb | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								.github/workflows/publish-docker-images.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/publish-docker-images.yml
									
									
									
									
										vendored
									
									
								
							| @@ -63,7 +63,7 @@ jobs: | |||||||
|         uses: docker/setup-buildx-action@v3 |         uses: docker/setup-buildx-action@v3 | ||||||
|  |  | ||||||
|       - name: Login to Docker Hub |       - name: Login to Docker Hub | ||||||
|         uses: docker/login-action@v2 |         uses: docker/login-action@v3 | ||||||
|         with: |         with: | ||||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} |           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||||
|           password: ${{ secrets.DOCKERHUB_TOKEN }} |           password: ${{ secrets.DOCKERHUB_TOKEN }} | ||||||
|   | |||||||
							
								
								
									
										4
									
								
								.github/workflows/sdks-tests.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								.github/workflows/sdks-tests.yml
									
									
									
									
										vendored
									
									
								
							| @@ -160,7 +160,7 @@ jobs: | |||||||
|         with: |         with: | ||||||
|           repository: meilisearch/meilisearch-js |           repository: meilisearch/meilisearch-js | ||||||
|       - name: Setup node |       - name: Setup node | ||||||
|         uses: actions/setup-node@v3 |         uses: actions/setup-node@v4 | ||||||
|         with: |         with: | ||||||
|           cache: 'yarn' |           cache: 'yarn' | ||||||
|       - name: Install dependencies |       - name: Install dependencies | ||||||
| @@ -318,7 +318,7 @@ jobs: | |||||||
|         with: |         with: | ||||||
|           repository: meilisearch/meilisearch-js-plugins |           repository: meilisearch/meilisearch-js-plugins | ||||||
|       - name: Setup node |       - name: Setup node | ||||||
|         uses: actions/setup-node@v3 |         uses: actions/setup-node@v4 | ||||||
|         with: |         with: | ||||||
|           cache: yarn |           cache: yarn | ||||||
|       - name: Install dependencies |       - name: Install dependencies | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								.github/workflows/test-suite.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										10
									
								
								.github/workflows/test-suite.yml
									
									
									
									
										vendored
									
									
								
							| @@ -43,7 +43,7 @@ jobs: | |||||||
|           toolchain: nightly |           toolchain: nightly | ||||||
|           override: true |           override: true | ||||||
|       - name: Cache dependencies |       - name: Cache dependencies | ||||||
|         uses: Swatinem/rust-cache@v2.6.2 |         uses: Swatinem/rust-cache@v2.7.1 | ||||||
|       - name: Run cargo check without any default features |       - name: Run cargo check without any default features | ||||||
|         uses: actions-rs/cargo@v1 |         uses: actions-rs/cargo@v1 | ||||||
|         with: |         with: | ||||||
| @@ -65,7 +65,7 @@ jobs: | |||||||
|     steps: |     steps: | ||||||
|       - uses: actions/checkout@v3 |       - uses: actions/checkout@v3 | ||||||
|       - name: Cache dependencies |       - name: Cache dependencies | ||||||
|         uses: Swatinem/rust-cache@v2.6.2 |         uses: Swatinem/rust-cache@v2.7.1 | ||||||
|       - name: Run cargo check without any default features |       - name: Run cargo check without any default features | ||||||
|         uses: actions-rs/cargo@v1 |         uses: actions-rs/cargo@v1 | ||||||
|         with: |         with: | ||||||
| @@ -149,7 +149,7 @@ jobs: | |||||||
|           toolchain: stable |           toolchain: stable | ||||||
|           override: true |           override: true | ||||||
|       - name: Cache dependencies |       - name: Cache dependencies | ||||||
|         uses: Swatinem/rust-cache@v2.6.2 |         uses: Swatinem/rust-cache@v2.7.1 | ||||||
|       - name: Run tests in debug |       - name: Run tests in debug | ||||||
|         uses: actions-rs/cargo@v1 |         uses: actions-rs/cargo@v1 | ||||||
|         with: |         with: | ||||||
| @@ -168,7 +168,7 @@ jobs: | |||||||
|           override: true |           override: true | ||||||
|           components: clippy |           components: clippy | ||||||
|       - name: Cache dependencies |       - name: Cache dependencies | ||||||
|         uses: Swatinem/rust-cache@v2.6.2 |         uses: Swatinem/rust-cache@v2.7.1 | ||||||
|       - name: Run cargo clippy |       - name: Run cargo clippy | ||||||
|         uses: actions-rs/cargo@v1 |         uses: actions-rs/cargo@v1 | ||||||
|         with: |         with: | ||||||
| @@ -187,7 +187,7 @@ jobs: | |||||||
|           override: true |           override: true | ||||||
|           components: rustfmt |           components: rustfmt | ||||||
|       - name: Cache dependencies |       - name: Cache dependencies | ||||||
|         uses: Swatinem/rust-cache@v2.6.2 |         uses: Swatinem/rust-cache@v2.7.1 | ||||||
|       - name: Run cargo fmt |       - name: Run cargo fmt | ||||||
|         # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. |         # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. | ||||||
|         # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate |         # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -1731,12 +1731,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "grenad" | name = "grenad" | ||||||
| version = "0.4.4" | version = "0.4.5" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93" | checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bytemuck", |  "bytemuck", | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  |  "rayon", | ||||||
|  "tempfile", |  "tempfile", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| @@ -3281,6 +3282,7 @@ dependencies = [ | |||||||
|  "logging_timer", |  "logging_timer", | ||||||
|  "maplit", |  "maplit", | ||||||
|  "md5", |  "md5", | ||||||
|  |  "meili-snap", | ||||||
|  "memmap2", |  "memmap2", | ||||||
|  "mimalloc", |  "mimalloc", | ||||||
|  "obkv", |  "obkv", | ||||||
| @@ -3443,9 +3445,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "obkv" | name = "obkv" | ||||||
| version = "0.2.0" | version = "0.2.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" | checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "once_cell" | name = "once_cell" | ||||||
|   | |||||||
| @@ -25,12 +25,6 @@ | |||||||
|  |  | ||||||
| <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p> | <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p> | ||||||
|  |  | ||||||
| --- |  | ||||||
|  |  | ||||||
| ### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A! |  | ||||||
|  |  | ||||||
| --- |  | ||||||
|  |  | ||||||
| Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow. | Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow. | ||||||
|  |  | ||||||
| <p align="center" name="demo"> | <p align="center" name="demo"> | ||||||
|   | |||||||
| @@ -6,9 +6,7 @@ use std::path::Path; | |||||||
|  |  | ||||||
| use criterion::{criterion_group, criterion_main, Criterion}; | use criterion::{criterion_group, criterion_main, Criterion}; | ||||||
| use milli::heed::{EnvOpenOptions, RwTxn}; | use milli::heed::{EnvOpenOptions, RwTxn}; | ||||||
| use milli::update::{ | use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; | ||||||
|     DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, |  | ||||||
| }; |  | ||||||
| use milli::Index; | use milli::Index; | ||||||
| use rand::seq::SliceRandom; | use rand::seq::SliceRandom; | ||||||
| use rand_chacha::rand_core::SeedableRng; | use rand_chacha::rand_core::SeedableRng; | ||||||
| @@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { | |||||||
|                 (index, document_ids_to_delete) |                 (index, document_ids_to_delete) | ||||||
|             }, |             }, | ||||||
|             move |(index, document_ids_to_delete)| { |             move |(index, document_ids_to_delete)| { | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 delete_documents_from_ids(index, document_ids_to_delete) | ||||||
|  |  | ||||||
|                 for ids in document_ids_to_delete { |  | ||||||
|                     let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|                     builder.delete_documents(&ids); |  | ||||||
|                     builder.execute().unwrap(); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|                 index.prepare_for_closing().wait(); |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     }); |     }); | ||||||
| @@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { | |||||||
|                 (index, document_ids_to_delete) |                 (index, document_ids_to_delete) | ||||||
|             }, |             }, | ||||||
|             move |(index, document_ids_to_delete)| { |             move |(index, document_ids_to_delete)| { | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 delete_documents_from_ids(index, document_ids_to_delete) | ||||||
|  |  | ||||||
|                 for ids in document_ids_to_delete { |  | ||||||
|                     let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|                     builder.delete_documents(&ids); |  | ||||||
|                     builder.execute().unwrap(); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|                 index.prepare_for_closing().wait(); |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     }); |     }); | ||||||
| @@ -875,22 +853,31 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { | |||||||
|                 (index, document_ids_to_delete) |                 (index, document_ids_to_delete) | ||||||
|             }, |             }, | ||||||
|             move |(index, document_ids_to_delete)| { |             move |(index, document_ids_to_delete)| { | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 delete_documents_from_ids(index, document_ids_to_delete) | ||||||
|  |  | ||||||
|                 for ids in document_ids_to_delete { |  | ||||||
|                     let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|                     builder.delete_documents(&ids); |  | ||||||
|                     builder.execute().unwrap(); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|                 index.prepare_for_closing().wait(); |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) { | ||||||
|  |     let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|  |     let indexer_config = IndexerConfig::default(); | ||||||
|  |     for ids in document_ids_to_delete { | ||||||
|  |         let config = IndexDocumentsConfig::default(); | ||||||
|  |  | ||||||
|  |         let mut builder = | ||||||
|  |             IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false) | ||||||
|  |                 .unwrap(); | ||||||
|  |         (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap(); | ||||||
|  |         builder.execute().unwrap(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |     index.prepare_for_closing().wait(); | ||||||
|  | } | ||||||
|  |  | ||||||
| fn indexing_movies_in_three_batches(c: &mut Criterion) { | fn indexing_movies_in_three_batches(c: &mut Criterion) { | ||||||
|     let mut group = c.benchmark_group("indexing"); |     let mut group = c.benchmark_group("indexing"); | ||||||
|     group.sample_size(BENCHMARK_ITERATION); |     group.sample_size(BENCHMARK_ITERATION); | ||||||
| @@ -1112,17 +1099,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { | |||||||
|                 (index, document_ids_to_delete) |                 (index, document_ids_to_delete) | ||||||
|             }, |             }, | ||||||
|             move |(index, document_ids_to_delete)| { |             move |(index, document_ids_to_delete)| { | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 delete_documents_from_ids(index, document_ids_to_delete) | ||||||
|  |  | ||||||
|                 for ids in document_ids_to_delete { |  | ||||||
|                     let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|                     builder.delete_documents(&ids); |  | ||||||
|                     builder.execute().unwrap(); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|                 index.prepare_for_closing().wait(); |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     }); |     }); | ||||||
| @@ -1338,17 +1315,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { | |||||||
|                 (index, document_ids_to_delete) |                 (index, document_ids_to_delete) | ||||||
|             }, |             }, | ||||||
|             move |(index, document_ids_to_delete)| { |             move |(index, document_ids_to_delete)| { | ||||||
|                 let mut wtxn = index.write_txn().unwrap(); |                 delete_documents_from_ids(index, document_ids_to_delete) | ||||||
|  |  | ||||||
|                 for ids in document_ids_to_delete { |  | ||||||
|                     let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|                     builder.delete_documents(&ids); |  | ||||||
|                     builder.execute().unwrap(); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|                 index.prepare_for_closing().wait(); |  | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|     }); |     }); | ||||||
|   | |||||||
| @@ -526,12 +526,12 @@ pub(crate) mod test { | |||||||
|         assert!(indexes.is_empty()); |         assert!(indexes.is_empty()); | ||||||
|  |  | ||||||
|         // products |         // products | ||||||
|         insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(products.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "products", |           "uid": "products", | ||||||
|           "primaryKey": "sku", |           "primaryKey": "sku", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:22.688964637Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:27:23.951017769Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -541,12 +541,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); | ||||||
|  |  | ||||||
|         // movies |         // movies | ||||||
|         insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(movies.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "movies", |           "uid": "movies", | ||||||
|           "primaryKey": "id", |           "primaryKey": "id", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:22.197788495Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:28:01.93111053Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -571,12 +571,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); | ||||||
|  |  | ||||||
|         // spells |         // spells | ||||||
|         insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(spells.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "dnd_spells", |           "uid": "dnd_spells", | ||||||
|           "primaryKey": "index", |           "primaryKey": "index", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:24.242683494Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:27:24.312809641Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -617,12 +617,12 @@ pub(crate) mod test { | |||||||
|         assert!(indexes.is_empty()); |         assert!(indexes.is_empty()); | ||||||
|  |  | ||||||
|         // products |         // products | ||||||
|         insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(products.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "products", |           "uid": "products", | ||||||
|           "primaryKey": "sku", |           "primaryKey": "sku", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:56.595257Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:58.70348Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -632,12 +632,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); | ||||||
|  |  | ||||||
|         // movies |         // movies | ||||||
|         insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(movies.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "movies", |           "uid": "movies", | ||||||
|           "primaryKey": "id", |           "primaryKey": "id", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:56.192178Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:56.455714Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -647,12 +647,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); | ||||||
|  |  | ||||||
|         // spells |         // spells | ||||||
|         insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(spells.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "dnd_spells", |           "uid": "dnd_spells", | ||||||
|           "primaryKey": "index", |           "primaryKey": "index", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:58.876405Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:59.079906Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,24 +0,0 @@ | |||||||
| --- |  | ||||||
| source: dump/src/reader/mod.rs |  | ||||||
| expression: spells.settings().unwrap() |  | ||||||
| --- |  | ||||||
| { |  | ||||||
|   "displayedAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "searchableAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "filterableAttributes": [], |  | ||||||
|   "sortableAttributes": [], |  | ||||||
|   "rankingRules": [ |  | ||||||
|     "typo", |  | ||||||
|     "words", |  | ||||||
|     "proximity", |  | ||||||
|     "attribute", |  | ||||||
|     "exactness" |  | ||||||
|   ], |  | ||||||
|   "stopWords": [], |  | ||||||
|   "synonyms": {}, |  | ||||||
|   "distinctAttribute": null |  | ||||||
| } |  | ||||||
| @@ -1,38 +0,0 @@ | |||||||
| --- |  | ||||||
| source: dump/src/reader/mod.rs |  | ||||||
| expression: products.settings().unwrap() |  | ||||||
| --- |  | ||||||
| { |  | ||||||
|   "displayedAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "searchableAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "filterableAttributes": [], |  | ||||||
|   "sortableAttributes": [], |  | ||||||
|   "rankingRules": [ |  | ||||||
|     "typo", |  | ||||||
|     "words", |  | ||||||
|     "proximity", |  | ||||||
|     "attribute", |  | ||||||
|     "exactness" |  | ||||||
|   ], |  | ||||||
|   "stopWords": [], |  | ||||||
|   "synonyms": { |  | ||||||
|     "android": [ |  | ||||||
|       "phone", |  | ||||||
|       "smartphone" |  | ||||||
|     ], |  | ||||||
|     "iphone": [ |  | ||||||
|       "phone", |  | ||||||
|       "smartphone" |  | ||||||
|     ], |  | ||||||
|     "phone": [ |  | ||||||
|       "android", |  | ||||||
|       "iphone", |  | ||||||
|       "smartphone" |  | ||||||
|     ] |  | ||||||
|   }, |  | ||||||
|   "distinctAttribute": null |  | ||||||
| } |  | ||||||
| @@ -1,31 +0,0 @@ | |||||||
| --- |  | ||||||
| source: dump/src/reader/mod.rs |  | ||||||
| expression: movies.settings().unwrap() |  | ||||||
| --- |  | ||||||
| { |  | ||||||
|   "displayedAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "searchableAttributes": [ |  | ||||||
|     "*" |  | ||||||
|   ], |  | ||||||
|   "filterableAttributes": [ |  | ||||||
|     "genres", |  | ||||||
|     "id" |  | ||||||
|   ], |  | ||||||
|   "sortableAttributes": [ |  | ||||||
|     "genres", |  | ||||||
|     "id" |  | ||||||
|   ], |  | ||||||
|   "rankingRules": [ |  | ||||||
|     "typo", |  | ||||||
|     "words", |  | ||||||
|     "proximity", |  | ||||||
|     "attribute", |  | ||||||
|     "exactness", |  | ||||||
|     "release_date:asc" |  | ||||||
|   ], |  | ||||||
|   "stopWords": [], |  | ||||||
|   "synonyms": {}, |  | ||||||
|   "distinctAttribute": null |  | ||||||
| } |  | ||||||
| @@ -46,6 +46,7 @@ pub type Checked = settings::Checked; | |||||||
| pub type Unchecked = settings::Unchecked; | pub type Unchecked = settings::Unchecked; | ||||||
|  |  | ||||||
| pub type Task = updates::UpdateEntry; | pub type Task = updates::UpdateEntry; | ||||||
|  | pub type Kind = updates::UpdateMeta; | ||||||
|  |  | ||||||
| // everything related to the errors | // everything related to the errors | ||||||
| pub type ResponseError = errors::ResponseError; | pub type ResponseError = errors::ResponseError; | ||||||
| @@ -107,8 +108,11 @@ impl V2Reader { | |||||||
|     pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> { |     pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> { | ||||||
|         Ok(self.index_uuid.iter().map(|index| -> Result<_> { |         Ok(self.index_uuid.iter().map(|index| -> Result<_> { | ||||||
|             V2IndexReader::new( |             V2IndexReader::new( | ||||||
|                 index.uid.clone(), |  | ||||||
|                 &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)), |                 &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)), | ||||||
|  |                 index, | ||||||
|  |                 BufReader::new( | ||||||
|  |                     File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(), | ||||||
|  |                 ), | ||||||
|             ) |             ) | ||||||
|         })) |         })) | ||||||
|     } |     } | ||||||
| @@ -143,16 +147,41 @@ pub struct V2IndexReader { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl V2IndexReader { | impl V2IndexReader { | ||||||
|     pub fn new(name: String, path: &Path) -> Result<Self> { |     pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> { | ||||||
|         let meta = File::open(path.join("meta.json"))?; |         let meta = File::open(path.join("meta.json"))?; | ||||||
|         let meta: DumpMeta = serde_json::from_reader(meta)?; |         let meta: DumpMeta = serde_json::from_reader(meta)?; | ||||||
|  |  | ||||||
|  |         let mut created_at = None; | ||||||
|  |         let mut updated_at = None; | ||||||
|  |  | ||||||
|  |         for line in tasks.lines() { | ||||||
|  |             let task: Task = serde_json::from_str(&line?)?; | ||||||
|  |             if !(task.uuid == index_uuid.uuid && task.is_finished()) { | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             let new_created_at = match task.update.meta() { | ||||||
|  |                 Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(), | ||||||
|  |                 _ => None, | ||||||
|  |             }; | ||||||
|  |             let new_updated_at = task.update.finished_at(); | ||||||
|  |  | ||||||
|  |             if created_at.is_none() || created_at > new_created_at { | ||||||
|  |                 created_at = new_created_at; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             if updated_at.is_none() || updated_at < new_updated_at { | ||||||
|  |                 updated_at = new_updated_at; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let current_time = OffsetDateTime::now_utc(); | ||||||
|  |  | ||||||
|         let metadata = IndexMetadata { |         let metadata = IndexMetadata { | ||||||
|             uid: name, |             uid: index_uuid.uid.clone(), | ||||||
|             primary_key: meta.primary_key, |             primary_key: meta.primary_key, | ||||||
|             // FIXME: Iterate over the whole task queue to find the creation and last update date. |             created_at: created_at.unwrap_or(current_time), | ||||||
|             created_at: OffsetDateTime::now_utc(), |             updated_at: updated_at.unwrap_or(current_time), | ||||||
|             updated_at: OffsetDateTime::now_utc(), |  | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let ret = V2IndexReader { |         let ret = V2IndexReader { | ||||||
| @@ -248,12 +277,12 @@ pub(crate) mod test { | |||||||
|         assert!(indexes.is_empty()); |         assert!(indexes.is_empty()); | ||||||
|  |  | ||||||
|         // products |         // products | ||||||
|         insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(products.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "products", |           "uid": "products", | ||||||
|           "primaryKey": "sku", |           "primaryKey": "sku", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:22.688964637Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:27:23.951017769Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -263,12 +292,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); | ||||||
|  |  | ||||||
|         // movies |         // movies | ||||||
|         insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(movies.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "movies", |           "uid": "movies", | ||||||
|           "primaryKey": "id", |           "primaryKey": "id", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:22.197788495Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:28:01.93111053Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -293,12 +322,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce"); | ||||||
|  |  | ||||||
|         // spells |         // spells | ||||||
|         insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(spells.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "dnd_spells", |           "uid": "dnd_spells", | ||||||
|           "primaryKey": "index", |           "primaryKey": "index", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2022-10-09T20:27:24.242683494Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2022-10-09T20:27:24.312809641Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -340,12 +369,12 @@ pub(crate) mod test { | |||||||
|         assert!(indexes.is_empty()); |         assert!(indexes.is_empty()); | ||||||
|  |  | ||||||
|         // products |         // products | ||||||
|         insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(products.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "products", |           "uid": "products", | ||||||
|           "primaryKey": "sku", |           "primaryKey": "sku", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:56.595257Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:58.70348Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -355,12 +384,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5"); | ||||||
|  |  | ||||||
|         // movies |         // movies | ||||||
|         insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(movies.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "movies", |           "uid": "movies", | ||||||
|           "primaryKey": "id", |           "primaryKey": "id", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:56.192178Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:56.455714Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -370,12 +399,12 @@ pub(crate) mod test { | |||||||
|         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); |         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720"); | ||||||
|  |  | ||||||
|         // spells |         // spells | ||||||
|         insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###" |         insta::assert_json_snapshot!(spells.metadata(), @r###" | ||||||
|         { |         { | ||||||
|           "uid": "dnd_spells", |           "uid": "dnd_spells", | ||||||
|           "primaryKey": "index", |           "primaryKey": "index", | ||||||
|           "createdAt": "[now]", |           "createdAt": "2023-01-30T16:25:58.876405Z", | ||||||
|           "updatedAt": "[now]" |           "updatedAt": "2023-01-30T16:25:59.079906Z" | ||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -227,4 +227,14 @@ impl UpdateStatus { | |||||||
|             _ => None, |             _ => None, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn finished_at(&self) -> Option<OffsetDateTime> { | ||||||
|  |         match self { | ||||||
|  |             UpdateStatus::Processing(_) => None, | ||||||
|  |             UpdateStatus::Enqueued(_) => None, | ||||||
|  |             UpdateStatus::Processed(u) => Some(u.processed_at), | ||||||
|  |             UpdateStatus::Aborted(_) => None, | ||||||
|  |             UpdateStatus::Failed(u) => Some(u.failed_at), | ||||||
|  |         } | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -24,14 +24,13 @@ use std::fs::{self, File}; | |||||||
| use std::io::BufWriter; | use std::io::BufWriter; | ||||||
|  |  | ||||||
| use dump::IndexMetadata; | use dump::IndexMetadata; | ||||||
| use log::{debug, error, info}; | use log::{debug, error, info, trace}; | ||||||
| use meilisearch_types::error::Code; | use meilisearch_types::error::Code; | ||||||
| use meilisearch_types::heed::{RoTxn, RwTxn}; | use meilisearch_types::heed::{RoTxn, RwTxn}; | ||||||
| use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; | use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; | ||||||
| use meilisearch_types::milli::heed::CompactionOption; | use meilisearch_types::milli::heed::CompactionOption; | ||||||
| use meilisearch_types::milli::update::{ | use meilisearch_types::milli::update::{ | ||||||
|     DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod, |     IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings, | ||||||
|     Settings as MilliSettings, |  | ||||||
| }; | }; | ||||||
| use meilisearch_types::milli::{self, Filter, BEU32}; | use meilisearch_types::milli::{self, Filter, BEU32}; | ||||||
| use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; | use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; | ||||||
| @@ -44,7 +43,7 @@ use uuid::Uuid; | |||||||
|  |  | ||||||
| use crate::autobatcher::{self, BatchKind}; | use crate::autobatcher::{self, BatchKind}; | ||||||
| use crate::utils::{self, swap_index_uid_in_task}; | use crate::utils::{self, swap_index_uid_in_task}; | ||||||
| use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId}; | use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId}; | ||||||
|  |  | ||||||
| /// Represents a combination of tasks that can all be processed at the same time. | /// Represents a combination of tasks that can all be processed at the same time. | ||||||
| /// | /// | ||||||
| @@ -105,12 +104,6 @@ pub(crate) enum IndexOperation { | |||||||
|         operations: Vec<DocumentOperation>, |         operations: Vec<DocumentOperation>, | ||||||
|         tasks: Vec<Task>, |         tasks: Vec<Task>, | ||||||
|     }, |     }, | ||||||
|     DocumentDeletion { |  | ||||||
|         index_uid: String, |  | ||||||
|         // The vec associated with each document deletion tasks. |  | ||||||
|         documents: Vec<Vec<String>>, |  | ||||||
|         tasks: Vec<Task>, |  | ||||||
|     }, |  | ||||||
|     IndexDocumentDeletionByFilter { |     IndexDocumentDeletionByFilter { | ||||||
|         index_uid: String, |         index_uid: String, | ||||||
|         task: Task, |         task: Task, | ||||||
| @@ -162,7 +155,6 @@ impl Batch { | |||||||
|             } |             } | ||||||
|             Batch::IndexOperation { op, .. } => match op { |             Batch::IndexOperation { op, .. } => match op { | ||||||
|                 IndexOperation::DocumentOperation { tasks, .. } |                 IndexOperation::DocumentOperation { tasks, .. } | ||||||
|                 | IndexOperation::DocumentDeletion { tasks, .. } |  | ||||||
|                 | IndexOperation::Settings { tasks, .. } |                 | IndexOperation::Settings { tasks, .. } | ||||||
|                 | IndexOperation::DocumentClear { tasks, .. } => { |                 | IndexOperation::DocumentClear { tasks, .. } => { | ||||||
|                     tasks.iter().map(|task| task.uid).collect() |                     tasks.iter().map(|task| task.uid).collect() | ||||||
| @@ -227,7 +219,6 @@ impl IndexOperation { | |||||||
|     pub fn index_uid(&self) -> &str { |     pub fn index_uid(&self) -> &str { | ||||||
|         match self { |         match self { | ||||||
|             IndexOperation::DocumentOperation { index_uid, .. } |             IndexOperation::DocumentOperation { index_uid, .. } | ||||||
|             | IndexOperation::DocumentDeletion { index_uid, .. } |  | ||||||
|             | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. } |             | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. } | ||||||
|             | IndexOperation::DocumentClear { index_uid, .. } |             | IndexOperation::DocumentClear { index_uid, .. } | ||||||
|             | IndexOperation::Settings { index_uid, .. } |             | IndexOperation::Settings { index_uid, .. } | ||||||
| @@ -243,9 +234,6 @@ impl fmt::Display for IndexOperation { | |||||||
|             IndexOperation::DocumentOperation { .. } => { |             IndexOperation::DocumentOperation { .. } => { | ||||||
|                 f.write_str("IndexOperation::DocumentOperation") |                 f.write_str("IndexOperation::DocumentOperation") | ||||||
|             } |             } | ||||||
|             IndexOperation::DocumentDeletion { .. } => { |  | ||||||
|                 f.write_str("IndexOperation::DocumentDeletion") |  | ||||||
|             } |  | ||||||
|             IndexOperation::IndexDocumentDeletionByFilter { .. } => { |             IndexOperation::IndexDocumentDeletionByFilter { .. } => { | ||||||
|                 f.write_str("IndexOperation::IndexDocumentDeletionByFilter") |                 f.write_str("IndexOperation::IndexDocumentDeletionByFilter") | ||||||
|             } |             } | ||||||
| @@ -348,18 +336,27 @@ impl IndexScheduler { | |||||||
|             BatchKind::DocumentDeletion { deletion_ids } => { |             BatchKind::DocumentDeletion { deletion_ids } => { | ||||||
|                 let tasks = self.get_existing_tasks(rtxn, deletion_ids)?; |                 let tasks = self.get_existing_tasks(rtxn, deletion_ids)?; | ||||||
|  |  | ||||||
|                 let mut documents = Vec::new(); |                 let mut operations = Vec::with_capacity(tasks.len()); | ||||||
|  |                 let mut documents_counts = Vec::with_capacity(tasks.len()); | ||||||
|                 for task in &tasks { |                 for task in &tasks { | ||||||
|                     match task.kind { |                     match task.kind { | ||||||
|                         KindWithContent::DocumentDeletion { ref documents_ids, .. } => { |                         KindWithContent::DocumentDeletion { ref documents_ids, .. } => { | ||||||
|                             documents.push(documents_ids.clone()) |                             operations.push(DocumentOperation::Delete(documents_ids.clone())); | ||||||
|  |                             documents_counts.push(documents_ids.len() as u64); | ||||||
|                         } |                         } | ||||||
|                         _ => unreachable!(), |                         _ => unreachable!(), | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 Ok(Some(Batch::IndexOperation { |                 Ok(Some(Batch::IndexOperation { | ||||||
|                     op: IndexOperation::DocumentDeletion { index_uid, documents, tasks }, |                     op: IndexOperation::DocumentOperation { | ||||||
|  |                         index_uid, | ||||||
|  |                         primary_key: None, | ||||||
|  |                         method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|  |                         documents_counts, | ||||||
|  |                         operations, | ||||||
|  |                         tasks, | ||||||
|  |                     }, | ||||||
|                     must_create_index, |                     must_create_index, | ||||||
|                 })) |                 })) | ||||||
|             } |             } | ||||||
| @@ -825,6 +822,10 @@ impl IndexScheduler { | |||||||
|                 // 2. dump the tasks |                 // 2. dump the tasks | ||||||
|                 let mut dump_tasks = dump.create_tasks_queue()?; |                 let mut dump_tasks = dump.create_tasks_queue()?; | ||||||
|                 for ret in self.all_tasks.iter(&rtxn)? { |                 for ret in self.all_tasks.iter(&rtxn)? { | ||||||
|  |                     if self.must_stop_processing.get() { | ||||||
|  |                         return Err(Error::AbortedTask); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|                     let (_, mut t) = ret?; |                     let (_, mut t) = ret?; | ||||||
|                     let status = t.status; |                     let status = t.status; | ||||||
|                     let content_file = t.content_uuid(); |                     let content_file = t.content_uuid(); | ||||||
| @@ -845,6 +846,9 @@ impl IndexScheduler { | |||||||
|  |  | ||||||
|                     // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. |                     // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. | ||||||
|                     if let Some(content_file) = content_file { |                     if let Some(content_file) = content_file { | ||||||
|  |                         if self.must_stop_processing.get() { | ||||||
|  |                             return Err(Error::AbortedTask); | ||||||
|  |                         } | ||||||
|                         if status == Status::Enqueued { |                         if status == Status::Enqueued { | ||||||
|                             let content_file = self.file_store.get_update(content_file)?; |                             let content_file = self.file_store.get_update(content_file)?; | ||||||
|  |  | ||||||
| @@ -884,6 +888,9 @@ impl IndexScheduler { | |||||||
|  |  | ||||||
|                     // 3.1. Dump the documents |                     // 3.1. Dump the documents | ||||||
|                     for ret in index.all_documents(&rtxn)? { |                     for ret in index.all_documents(&rtxn)? { | ||||||
|  |                         if self.must_stop_processing.get() { | ||||||
|  |                             return Err(Error::AbortedTask); | ||||||
|  |                         } | ||||||
|                         let (_id, doc) = ret?; |                         let (_id, doc) = ret?; | ||||||
|                         let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; |                         let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; | ||||||
|                         index_dumper.push_document(&document)?; |                         index_dumper.push_document(&document)?; | ||||||
| @@ -903,6 +910,9 @@ impl IndexScheduler { | |||||||
|                     "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" |                     "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" | ||||||
|                 )).unwrap(); |                 )).unwrap(); | ||||||
|  |  | ||||||
|  |                 if self.must_stop_processing.get() { | ||||||
|  |                     return Err(Error::AbortedTask); | ||||||
|  |                 } | ||||||
|                 let path = self.dumps_path.join(format!("{}.dump", dump_uid)); |                 let path = self.dumps_path.join(format!("{}.dump", dump_uid)); | ||||||
|                 let file = File::create(path)?; |                 let file = File::create(path)?; | ||||||
|                 dump.persist_to(BufWriter::new(file))?; |                 dump.persist_to(BufWriter::new(file))?; | ||||||
| @@ -1195,7 +1205,7 @@ impl IndexScheduler { | |||||||
|                     index, |                     index, | ||||||
|                     indexer_config, |                     indexer_config, | ||||||
|                     config, |                     config, | ||||||
|                     |indexing_step| debug!("update: {:?}", indexing_step), |                     |indexing_step| trace!("update: {:?}", indexing_step), | ||||||
|                     || must_stop_processing.get(), |                     || must_stop_processing.get(), | ||||||
|                 )?; |                 )?; | ||||||
|  |  | ||||||
| @@ -1242,7 +1252,8 @@ impl IndexScheduler { | |||||||
|                             let (new_builder, user_result) = |                             let (new_builder, user_result) = | ||||||
|                                 builder.remove_documents(document_ids)?; |                                 builder.remove_documents(document_ids)?; | ||||||
|                             builder = new_builder; |                             builder = new_builder; | ||||||
|  |                             // Uses Invariant: remove documents actually always returns Ok for the inner result | ||||||
|  |                             let count = user_result.unwrap(); | ||||||
|                             let provided_ids = |                             let provided_ids = | ||||||
|                                 if let Some(Details::DocumentDeletion { provided_ids, .. }) = |                                 if let Some(Details::DocumentDeletion { provided_ids, .. }) = | ||||||
|                                     task.details |                                     task.details | ||||||
| @@ -1253,23 +1264,11 @@ impl IndexScheduler { | |||||||
|                                     unreachable!(); |                                     unreachable!(); | ||||||
|                                 }; |                                 }; | ||||||
|  |  | ||||||
|                             match user_result { |                             task.status = Status::Succeeded; | ||||||
|                                 Ok(count) => { |                             task.details = Some(Details::DocumentDeletion { | ||||||
|                                     task.status = Status::Succeeded; |                                 provided_ids, | ||||||
|                                     task.details = Some(Details::DocumentDeletion { |                                 deleted_documents: Some(count), | ||||||
|                                         provided_ids, |                             }); | ||||||
|                                         deleted_documents: Some(count), |  | ||||||
|                                     }); |  | ||||||
|                                 } |  | ||||||
|                                 Err(e) => { |  | ||||||
|                                     task.status = Status::Failed; |  | ||||||
|                                     task.details = Some(Details::DocumentDeletion { |  | ||||||
|                                         provided_ids, |  | ||||||
|                                         deleted_documents: Some(0), |  | ||||||
|                                     }); |  | ||||||
|                                     task.error = Some(milli::Error::from(e).into()); |  | ||||||
|                                 } |  | ||||||
|                             } |  | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -1284,31 +1283,13 @@ impl IndexScheduler { | |||||||
|                         milli::update::Settings::new(index_wtxn, index, indexer_config); |                         milli::update::Settings::new(index_wtxn, index, indexer_config); | ||||||
|                     builder.reset_primary_key(); |                     builder.reset_primary_key(); | ||||||
|                     builder.execute( |                     builder.execute( | ||||||
|                         |indexing_step| debug!("update: {:?}", indexing_step), |                         |indexing_step| trace!("update: {:?}", indexing_step), | ||||||
|                         || must_stop_processing.clone().get(), |                         || must_stop_processing.clone().get(), | ||||||
|                     )?; |                     )?; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 Ok(tasks) |                 Ok(tasks) | ||||||
|             } |             } | ||||||
|             IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => { |  | ||||||
|                 let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?; |  | ||||||
|                 documents.iter().flatten().for_each(|id| { |  | ||||||
|                     builder.delete_external_id(id); |  | ||||||
|                 }); |  | ||||||
|  |  | ||||||
|                 let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?; |  | ||||||
|  |  | ||||||
|                 for (task, documents) in tasks.iter_mut().zip(documents) { |  | ||||||
|                     task.status = Status::Succeeded; |  | ||||||
|                     task.details = Some(Details::DocumentDeletion { |  | ||||||
|                         provided_ids: documents.len(), |  | ||||||
|                         deleted_documents: Some(deleted_documents.min(documents.len() as u64)), |  | ||||||
|                     }); |  | ||||||
|                 } |  | ||||||
|  |  | ||||||
|                 Ok(tasks) |  | ||||||
|             } |  | ||||||
|             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => { |             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => { | ||||||
|                 let filter = |                 let filter = | ||||||
|                     if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } = |                     if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } = | ||||||
| @@ -1318,7 +1299,13 @@ impl IndexScheduler { | |||||||
|                     } else { |                     } else { | ||||||
|                         unreachable!() |                         unreachable!() | ||||||
|                     }; |                     }; | ||||||
|                 let deleted_documents = delete_document_by_filter(index_wtxn, filter, index); |                 let deleted_documents = delete_document_by_filter( | ||||||
|  |                     index_wtxn, | ||||||
|  |                     filter, | ||||||
|  |                     self.index_mapper.indexer_config(), | ||||||
|  |                     self.must_stop_processing.clone(), | ||||||
|  |                     index, | ||||||
|  |                 ); | ||||||
|                 let original_filter = if let Some(Details::DocumentDeletionByFilter { |                 let original_filter = if let Some(Details::DocumentDeletionByFilter { | ||||||
|                     original_filter, |                     original_filter, | ||||||
|                     deleted_documents: _, |                     deleted_documents: _, | ||||||
| @@ -1552,6 +1539,8 @@ impl IndexScheduler { | |||||||
| fn delete_document_by_filter<'a>( | fn delete_document_by_filter<'a>( | ||||||
|     wtxn: &mut RwTxn<'a, '_>, |     wtxn: &mut RwTxn<'a, '_>, | ||||||
|     filter: &serde_json::Value, |     filter: &serde_json::Value, | ||||||
|  |     indexer_config: &IndexerConfig, | ||||||
|  |     must_stop_processing: MustStopProcessing, | ||||||
|     index: &'a Index, |     index: &'a Index, | ||||||
| ) -> Result<u64> { | ) -> Result<u64> { | ||||||
|     let filter = Filter::from_json(filter)?; |     let filter = Filter::from_json(filter)?; | ||||||
| @@ -1562,9 +1551,26 @@ fn delete_document_by_filter<'a>( | |||||||
|             } |             } | ||||||
|             e => e.into(), |             e => e.into(), | ||||||
|         })?; |         })?; | ||||||
|         let mut delete_operation = DeleteDocuments::new(wtxn, index)?; |  | ||||||
|         delete_operation.delete_documents(&candidates); |         let config = IndexDocumentsConfig { | ||||||
|         delete_operation.execute().map(|result| result.deleted_documents)? |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|  |             ..Default::default() | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         let mut builder = milli::update::IndexDocuments::new( | ||||||
|  |             wtxn, | ||||||
|  |             index, | ||||||
|  |             indexer_config, | ||||||
|  |             config, | ||||||
|  |             |indexing_step| debug!("update: {:?}", indexing_step), | ||||||
|  |             || must_stop_processing.get(), | ||||||
|  |         )?; | ||||||
|  |  | ||||||
|  |         let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?; | ||||||
|  |         builder = new_builder; | ||||||
|  |  | ||||||
|  |         let _ = builder.execute()?; | ||||||
|  |         count | ||||||
|     } else { |     } else { | ||||||
|         0 |         0 | ||||||
|     }) |     }) | ||||||
|   | |||||||
| @@ -108,6 +108,8 @@ pub enum Error { | |||||||
|     TaskDeletionWithEmptyQuery, |     TaskDeletionWithEmptyQuery, | ||||||
|     #[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")] |     #[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")] | ||||||
|     TaskCancelationWithEmptyQuery, |     TaskCancelationWithEmptyQuery, | ||||||
|  |     #[error("Aborted task")] | ||||||
|  |     AbortedTask, | ||||||
|  |  | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     Dump(#[from] dump::Error), |     Dump(#[from] dump::Error), | ||||||
| @@ -175,6 +177,7 @@ impl Error { | |||||||
|             | Error::TaskNotFound(_) |             | Error::TaskNotFound(_) | ||||||
|             | Error::TaskDeletionWithEmptyQuery |             | Error::TaskDeletionWithEmptyQuery | ||||||
|             | Error::TaskCancelationWithEmptyQuery |             | Error::TaskCancelationWithEmptyQuery | ||||||
|  |             | Error::AbortedTask | ||||||
|             | Error::Dump(_) |             | Error::Dump(_) | ||||||
|             | Error::Heed(_) |             | Error::Heed(_) | ||||||
|             | Error::Milli(_) |             | Error::Milli(_) | ||||||
| @@ -236,6 +239,9 @@ impl ErrorCode for Error { | |||||||
|             Error::TaskDatabaseUpdate(_) => Code::Internal, |             Error::TaskDatabaseUpdate(_) => Code::Internal, | ||||||
|             Error::CreateBatch(_) => Code::Internal, |             Error::CreateBatch(_) => Code::Internal, | ||||||
|  |  | ||||||
|  |             // This one should never be seen by the end user | ||||||
|  |             Error::AbortedTask => Code::Internal, | ||||||
|  |  | ||||||
|             #[cfg(test)] |             #[cfg(test)] | ||||||
|             Error::PlannedFailure => Code::Internal, |             Error::PlannedFailure => Code::Internal, | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -1183,7 +1183,8 @@ impl IndexScheduler { | |||||||
|             // If we have an abortion error we must stop the tick here and re-schedule tasks. |             // If we have an abortion error we must stop the tick here and re-schedule tasks. | ||||||
|             Err(Error::Milli(milli::Error::InternalError( |             Err(Error::Milli(milli::Error::InternalError( | ||||||
|                 milli::InternalError::AbortedIndexation, |                 milli::InternalError::AbortedIndexation, | ||||||
|             ))) => { |             ))) | ||||||
|  |             | Err(Error::AbortedTask) => { | ||||||
|                 #[cfg(test)] |                 #[cfg(test)] | ||||||
|                 self.breakpoint(Breakpoint::AbortedIndexation); |                 self.breakpoint(Breakpoint::AbortedIndexation); | ||||||
|                 wtxn.abort().map_err(Error::HeedTransaction)?; |                 wtxn.abort().map_err(Error::HeedTransaction)?; | ||||||
| @@ -4339,4 +4340,26 @@ mod tests { | |||||||
|         } |         } | ||||||
|         "###); |         "###); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn cancel_processing_dump() { | ||||||
|  |         let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); | ||||||
|  |  | ||||||
|  |         let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None }; | ||||||
|  |         let dump_cancellation = KindWithContent::TaskCancelation { | ||||||
|  |             query: "cancel dump".to_owned(), | ||||||
|  |             tasks: RoaringBitmap::from_iter([0]), | ||||||
|  |         }; | ||||||
|  |         let _ = index_scheduler.register(dump_creation).unwrap(); | ||||||
|  |         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); | ||||||
|  |         handle.advance_till([Start, BatchCreated, InsideProcessBatch]); | ||||||
|  |  | ||||||
|  |         let _ = index_scheduler.register(dump_cancellation).unwrap(); | ||||||
|  |         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); | ||||||
|  |  | ||||||
|  |         snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); | ||||||
|  |  | ||||||
|  |         handle.advance_one_successful_batch(); | ||||||
|  |         snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -0,0 +1,35 @@ | |||||||
|  | --- | ||||||
|  | source: index-scheduler/src/lib.rs | ||||||
|  | --- | ||||||
|  | ### Autobatching Enabled = true | ||||||
|  | ### Processing Tasks: | ||||||
|  | [] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### All Tasks: | ||||||
|  | 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Status: | ||||||
|  | enqueued [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Kind: | ||||||
|  | "dumpCreation" [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Tasks: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Mapper: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Canceled By: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Enqueued At: | ||||||
|  | [timestamp] [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Started At: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Finished At: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### File Store: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  |  | ||||||
| @@ -0,0 +1,45 @@ | |||||||
|  | --- | ||||||
|  | source: index-scheduler/src/lib.rs | ||||||
|  | --- | ||||||
|  | ### Autobatching Enabled = true | ||||||
|  | ### Processing Tasks: | ||||||
|  | [] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### All Tasks: | ||||||
|  | 0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} | ||||||
|  | 1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }} | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Status: | ||||||
|  | enqueued [] | ||||||
|  | succeeded [1,] | ||||||
|  | canceled [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Kind: | ||||||
|  | "taskCancelation" [1,] | ||||||
|  | "dumpCreation" [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Tasks: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Mapper: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Canceled By: | ||||||
|  | 1 [0,] | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Enqueued At: | ||||||
|  | [timestamp] [0,] | ||||||
|  | [timestamp] [1,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Started At: | ||||||
|  | [timestamp] [0,] | ||||||
|  | [timestamp] [1,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Finished At: | ||||||
|  | [timestamp] [0,] | ||||||
|  | [timestamp] [1,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### File Store: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  |  | ||||||
| @@ -0,0 +1,38 @@ | |||||||
|  | --- | ||||||
|  | source: index-scheduler/src/lib.rs | ||||||
|  | --- | ||||||
|  | ### Autobatching Enabled = true | ||||||
|  | ### Processing Tasks: | ||||||
|  | [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### All Tasks: | ||||||
|  | 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} | ||||||
|  | 1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }} | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Status: | ||||||
|  | enqueued [0,1,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Kind: | ||||||
|  | "taskCancelation" [1,] | ||||||
|  | "dumpCreation" [0,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Tasks: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Index Mapper: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Canceled By: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Enqueued At: | ||||||
|  | [timestamp] [0,] | ||||||
|  | [timestamp] [1,] | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Started At: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### Finished At: | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  | ### File Store: | ||||||
|  |  | ||||||
|  | ---------------------------------------------------------------------- | ||||||
|  |  | ||||||
| @@ -324,7 +324,6 @@ impl ErrorCode for milli::Error { | |||||||
|                     UserError::SerdeJson(_) |                     UserError::SerdeJson(_) | ||||||
|                     | UserError::InvalidLmdbOpenOptions |                     | UserError::InvalidLmdbOpenOptions | ||||||
|                     | UserError::DocumentLimitReached |                     | UserError::DocumentLimitReached | ||||||
|                     | UserError::AccessingSoftDeletedDocument { .. } |  | ||||||
|                     | UserError::UnknownInternalDocumentId { .. } => Code::Internal, |                     | UserError::UnknownInternalDocumentId { .. } => Code::Internal, | ||||||
|                     UserError::InvalidStoreFile => Code::InvalidStoreFile, |                     UserError::InvalidStoreFile => Code::InvalidStoreFile, | ||||||
|                     UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice, |                     UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice, | ||||||
|   | |||||||
| @@ -362,7 +362,7 @@ fn import_dump( | |||||||
|                 update_method: IndexDocumentsMethod::ReplaceDocuments, |                 update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|                 ..Default::default() |                 ..Default::default() | ||||||
|             }, |             }, | ||||||
|             |indexing_step| log::debug!("update: {:?}", indexing_step), |             |indexing_step| log::trace!("update: {:?}", indexing_step), | ||||||
|             || false, |             || false, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>( | |||||||
|     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); |     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); | ||||||
|  |  | ||||||
|     let internal_id = index |     let internal_id = index | ||||||
|         .external_documents_ids(&txn)? |         .external_documents_ids() | ||||||
|         .get(doc_id.as_bytes()) |         .get(&txn, doc_id)? | ||||||
|         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; |         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; | ||||||
|  |  | ||||||
|     let document = index |     let document = index | ||||||
|   | |||||||
| @@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() { | |||||||
|       "canceledBy": null, |       "canceledBy": null, | ||||||
|       "details": { |       "details": { | ||||||
|         "providedIds": 0, |         "providedIds": 0, | ||||||
|         "deletedDocuments": 4, |         "deletedDocuments": 2, | ||||||
|         "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]" |         "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]" | ||||||
|       }, |       }, | ||||||
|       "error": null, |       "error": null, | ||||||
|   | |||||||
| @@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" } | |||||||
| fst = "0.4.7" | fst = "0.4.7" | ||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
| geoutils = "0.5.1" | geoutils = "0.5.1" | ||||||
| grenad = { version = "0.4.4", default-features = false, features = [ | grenad = { version = "0.4.5", default-features = false, features = [ | ||||||
|     "tempfile", |     "rayon", "tempfile" | ||||||
| ] } | ] } | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [ | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [ | ||||||
|     "lmdb", "read-txn-no-tls" |     "lmdb", "read-txn-no-tls" | ||||||
| @@ -79,6 +79,7 @@ big_s = "1.0.2" | |||||||
| insta = "1.29.0" | insta = "1.29.0" | ||||||
| maplit = "1.0.2" | maplit = "1.0.2" | ||||||
| md5 = "0.7.0" | md5 = "0.7.0" | ||||||
|  | meili-snap = { path = "../meili-snap" } | ||||||
| rand = { version = "0.8.5", features = ["small_rng"] } | rand = { version = "0.8.5", features = ["small_rng"] } | ||||||
|  |  | ||||||
| [features] | [features] | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| mod builder; | mod builder; | ||||||
| mod enriched; | mod enriched; | ||||||
|  | mod primary_key; | ||||||
| mod reader; | mod reader; | ||||||
| mod serde_impl; | mod serde_impl; | ||||||
|  |  | ||||||
| @@ -11,6 +12,7 @@ use bimap::BiHashMap; | |||||||
| pub use builder::DocumentsBatchBuilder; | pub use builder::DocumentsBatchBuilder; | ||||||
| pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; | pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; | ||||||
| use obkv::KvReader; | use obkv::KvReader; | ||||||
|  | pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY}; | ||||||
| pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; | pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| @@ -87,6 +89,12 @@ impl DocumentsBatchIndex { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl FieldIdMapper for DocumentsBatchIndex { | ||||||
|  |     fn id(&self, name: &str) -> Option<FieldId> { | ||||||
|  |         self.id(name) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[derive(Debug, thiserror::Error)] | #[derive(Debug, thiserror::Error)] | ||||||
| pub enum Error { | pub enum Error { | ||||||
|     #[error("Error parsing number {value:?} at line {line}: {error}")] |     #[error("Error parsing number {value:?} at line {line}: {error}")] | ||||||
|   | |||||||
							
								
								
									
										172
									
								
								milli/src/documents/primary_key.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								milli/src/documents/primary_key.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,172 @@ | |||||||
|  | use std::iter; | ||||||
|  | use std::result::Result as StdResult; | ||||||
|  |  | ||||||
|  | use serde_json::Value; | ||||||
|  |  | ||||||
|  | use crate::{FieldId, InternalError, Object, Result, UserError}; | ||||||
|  |  | ||||||
|  | /// The symbol used to define levels in a nested primary key. | ||||||
|  | const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; | ||||||
|  |  | ||||||
|  | /// The default primary key that is used when none is specified. | ||||||
|  | pub const DEFAULT_PRIMARY_KEY: &str = "id"; | ||||||
|  |  | ||||||
|  | /// Trait for objects that can map the name of a field to its [`FieldId`]. | ||||||
|  | pub trait FieldIdMapper { | ||||||
|  |     /// Attempts to map the passed name to its [`FieldId`]. | ||||||
|  |     /// | ||||||
|  |     /// `None` if the field with this name was not found. | ||||||
|  |     fn id(&self, name: &str) -> Option<FieldId>; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A type that represents the kind of primary key that has been set | ||||||
|  | /// for this index: either a classic flat one or a nested one. | ||||||
|  | #[derive(Debug, Clone, Copy)] | ||||||
|  | pub enum PrimaryKey<'a> { | ||||||
|  |     Flat { name: &'a str, field_id: FieldId }, | ||||||
|  |     Nested { name: &'a str }, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub enum DocumentIdExtractionError { | ||||||
|  |     InvalidDocumentId(UserError), | ||||||
|  |     MissingDocumentId, | ||||||
|  |     TooManyDocumentIds(usize), | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<'a> PrimaryKey<'a> { | ||||||
|  |     pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> { | ||||||
|  |         Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { | ||||||
|  |             Self::Nested { name: path } | ||||||
|  |         } else { | ||||||
|  |             let field_id = fields.id(path)?; | ||||||
|  |             Self::Flat { name: path, field_id } | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn name(&self) -> &str { | ||||||
|  |         match self { | ||||||
|  |             PrimaryKey::Flat { name, .. } => name, | ||||||
|  |             PrimaryKey::Nested { name } => name, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn document_id( | ||||||
|  |         &self, | ||||||
|  |         document: &obkv::KvReader<FieldId>, | ||||||
|  |         fields: &impl FieldIdMapper, | ||||||
|  |     ) -> Result<StdResult<String, DocumentIdExtractionError>> { | ||||||
|  |         match self { | ||||||
|  |             PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) { | ||||||
|  |                 Some(document_id_bytes) => { | ||||||
|  |                     let document_id = serde_json::from_slice(document_id_bytes) | ||||||
|  |                         .map_err(InternalError::SerdeJson)?; | ||||||
|  |                     match validate_document_id_value(document_id)? { | ||||||
|  |                         Ok(document_id) => Ok(Ok(document_id)), | ||||||
|  |                         Err(user_error) => { | ||||||
|  |                             Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), | ||||||
|  |             }, | ||||||
|  |             nested @ PrimaryKey::Nested { .. } => { | ||||||
|  |                 let mut matching_documents_ids = Vec::new(); | ||||||
|  |                 for (first_level_name, right) in nested.possible_level_names() { | ||||||
|  |                     if let Some(field_id) = fields.id(first_level_name) { | ||||||
|  |                         if let Some(value_bytes) = document.get(field_id) { | ||||||
|  |                             let object = serde_json::from_slice(value_bytes) | ||||||
|  |                                 .map_err(InternalError::SerdeJson)?; | ||||||
|  |                             fetch_matching_values(object, right, &mut matching_documents_ids); | ||||||
|  |  | ||||||
|  |                             if matching_documents_ids.len() >= 2 { | ||||||
|  |                                 return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds( | ||||||
|  |                                     matching_documents_ids.len(), | ||||||
|  |                                 ))); | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 match matching_documents_ids.pop() { | ||||||
|  |                     Some(document_id) => match validate_document_id_value(document_id)? { | ||||||
|  |                         Ok(document_id) => Ok(Ok(document_id)), | ||||||
|  |                         Err(user_error) => { | ||||||
|  |                             Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) | ||||||
|  |                         } | ||||||
|  |                     }, | ||||||
|  |                     None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Returns an `Iterator` that gives all the possible field names the primary key | ||||||
|  |     /// can have, depending on the first level name and the depth of the objects. | ||||||
|  |     pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { | ||||||
|  |         let name = self.name(); | ||||||
|  |         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) | ||||||
|  |             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) | ||||||
|  |             .chain(iter::once((name, ""))) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { | ||||||
|  |     match value { | ||||||
|  |         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), | ||||||
|  |         otherwise => output.push(otherwise), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn fetch_matching_values_in_object( | ||||||
|  |     object: Object, | ||||||
|  |     selector: &str, | ||||||
|  |     base_key: &str, | ||||||
|  |     output: &mut Vec<Value>, | ||||||
|  | ) { | ||||||
|  |     for (key, value) in object { | ||||||
|  |         let base_key = if base_key.is_empty() { | ||||||
|  |             key.to_string() | ||||||
|  |         } else { | ||||||
|  |             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         if starts_with(selector, &base_key) { | ||||||
|  |             match value { | ||||||
|  |                 Value::Object(object) => { | ||||||
|  |                     fetch_matching_values_in_object(object, selector, &base_key, output) | ||||||
|  |                 } | ||||||
|  |                 value => output.push(value), | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn starts_with(selector: &str, key: &str) -> bool { | ||||||
|  |     selector.strip_prefix(key).map_or(false, |tail| { | ||||||
|  |         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) | ||||||
|  |     }) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // FIXME: move to a DocumentId struct | ||||||
|  |  | ||||||
|  | fn validate_document_id(document_id: &str) -> Option<&str> { | ||||||
|  |     if !document_id.is_empty() | ||||||
|  |         && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) | ||||||
|  |     { | ||||||
|  |         Some(document_id) | ||||||
|  |     } else { | ||||||
|  |         None | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { | ||||||
|  |     match document_id { | ||||||
|  |         Value::String(string) => match validate_document_id(&string) { | ||||||
|  |             Some(s) if s.len() == string.len() => Ok(Ok(string)), | ||||||
|  |             Some(s) => Ok(Ok(s.to_string())), | ||||||
|  |             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), | ||||||
|  |         }, | ||||||
|  |         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), | ||||||
|  |         content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry { | |||||||
|  |  | ||||||
| #[derive(Error, Debug)] | #[derive(Error, Debug)] | ||||||
| pub enum UserError { | pub enum UserError { | ||||||
|     #[error("A soft deleted internal document id have been used: `{document_id}`.")] |  | ||||||
|     AccessingSoftDeletedDocument { document_id: DocumentId }, |  | ||||||
|     #[error("A document cannot contain more than 65,535 fields.")] |     #[error("A document cannot contain more than 65,535 fields.")] | ||||||
|     AttributeLimitReached, |     AttributeLimitReached, | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|   | |||||||
| @@ -1,159 +1,75 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::HashMap; | use std::collections::HashMap; | ||||||
| use std::convert::TryInto; |  | ||||||
| use std::{fmt, str}; |  | ||||||
|  |  | ||||||
| use fst::map::IndexedValue; | use heed::types::{OwnedType, Str}; | ||||||
| use fst::{IntoStreamer, Streamer}; | use heed::{Database, RoIter, RoTxn, RwTxn}; | ||||||
| use roaring::RoaringBitmap; |  | ||||||
|  |  | ||||||
| const DELETED_ID: u64 = u64::MAX; | use crate::{DocumentId, BEU32}; | ||||||
|  |  | ||||||
| pub struct ExternalDocumentsIds<'a> { | pub enum DocumentOperationKind { | ||||||
|     pub(crate) hard: fst::Map<Cow<'a, [u8]>>, |     Create, | ||||||
|     pub(crate) soft: fst::Map<Cow<'a, [u8]>>, |     Delete, | ||||||
|     soft_deleted_docids: RoaringBitmap, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> ExternalDocumentsIds<'a> { | pub struct DocumentOperation { | ||||||
|     pub fn new( |     pub external_id: String, | ||||||
|         hard: fst::Map<Cow<'a, [u8]>>, |     pub internal_id: DocumentId, | ||||||
|         soft: fst::Map<Cow<'a, [u8]>>, |     pub kind: DocumentOperationKind, | ||||||
|         soft_deleted_docids: RoaringBitmap, | } | ||||||
|     ) -> ExternalDocumentsIds<'a> { |  | ||||||
|         ExternalDocumentsIds { hard, soft, soft_deleted_docids } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn into_static(self) -> ExternalDocumentsIds<'static> { | pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>); | ||||||
|         ExternalDocumentsIds { |  | ||||||
|             hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), | impl ExternalDocumentsIds { | ||||||
|             soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), |     pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds { | ||||||
|             soft_deleted_docids: self.soft_deleted_docids, |         ExternalDocumentsIds(db) | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns `true` if hard and soft external documents lists are empty. |     /// Returns `true` if hard and soft external documents lists are empty. | ||||||
|     pub fn is_empty(&self) -> bool { |     pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> { | ||||||
|         self.hard.is_empty() && self.soft.is_empty() |         self.0.is_empty(rtxn).map_err(Into::into) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> { |     pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> { | ||||||
|         let external_id = external_id.as_ref(); |         Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get())) | ||||||
|         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { |  | ||||||
|             Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { |  | ||||||
|                 Some(id.try_into().unwrap()) |  | ||||||
|             } |  | ||||||
|             _otherwise => None, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they |  | ||||||
|     /// don't contain any soft deleted document id. |  | ||||||
|     pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { |  | ||||||
|         let mut new_hard_builder = fst::MapBuilder::memory(); |  | ||||||
|  |  | ||||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); |  | ||||||
|         let mut iter = union_op.into_stream(); |  | ||||||
|         while let Some((external_id, docids)) = iter.next() { |  | ||||||
|             // prefer selecting the ids from soft, always |  | ||||||
|             let id = indexed_last_value(docids).unwrap(); |  | ||||||
|             if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { |  | ||||||
|                 new_hard_builder.insert(external_id, id)?; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         // Delete soft map completely |  | ||||||
|         self.soft = fst::Map::default().map_data(Cow::Owned)?; |  | ||||||
|         // We save the new map as the new hard map. |  | ||||||
|         self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; |  | ||||||
|  |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> { |  | ||||||
|         let union_op = self.soft.op().add(other).r#union(); |  | ||||||
|  |  | ||||||
|         let mut new_soft_builder = fst::MapBuilder::memory(); |  | ||||||
|         let mut iter = union_op.into_stream(); |  | ||||||
|         while let Some((external_id, marked_docids)) = iter.next() { |  | ||||||
|             let id = indexed_last_value(marked_docids).unwrap(); |  | ||||||
|             new_soft_builder.insert(external_id, id)?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         // We save the new map as the new soft map. |  | ||||||
|         self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; |  | ||||||
|         self.merge_soft_into_hard() |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// An helper function to debug this type, returns an `HashMap` of both, |     /// An helper function to debug this type, returns an `HashMap` of both, | ||||||
|     /// soft and hard fst maps, combined. |     /// soft and hard fst maps, combined. | ||||||
|     pub fn to_hash_map(&self) -> HashMap<String, u32> { |     pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> { | ||||||
|         let mut map = HashMap::new(); |         let mut map = HashMap::default(); | ||||||
|  |         for result in self.0.iter(rtxn)? { | ||||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); |             let (external, internal) = result?; | ||||||
|         let mut iter = union_op.into_stream(); |             map.insert(external.to_owned(), internal.get()); | ||||||
|         while let Some((external_id, marked_docids)) = iter.next() { |  | ||||||
|             let id = indexed_last_value(marked_docids).unwrap(); |  | ||||||
|             if id != DELETED_ID { |  | ||||||
|                 let external_id = str::from_utf8(external_id).unwrap(); |  | ||||||
|                 map.insert(external_id.to_owned(), id.try_into().unwrap()); |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|  |         Ok(map) | ||||||
|         map |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Return an fst of the combined hard and soft deleted ID. |     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. | ||||||
|     pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> { |     /// | ||||||
|         if self.soft.is_empty() { |     /// If the list contains multiple operations on the same external id, then the result is unspecified. | ||||||
|             return Ok(Cow::Borrowed(&self.hard)); |     /// | ||||||
|         } |     /// # Panics | ||||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); |     /// | ||||||
|  |     /// - If attempting to delete a document that doesn't exist | ||||||
|         let mut iter = union_op.into_stream(); |     /// - If attempting to create a document that already exists | ||||||
|         let mut new_hard_builder = fst::MapBuilder::memory(); |     pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> { | ||||||
|         while let Some((external_id, marked_docids)) = iter.next() { |         for DocumentOperation { external_id, internal_id, kind } in operations { | ||||||
|             let value = indexed_last_value(marked_docids).unwrap(); |             match kind { | ||||||
|             if value != DELETED_ID { |                 DocumentOperationKind::Create => { | ||||||
|                 new_hard_builder.insert(external_id, value)?; |                     self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?; | ||||||
|  |                 } | ||||||
|  |                 DocumentOperationKind::Delete => { | ||||||
|  |                     if !self.0.delete(wtxn, &external_id)? { | ||||||
|  |                         panic!("Attempting to delete a non-existing document") | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         drop(iter); |  | ||||||
|  |  | ||||||
|         Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?)) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn merge_soft_into_hard(&mut self) -> fst::Result<()> { |  | ||||||
|         if self.soft.len() >= self.hard.len() / 2 { |  | ||||||
|             self.hard = self.to_fst()?.into_owned(); |  | ||||||
|             self.soft = fst::Map::default().map_data(Cow::Owned)?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| } |  | ||||||
|  |  | ||||||
| impl fmt::Debug for ExternalDocumentsIds<'_> { |     /// Returns an iterator over all the external ids. | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |     pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> { | ||||||
|         f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() |         self.0.iter(rtxn) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Default for ExternalDocumentsIds<'static> { |  | ||||||
|     fn default() -> Self { |  | ||||||
|         ExternalDocumentsIds { |  | ||||||
|             hard: fst::Map::default().map_data(Cow::Owned).unwrap(), |  | ||||||
|             soft: fst::Map::default().map_data(Cow::Owned).unwrap(), |  | ||||||
|             soft_deleted_docids: RoaringBitmap::new(), |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Returns the value of the `IndexedValue` with the highest _index_. |  | ||||||
| fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> { |  | ||||||
|     indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) |  | ||||||
| } |  | ||||||
|   | |||||||
| @@ -81,6 +81,12 @@ impl Default for FieldsIdsMap { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl crate::documents::FieldIdMapper for FieldsIdsMap { | ||||||
|  |     fn id(&self, name: &str) -> Option<FieldId> { | ||||||
|  |         self.id(name) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use super::*; |     use super::*; | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::heed_codec::BytesDecodeOwned; | use crate::heed_codec::BytesDecodeOwned; | ||||||
|  | use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||||
|  |  | ||||||
| /// This is the limit where using a byteorder became less size efficient | /// This is the limit where using a byteorder became less size efficient | ||||||
| /// than using a direct roaring encoding, it is also the point where we are able | /// than using a direct roaring encoding, it is also the point where we are able | ||||||
| @@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec { | |||||||
|     /// if the merged values length is under the threshold, values are directly |     /// if the merged values length is under the threshold, values are directly | ||||||
|     /// serialized in the buffer else a RoaringBitmap is created from the |     /// serialized in the buffer else a RoaringBitmap is created from the | ||||||
|     /// values and is serialized in the buffer. |     /// values and is serialized in the buffer. | ||||||
|     pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> { |     pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()> | ||||||
|  |     where | ||||||
|  |         I: IntoIterator<Item = A>, | ||||||
|  |         A: AsRef<[u8]>, | ||||||
|  |     { | ||||||
|         let mut roaring = RoaringBitmap::new(); |         let mut roaring = RoaringBitmap::new(); | ||||||
|         let mut vec = Vec::new(); |         let mut vec = Vec::new(); | ||||||
|  |  | ||||||
|         for bytes in slices { |         for bytes in slices { | ||||||
|             if bytes.len() <= THRESHOLD * size_of::<u32>() { |             if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() { | ||||||
|                 let mut reader = bytes.as_ref(); |                 let mut reader = bytes.as_ref(); | ||||||
|                 while let Ok(integer) = reader.read_u32::<NativeEndian>() { |                 while let Ok(integer) = reader.read_u32::<NativeEndian>() { | ||||||
|                     vec.push(integer); |                     vec.push(integer); | ||||||
| @@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec { | |||||||
|                 } |                 } | ||||||
|             } else { |             } else { | ||||||
|                 // We can unwrap safely because the vector is sorted upper. |                 // We can unwrap safely because the vector is sorted upper. | ||||||
|                 let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); |                 let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap(); | ||||||
|                 roaring.serialize_into(buffer)?; |                 roaring.serialize_into(buffer)?; | ||||||
|             } |             } | ||||||
|         } else { |         } else { | ||||||
| @@ -95,6 +100,33 @@ impl CboRoaringBitmapCodec { | |||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Merges a DelAdd delta into a CboRoaringBitmap. | ||||||
|  |     pub fn merge_deladd_into<'a>( | ||||||
|  |         deladd: KvReaderDelAdd<'_>, | ||||||
|  |         previous: &[u8], | ||||||
|  |         buffer: &'a mut Vec<u8>, | ||||||
|  |     ) -> io::Result<Option<&'a [u8]>> { | ||||||
|  |         // Deserialize the bitmap that is already there | ||||||
|  |         let mut previous = Self::deserialize_from(previous)?; | ||||||
|  |  | ||||||
|  |         // Remove integers we no more want in the previous bitmap | ||||||
|  |         if let Some(value) = deladd.get(DelAdd::Deletion) { | ||||||
|  |             previous -= Self::deserialize_from(value)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // Insert the new integers we want in the previous bitmap | ||||||
|  |         if let Some(value) = deladd.get(DelAdd::Addition) { | ||||||
|  |             previous |= Self::deserialize_from(value)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if previous.is_empty() { | ||||||
|  |             return Ok(None); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Self::serialize_into(&previous, buffer); | ||||||
|  |         Ok(Some(&buffer[..])) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { | impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec; | |||||||
| /// The documents returned by the iterator are grouped by the facet values that | /// The documents returned by the iterator are grouped by the facet values that | ||||||
| /// determined their rank. For example, given the documents: | /// determined their rank. For example, given the documents: | ||||||
| /// | /// | ||||||
| /// ```ignore | /// ```text | ||||||
| /// 0: { "colour": ["blue", "green"] } | /// 0: { "colour": ["blue", "green"] } | ||||||
| /// 1: { "colour": ["blue", "red"] } | /// 1: { "colour": ["blue", "red"] } | ||||||
| /// 2: { "colour": ["orange", "red"] } | /// 2: { "colour": ["orange", "red"] } | ||||||
| @@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec; | |||||||
| /// ``` | /// ``` | ||||||
| /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator | /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator | ||||||
| /// over the following elements: | /// over the following elements: | ||||||
| /// ```ignore | /// ```text | ||||||
| /// [0, 4]  // corresponds to all the documents within the candidates that have the facet value "blue" | /// [0, 4]  // corresponds to all the documents within the candidates that have the facet value "blue" | ||||||
| /// [3]     // same for "green" | /// [3]     // same for "green" | ||||||
| /// [2]     // same for "orange" | /// [2]     // same for "orange" | ||||||
|   | |||||||
| @@ -223,12 +223,9 @@ impl<'a> Filter<'a> { | |||||||
| impl<'a> Filter<'a> { | impl<'a> Filter<'a> { | ||||||
|     pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { |     pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { | ||||||
|         // to avoid doing this for each recursive call we're going to do it ONCE ahead of time |         // to avoid doing this for each recursive call we're going to do it ONCE ahead of time | ||||||
|         let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; |  | ||||||
|         let filterable_fields = index.filterable_fields(rtxn)?; |         let filterable_fields = index.filterable_fields(rtxn)?; | ||||||
|  |  | ||||||
|         // and finally we delete all the soft_deleted_documents, again, only once at the very end |  | ||||||
|         self.inner_evaluate(rtxn, index, &filterable_fields) |         self.inner_evaluate(rtxn, index, &filterable_fields) | ||||||
|             .map(|result| result - soft_deleted_documents) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn evaluate_operator( |     fn evaluate_operator( | ||||||
|   | |||||||
| @@ -12,7 +12,7 @@ use super::Word; | |||||||
| use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | ||||||
| use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | ||||||
| use crate::{ | use crate::{ | ||||||
|     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, |     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| /// A cache storing pointers to values in the LMDB databases. | /// A cache storing pointers to values in the LMDB databases. | ||||||
| @@ -25,7 +25,7 @@ pub struct DatabaseCache<'ctx> { | |||||||
|     pub word_pair_proximity_docids: |     pub word_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||||
|     pub word_prefix_pair_proximity_docids: |     pub word_prefix_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>, | ||||||
|     pub prefix_word_pair_proximity_docids: |     pub prefix_word_pair_proximity_docids: | ||||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, |         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||||
|     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>, |     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>, | ||||||
| @@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|                     merge_cbo_roaring_bitmaps, |                     merge_cbo_roaring_bitmaps, | ||||||
|                 ) |                 ) | ||||||
|             } |             } | ||||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( |             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||||
|                 self.txn, |                 self.txn, | ||||||
|                 word, |                 word, | ||||||
|                 self.word_interner.get(word).as_str(), |                 self.word_interner.get(word).as_str(), | ||||||
| @@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|         &mut self, |         &mut self, | ||||||
|         word: Interned<String>, |         word: Interned<String>, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( |         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||||
|             self.txn, |             self.txn, | ||||||
|             word, |             word, | ||||||
|             self.word_interner.get(word).as_str(), |             self.word_interner.get(word).as_str(), | ||||||
| @@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|                     merge_cbo_roaring_bitmaps, |                     merge_cbo_roaring_bitmaps, | ||||||
|                 ) |                 ) | ||||||
|             } |             } | ||||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( |             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||||
|                 self.txn, |                 self.txn, | ||||||
|                 prefix, |                 prefix, | ||||||
|                 self.word_interner.get(prefix).as_str(), |                 self.word_interner.get(prefix).as_str(), | ||||||
| @@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|         &mut self, |         &mut self, | ||||||
|         prefix: Interned<String>, |         prefix: Interned<String>, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( |         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||||
|             self.txn, |             self.txn, | ||||||
|             prefix, |             prefix, | ||||||
|             self.word_interner.get(prefix).as_str(), |             self.word_interner.get(prefix).as_str(), | ||||||
| @@ -297,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> { | |||||||
|         prefix2: Interned<String>, |         prefix2: Interned<String>, | ||||||
|         proximity: u8, |         proximity: u8, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( |         let docids = match self | ||||||
|             self.txn, |             .db_cache | ||||||
|             (proximity, word1, prefix2), |             .word_prefix_pair_proximity_docids | ||||||
|             &( |             .entry((proximity, word1, prefix2)) | ||||||
|                 proximity, |         { | ||||||
|                 self.word_interner.get(word1).as_str(), |             Entry::Occupied(docids) => docids.get().clone(), | ||||||
|                 self.word_interner.get(prefix2).as_str(), |             Entry::Vacant(entry) => { | ||||||
|             ), |                 // compute docids using prefix iter and store the result in the cache. | ||||||
|             &mut self.db_cache.word_prefix_pair_proximity_docids, |                 let key = U8StrStrCodec::bytes_encode(&( | ||||||
|             self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(), |                     proximity, | ||||||
|         ) |                     self.word_interner.get(word1).as_str(), | ||||||
|  |                     self.word_interner.get(prefix2).as_str(), | ||||||
|  |                 )) | ||||||
|  |                 .unwrap() | ||||||
|  |                 .into_owned(); | ||||||
|  |                 let mut prefix_docids = RoaringBitmap::new(); | ||||||
|  |                 let remap_key_type = self | ||||||
|  |                     .index | ||||||
|  |                     .word_pair_proximity_docids | ||||||
|  |                     .remap_key_type::<ByteSlice>() | ||||||
|  |                     .prefix_iter(self.txn, &key)?; | ||||||
|  |                 for result in remap_key_type { | ||||||
|  |                     let (_, docids) = result?; | ||||||
|  |  | ||||||
|  |                     prefix_docids |= docids; | ||||||
|  |                 } | ||||||
|  |                 entry.insert(Some(prefix_docids.clone())); | ||||||
|  |                 Some(prefix_docids) | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |         Ok(docids) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get_db_prefix_word_pair_proximity_docids( |     pub fn get_db_prefix_word_pair_proximity_docids( | ||||||
|         &mut self, |         &mut self, | ||||||
|         left_prefix: Interned<String>, |         left_prefix: Interned<String>, | ||||||
|         right: Interned<String>, |         right: Interned<String>, | ||||||
|         proximity: u8, |         proximity: u8, | ||||||
|     ) -> Result<Option<RoaringBitmap>> { |     ) -> Result<Option<RoaringBitmap>> { | ||||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( |         // only accept exact matches on reverted positions | ||||||
|             self.txn, |         self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) | ||||||
|             (proximity, left_prefix, right), |  | ||||||
|             &( |  | ||||||
|                 proximity, |  | ||||||
|                 self.word_interner.get(left_prefix).as_str(), |  | ||||||
|                 self.word_interner.get(right).as_str(), |  | ||||||
|             ), |  | ||||||
|             &mut self.db_cache.prefix_word_pair_proximity_docids, |  | ||||||
|             self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(), |  | ||||||
|         ) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get_db_word_fid_docids( |     pub fn get_db_word_fid_docids( | ||||||
|   | |||||||
| @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { | |||||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); |     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||||
|     s.query("best s"); |     s.query("best s"); | ||||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); | ||||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); |     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|  |  | ||||||
| @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { | |||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|         "\"this is the best summer meal\"", |         "\"this is the best summer meal\"", | ||||||
|         "\"summer best\"", |  | ||||||
|         "\"this is the best meal of summer\"", |         "\"this is the best meal of summer\"", | ||||||
|         "\"summer x best\"", |  | ||||||
|         "\"this is the best meal I have ever had in such a beautiful summer day\"", |         "\"this is the best meal I have ever had in such a beautiful summer day\"", | ||||||
|         "\"this is the best cooked meal of the summer\"", |         "\"this is the best cooked meal of the summer\"", | ||||||
|         "\"this is the best meal of the summer\"", |         "\"this is the best meal of the summer\"", | ||||||
|         "\"summer x y best\"", |         "\"summer x y best\"", | ||||||
|  |         "\"summer x best\"", | ||||||
|  |         "\"summer best\"", | ||||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", |         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
| @@ -423,17 +423,17 @@ fn test_proximity_prefix_db() { | |||||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); |     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||||
|     s.query("best win"); |     s.query("best win"); | ||||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); | ||||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); |     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|  |  | ||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|  |         "\"this is the best winter meal\"", | ||||||
|  |         "\"this is the best meal of winter\"", | ||||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", |         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||||
|         "\"this is the best cooked meal of the winter\"", |         "\"this is the best cooked meal of the winter\"", | ||||||
|         "\"this is the best meal of the winter\"", |         "\"this is the best meal of the winter\"", | ||||||
|         "\"this is the best meal of winter\"", |  | ||||||
|         "\"this is the best winter meal\"", |  | ||||||
|         "\"winter x y best\"", |         "\"winter x y best\"", | ||||||
|         "\"winter x best\"", |         "\"winter x best\"", | ||||||
|         "\"winter best\"", |         "\"winter best\"", | ||||||
| @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { | |||||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); |     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||||
|     s.query("best wi"); |     s.query("best wi"); | ||||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); | ||||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); |     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|  |  | ||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|         "\"this is the best winter meal\"", |         "\"this is the best winter meal\"", | ||||||
|         "\"winter best\"", |  | ||||||
|         "\"this is the best meal of winter\"", |         "\"this is the best meal of winter\"", | ||||||
|         "\"winter x best\"", |  | ||||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", |         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||||
|         "\"this is the best cooked meal of the winter\"", |         "\"this is the best cooked meal of the winter\"", | ||||||
|         "\"this is the best meal of the winter\"", |         "\"this is the best meal of the winter\"", | ||||||
|         "\"winter x y best\"", |         "\"winter x y best\"", | ||||||
|  |         "\"winter x best\"", | ||||||
|  |         "\"winter best\"", | ||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
| } | } | ||||||
|   | |||||||
| @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
|     ], |     ], | ||||||
|     [ |  | ||||||
|         Proximity( |  | ||||||
|             Rank { |  | ||||||
|                 rank: 3, |  | ||||||
|                 max_rank: 4, |  | ||||||
|             }, |  | ||||||
|         ), |  | ||||||
|     ], |  | ||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
| @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
|                 rank: 2, |                 rank: 1, | ||||||
|  |                 max_rank: 4, | ||||||
|  |             }, | ||||||
|  |         ), | ||||||
|  |     ], | ||||||
|  |     [ | ||||||
|  |         Proximity( | ||||||
|  |             Rank { | ||||||
|  |                 rank: 1, | ||||||
|                 max_rank: 4, |                 max_rank: 4, | ||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
|   | |||||||
| @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
|     ], |     ], | ||||||
|     [ |  | ||||||
|         Proximity( |  | ||||||
|             Rank { |  | ||||||
|                 rank: 3, |  | ||||||
|                 max_rank: 4, |  | ||||||
|             }, |  | ||||||
|         ), |  | ||||||
|     ], |  | ||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
| @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
|                 rank: 2, |                 rank: 1, | ||||||
|  |                 max_rank: 4, | ||||||
|  |             }, | ||||||
|  |         ), | ||||||
|  |     ], | ||||||
|  |     [ | ||||||
|  |         Proximity( | ||||||
|  |             Rank { | ||||||
|  |                 rank: 1, | ||||||
|                 max_rank: 4, |                 max_rank: 4, | ||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
|                 rank: 1, |                 rank: 4, | ||||||
|                 max_rank: 4, |                 max_rank: 4, | ||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
| @@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")" | |||||||
|     [ |     [ | ||||||
|         Proximity( |         Proximity( | ||||||
|             Rank { |             Rank { | ||||||
|                 rank: 1, |                 rank: 2, | ||||||
|                 max_rank: 4, |                 max_rank: 4, | ||||||
|             }, |             }, | ||||||
|         ), |         ), | ||||||
|   | |||||||
| @@ -13,6 +13,7 @@ This module tests the `sort` ranking rule: | |||||||
|  |  | ||||||
| use big_s::S; | use big_s::S; | ||||||
| use maplit::hashset; | use maplit::hashset; | ||||||
|  | use meili_snap::insta; | ||||||
|  |  | ||||||
| use crate::index::tests::TempIndex; | use crate::index::tests::TempIndex; | ||||||
| use crate::search::new::tests::collect_field_values; | use crate::search::new::tests::collect_field_values; | ||||||
|   | |||||||
| @@ -4,9 +4,8 @@ use std::path::Path; | |||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::facet::FacetType; |  | ||||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; | use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; | ||||||
| use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index}; | use crate::{make_db_snap_from_iter, obkv_to_json, Index}; | ||||||
|  |  | ||||||
| #[track_caller] | #[track_caller] | ||||||
| pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { | pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { | ||||||
| @@ -98,7 +97,6 @@ Create a snapshot test of the given database. | |||||||
|     - `facet_id_string_docids` |     - `facet_id_string_docids` | ||||||
|     - `documents_ids` |     - `documents_ids` | ||||||
|     - `stop_words` |     - `stop_words` | ||||||
|     - `soft_deleted_documents_ids` |  | ||||||
|     - `field_distribution` |     - `field_distribution` | ||||||
|     - `fields_ids_map` |     - `fields_ids_map` | ||||||
|     - `geo_faceted_documents_ids` |     - `geo_faceted_documents_ids` | ||||||
| @@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { | |||||||
|         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) |         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) | ||||||
|     }) |     }) | ||||||
| } | } | ||||||
| pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { |  | ||||||
|     make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( |  | ||||||
|         (proximity, word1, prefix), |  | ||||||
|         b, |  | ||||||
|     )| { |  | ||||||
|         &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) |  | ||||||
|     }) |  | ||||||
| } |  | ||||||
| pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { |  | ||||||
|     make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( |  | ||||||
|         (proximity, prefix, word2), |  | ||||||
|         b, |  | ||||||
|     )| { |  | ||||||
|         &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) |  | ||||||
|     }) |  | ||||||
| } |  | ||||||
| pub fn snap_word_position_docids(index: &Index) -> String { | pub fn snap_word_position_docids(index: &Index) -> String { | ||||||
|     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { |     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { | ||||||
|         &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) |         &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) | ||||||
| @@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String { | |||||||
|     let snap = format!("{stop_words:?}"); |     let snap = format!("{stop_words:?}"); | ||||||
|     snap |     snap | ||||||
| } | } | ||||||
| pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { |  | ||||||
|     let rtxn = index.read_txn().unwrap(); |  | ||||||
|     let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); |  | ||||||
|  |  | ||||||
|     display_bitmap(&soft_deleted_documents_ids) |  | ||||||
| } |  | ||||||
| pub fn snap_field_distributions(index: &Index) -> String { | pub fn snap_field_distributions(index: &Index) -> String { | ||||||
|     let rtxn = index.read_txn().unwrap(); |     let rtxn = index.read_txn().unwrap(); | ||||||
|     let mut snap = String::new(); |     let mut snap = String::new(); | ||||||
| @@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { | |||||||
| } | } | ||||||
| pub fn snap_external_documents_ids(index: &Index) -> String { | pub fn snap_external_documents_ids(index: &Index) -> String { | ||||||
|     let rtxn = index.read_txn().unwrap(); |     let rtxn = index.read_txn().unwrap(); | ||||||
|     let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); |     let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap(); | ||||||
|  |     // ensure fixed order (not guaranteed by hashmap) | ||||||
|  |     let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); | ||||||
|  |     external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); | ||||||
|  |  | ||||||
|     let mut snap = String::new(); |     let mut snap = String::new(); | ||||||
|  |  | ||||||
|     writeln!(&mut snap, "soft:").unwrap(); |     writeln!(&mut snap, "docids:").unwrap(); | ||||||
|     let stream_soft = soft.stream(); |     for (key, id) in external_ids { | ||||||
|     let soft_external_ids = stream_soft.into_str_vec().unwrap(); |  | ||||||
|     for (key, id) in soft_external_ids { |  | ||||||
|         writeln!(&mut snap, "{key:<24} {id}").unwrap(); |  | ||||||
|     } |  | ||||||
|     writeln!(&mut snap, "hard:").unwrap(); |  | ||||||
|     let stream_hard = hard.stream(); |  | ||||||
|     let hard_external_ids = stream_hard.into_str_vec().unwrap(); |  | ||||||
|     for (key, id) in hard_external_ids { |  | ||||||
|         writeln!(&mut snap, "{key:<24} {id}").unwrap(); |         writeln!(&mut snap, "{key:<24} {id}").unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     snap |     snap | ||||||
| } | } | ||||||
| pub fn snap_number_faceted_documents_ids(index: &Index) -> String { |  | ||||||
|     let rtxn = index.read_txn().unwrap(); |  | ||||||
|     let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); |  | ||||||
|     let mut snap = String::new(); |  | ||||||
|     for field_id in fields_ids_map.ids() { |  | ||||||
|         let number_faceted_documents_ids = |  | ||||||
|             index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); |  | ||||||
|         writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) |  | ||||||
|             .unwrap(); |  | ||||||
|     } |  | ||||||
|     snap |  | ||||||
| } |  | ||||||
| pub fn snap_string_faceted_documents_ids(index: &Index) -> String { |  | ||||||
|     let rtxn = index.read_txn().unwrap(); |  | ||||||
|     let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); |  | ||||||
|  |  | ||||||
|     let mut snap = String::new(); |  | ||||||
|     for field_id in fields_ids_map.ids() { |  | ||||||
|         let string_faceted_documents_ids = |  | ||||||
|             index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); |  | ||||||
|         writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) |  | ||||||
|             .unwrap(); |  | ||||||
|     } |  | ||||||
|     snap |  | ||||||
| } |  | ||||||
| pub fn snap_words_fst(index: &Index) -> String { | pub fn snap_words_fst(index: &Index) -> String { | ||||||
|     let rtxn = index.read_txn().unwrap(); |     let rtxn = index.read_txn().unwrap(); | ||||||
|     let words_fst = index.words_fst(&rtxn).unwrap(); |     let words_fst = index.words_fst(&rtxn).unwrap(); | ||||||
| @@ -516,9 +463,6 @@ macro_rules! full_snap_of_db { | |||||||
|     ($index:ident, stop_words) => {{ |     ($index:ident, stop_words) => {{ | ||||||
|         $crate::snapshot_tests::snap_stop_words(&$index) |         $crate::snapshot_tests::snap_stop_words(&$index) | ||||||
|     }}; |     }}; | ||||||
|     ($index:ident, soft_deleted_documents_ids) => {{ |  | ||||||
|         $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) |  | ||||||
|     }}; |  | ||||||
|     ($index:ident, field_distribution) => {{ |     ($index:ident, field_distribution) => {{ | ||||||
|         $crate::snapshot_tests::snap_field_distributions(&$index) |         $crate::snapshot_tests::snap_field_distributions(&$index) | ||||||
|     }}; |     }}; | ||||||
| @@ -531,12 +475,6 @@ macro_rules! full_snap_of_db { | |||||||
|     ($index:ident, external_documents_ids) => {{ |     ($index:ident, external_documents_ids) => {{ | ||||||
|         $crate::snapshot_tests::snap_external_documents_ids(&$index) |         $crate::snapshot_tests::snap_external_documents_ids(&$index) | ||||||
|     }}; |     }}; | ||||||
|     ($index:ident, number_faceted_documents_ids) => {{ |  | ||||||
|         $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) |  | ||||||
|     }}; |  | ||||||
|     ($index:ident, string_faceted_documents_ids) => {{ |  | ||||||
|         $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) |  | ||||||
|     }}; |  | ||||||
|     ($index:ident, words_fst) => {{ |     ($index:ident, words_fst) => {{ | ||||||
|         $crate::snapshot_tests::snap_words_fst(&$index) |         $crate::snapshot_tests::snap_words_fst(&$index) | ||||||
|     }}; |     }}; | ||||||
|   | |||||||
| @@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl AvailableDocumentsIds { | impl AvailableDocumentsIds { | ||||||
|     pub fn from_documents_ids( |     pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { | ||||||
|         docids: &RoaringBitmap, |         match docids.max() { | ||||||
|         soft_deleted_docids: &RoaringBitmap, |  | ||||||
|     ) -> AvailableDocumentsIds { |  | ||||||
|         let used_docids = docids | soft_deleted_docids; |  | ||||||
|  |  | ||||||
|         match used_docids.max() { |  | ||||||
|             Some(last_id) => { |             Some(last_id) => { | ||||||
|                 let mut available = RoaringBitmap::from_iter(0..last_id); |                 let mut available = RoaringBitmap::from_iter(0..last_id); | ||||||
|                 available -= used_docids; |                 available -= docids; | ||||||
|  |  | ||||||
|                 let iter = match last_id.checked_add(1) { |                 let iter = match last_id.checked_add(1) { | ||||||
|                     Some(id) => id..=u32::max_value(), |                     Some(id) => id..=u32::max_value(), | ||||||
| @@ -50,7 +45,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn empty() { |     fn empty() { | ||||||
|         let base = RoaringBitmap::new(); |         let base = RoaringBitmap::new(); | ||||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); |         let left = AvailableDocumentsIds::from_documents_ids(&base); | ||||||
|         let right = 0..=u32::max_value(); |         let right = 0..=u32::max_value(); | ||||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); |         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); | ||||||
|     } |     } | ||||||
| @@ -63,28 +58,8 @@ mod tests { | |||||||
|         base.insert(100); |         base.insert(100); | ||||||
|         base.insert(405); |         base.insert(405); | ||||||
|  |  | ||||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); |         let left = AvailableDocumentsIds::from_documents_ids(&base); | ||||||
|         let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); |         let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); | ||||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); |         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn soft_deleted() { |  | ||||||
|         let mut base = RoaringBitmap::new(); |  | ||||||
|         base.insert(0); |  | ||||||
|         base.insert(10); |  | ||||||
|         base.insert(100); |  | ||||||
|         base.insert(405); |  | ||||||
|  |  | ||||||
|         let mut soft_deleted = RoaringBitmap::new(); |  | ||||||
|         soft_deleted.insert(1); |  | ||||||
|         soft_deleted.insert(11); |  | ||||||
|         soft_deleted.insert(101); |  | ||||||
|         soft_deleted.insert(406); |  | ||||||
|  |  | ||||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); |  | ||||||
|         let right = |  | ||||||
|             (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); |  | ||||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,8 +1,7 @@ | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use time::OffsetDateTime; | use time::OffsetDateTime; | ||||||
|  |  | ||||||
| use crate::facet::FacetType; | use crate::{FieldDistribution, Index, Result}; | ||||||
| use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; |  | ||||||
|  |  | ||||||
| pub struct ClearDocuments<'t, 'u, 'i> { | pub struct ClearDocuments<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -21,13 +20,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         let Index { |         let Index { | ||||||
|             env: _env, |             env: _env, | ||||||
|             main: _main, |             main: _main, | ||||||
|  |             external_documents_ids, | ||||||
|             word_docids, |             word_docids, | ||||||
|             exact_word_docids, |             exact_word_docids, | ||||||
|             word_prefix_docids, |             word_prefix_docids, | ||||||
|             exact_word_prefix_docids, |             exact_word_prefix_docids, | ||||||
|             word_pair_proximity_docids, |             word_pair_proximity_docids, | ||||||
|             word_prefix_pair_proximity_docids, |  | ||||||
|             prefix_word_pair_proximity_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|             field_id_word_count_docids, |             field_id_word_count_docids, | ||||||
| @@ -51,43 +49,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|  |  | ||||||
|         // We retrieve the number of documents ids that we are deleting. |         // We retrieve the number of documents ids that we are deleting. | ||||||
|         let number_of_documents = self.index.number_of_documents(self.wtxn)?; |         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; |  | ||||||
|  |  | ||||||
|         // We clean some of the main engine data structures. |         // We clean some of the main engine data structures. | ||||||
|         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; |         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; | ||||||
|         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; |         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; | ||||||
|         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; |  | ||||||
|         self.index.put_documents_ids(self.wtxn, &empty_roaring)?; |         self.index.put_documents_ids(self.wtxn, &empty_roaring)?; | ||||||
|         self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; |  | ||||||
|         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; |         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; | ||||||
|         self.index.delete_geo_rtree(self.wtxn)?; |         self.index.delete_geo_rtree(self.wtxn)?; | ||||||
|         self.index.delete_geo_faceted_documents_ids(self.wtxn)?; |         self.index.delete_geo_faceted_documents_ids(self.wtxn)?; | ||||||
|         self.index.delete_vector_hnsw(self.wtxn)?; |         self.index.delete_vector_hnsw(self.wtxn)?; | ||||||
|  |  | ||||||
|         // We clean all the faceted documents ids. |  | ||||||
|         for field_id in faceted_fields { |  | ||||||
|             self.index.put_faceted_documents_ids( |  | ||||||
|                 self.wtxn, |  | ||||||
|                 field_id, |  | ||||||
|                 FacetType::Number, |  | ||||||
|                 &empty_roaring, |  | ||||||
|             )?; |  | ||||||
|             self.index.put_faceted_documents_ids( |  | ||||||
|                 self.wtxn, |  | ||||||
|                 field_id, |  | ||||||
|                 FacetType::String, |  | ||||||
|                 &empty_roaring, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // Clear the other databases. |         // Clear the other databases. | ||||||
|  |         external_documents_ids.clear(self.wtxn)?; | ||||||
|         word_docids.clear(self.wtxn)?; |         word_docids.clear(self.wtxn)?; | ||||||
|         exact_word_docids.clear(self.wtxn)?; |         exact_word_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_docids.clear(self.wtxn)?; |         word_prefix_docids.clear(self.wtxn)?; | ||||||
|         exact_word_prefix_docids.clear(self.wtxn)?; |         exact_word_prefix_docids.clear(self.wtxn)?; | ||||||
|         word_pair_proximity_docids.clear(self.wtxn)?; |         word_pair_proximity_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; |  | ||||||
|         prefix_word_pair_proximity_docids.clear(self.wtxn)?; |  | ||||||
|         word_position_docids.clear(self.wtxn)?; |         word_position_docids.clear(self.wtxn)?; | ||||||
|         word_fid_docids.clear(self.wtxn)?; |         word_fid_docids.clear(self.wtxn)?; | ||||||
|         field_id_word_count_docids.clear(self.wtxn)?; |         field_id_word_count_docids.clear(self.wtxn)?; | ||||||
| @@ -140,7 +118,7 @@ mod tests { | |||||||
|  |  | ||||||
|         assert!(index.words_fst(&rtxn).unwrap().is_empty()); |         assert!(index.words_fst(&rtxn).unwrap().is_empty()); | ||||||
|         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); |         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); | ||||||
|         assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); |         assert!(index.external_documents_ids().is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.documents_ids(&rtxn).unwrap().is_empty()); |         assert!(index.documents_ids(&rtxn).unwrap().is_empty()); | ||||||
|         assert!(index.field_distribution(&rtxn).unwrap().is_empty()); |         assert!(index.field_distribution(&rtxn).unwrap().is_empty()); | ||||||
|         assert!(index.geo_rtree(&rtxn).unwrap().is_none()); |         assert!(index.geo_rtree(&rtxn).unwrap().is_none()); | ||||||
| @@ -150,7 +128,6 @@ mod tests { | |||||||
|         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); |         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); |         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); |         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); |  | ||||||
|         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); |         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); |         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); | ||||||
|         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); |         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); | ||||||
|   | |||||||
							
								
								
									
										125
									
								
								milli/src/update/del_add.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								milli/src/update/del_add.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,125 @@ | |||||||
|  | use obkv::Key; | ||||||
|  |  | ||||||
|  | pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>; | ||||||
|  | pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; | ||||||
|  |  | ||||||
|  | /// DelAdd defines the new value to add in the database and old value to delete from the database. | ||||||
|  | /// | ||||||
|  | /// It's used in an OBKV to be serialized in grenad files. | ||||||
|  | #[repr(u8)] | ||||||
|  | #[derive(Clone, Copy, PartialOrd, PartialEq, Debug)] | ||||||
|  | pub enum DelAdd { | ||||||
|  |     Deletion = 0, | ||||||
|  |     Addition = 1, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Key for DelAdd { | ||||||
|  |     const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>(); | ||||||
|  |     type BYTES = [u8; Self::BYTES_SIZE]; | ||||||
|  |  | ||||||
|  |     fn to_be_bytes(&self) -> Self::BYTES { | ||||||
|  |         u8::to_be_bytes(*self as u8) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn from_be_bytes(array: Self::BYTES) -> Self { | ||||||
|  |         match u8::from_be_bytes(array) { | ||||||
|  |             0 => Self::Deletion, | ||||||
|  |             1 => Self::Addition, | ||||||
|  |             otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value> | ||||||
|  | /// | ||||||
|  | /// Deletion: put all the values under DelAdd::Deletion | ||||||
|  | /// Addition: put all the values under DelAdd::Addition, | ||||||
|  | /// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition, | ||||||
|  | pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>( | ||||||
|  |     reader: obkv::KvReader<K>, | ||||||
|  |     operation: DelAddOperation, | ||||||
|  |     buffer: &mut Vec<u8>, | ||||||
|  | ) -> Result<(), std::io::Error> { | ||||||
|  |     let mut writer = obkv::KvWriter::new(buffer); | ||||||
|  |     let mut value_buffer = Vec::new(); | ||||||
|  |     for (key, value) in reader.iter() { | ||||||
|  |         value_buffer.clear(); | ||||||
|  |         let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |         if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) { | ||||||
|  |             value_writer.insert(DelAdd::Deletion, value)?; | ||||||
|  |         } | ||||||
|  |         if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) { | ||||||
|  |             value_writer.insert(DelAdd::Addition, value)?; | ||||||
|  |         } | ||||||
|  |         value_writer.finish()?; | ||||||
|  |         writer.insert(key, &value_buffer)?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     writer.finish() | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Enum controlling the side of the DelAdd obkv in which the provided value will be written. | ||||||
|  | #[derive(Debug, Clone, Copy)] | ||||||
|  | pub enum DelAddOperation { | ||||||
|  |     Deletion, | ||||||
|  |     Addition, | ||||||
|  |     DeletionAndAddition, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value> | ||||||
|  | /// | ||||||
|  | /// putting each deletion obkv's keys under a DelAdd::Deletion | ||||||
|  | /// and putting each addition obkv's keys under a DelAdd::Addition | ||||||
|  | pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( | ||||||
|  |     deletion: obkv::KvReader<K>, | ||||||
|  |     addition: obkv::KvReader<K>, | ||||||
|  |     buffer: &mut Vec<u8>, | ||||||
|  | ) -> Result<(), std::io::Error> { | ||||||
|  |     use itertools::merge_join_by; | ||||||
|  |     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||||
|  |  | ||||||
|  |     let mut writer = obkv::KvWriter::new(buffer); | ||||||
|  |     let mut value_buffer = Vec::new(); | ||||||
|  |  | ||||||
|  |     for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||||
|  |         value_buffer.clear(); | ||||||
|  |         match eob { | ||||||
|  |             Left((k, v)) => { | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, v).unwrap(); | ||||||
|  |                 writer.insert(k, value_writer.into_inner()?).unwrap(); | ||||||
|  |             } | ||||||
|  |             Right((k, v)) => { | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, v).unwrap(); | ||||||
|  |                 writer.insert(k, value_writer.into_inner()?).unwrap(); | ||||||
|  |             } | ||||||
|  |             Both((k, deletion), (_, addition)) => { | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, deletion).unwrap(); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, addition).unwrap(); | ||||||
|  |                 writer.insert(k, value_writer.into_inner()?).unwrap(); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     writer.finish() | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { | ||||||
|  |     del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A function that extracts and returns the Add side of a DelAdd obkv. | ||||||
|  | /// This is useful when there is no previous value in the database and | ||||||
|  | /// therefore we don't need to do a diff with what's already there. | ||||||
|  | /// | ||||||
|  | /// If there is no Add side we currently write an empty buffer | ||||||
|  | /// which is a valid CboRoaringBitmap. | ||||||
|  | #[allow(clippy::ptr_arg)] // required to avoid signature mismatch | ||||||
|  | pub fn deladd_serialize_add_side<'a>( | ||||||
|  |     obkv: &'a [u8], | ||||||
|  |     _buffer: &mut Vec<u8>, | ||||||
|  | ) -> crate::Result<&'a [u8]> { | ||||||
|  |     Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) | ||||||
|  | } | ||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,10 +1,9 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
|  |  | ||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use heed::{BytesEncode, Error, RoTxn, RwTxn}; | use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; | use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; | ||||||
| @@ -13,17 +12,15 @@ use crate::heed_codec::facet::{ | |||||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, |     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||||
| }; | }; | ||||||
| use crate::heed_codec::ByteSliceRefCodec; | use crate::heed_codec::ByteSliceRefCodec; | ||||||
|  | use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||||
| use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; | use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; | ||||||
| use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; | use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; | ||||||
|  |  | ||||||
| /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases | /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases | ||||||
| /// by rebuilding the database "from scratch". | /// by rebuilding the database "from scratch". | ||||||
| /// | /// | ||||||
| /// First, the new elements are inserted into the level 0 of the database. Then, the | /// First, the new elements are inserted into the level 0 of the database. Then, the | ||||||
| /// higher levels are cleared and recomputed from the content of level 0. | /// higher levels are cleared and recomputed from the content of level 0. | ||||||
| /// |  | ||||||
| /// Finally, the `faceted_documents_ids` value in the main database of `Index` |  | ||||||
| /// is updated to contain the new set of faceted documents. |  | ||||||
| pub struct FacetsUpdateBulk<'i> { | pub struct FacetsUpdateBulk<'i> { | ||||||
|     index: &'i Index, |     index: &'i Index, | ||||||
|     group_size: u8, |     group_size: u8, | ||||||
| @@ -31,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> { | |||||||
|     facet_type: FacetType, |     facet_type: FacetType, | ||||||
|     field_ids: Vec<FieldId>, |     field_ids: Vec<FieldId>, | ||||||
|     // None if level 0 does not need to be updated |     // None if level 0 does not need to be updated | ||||||
|     new_data: Option<grenad::Reader<BufReader<File>>>, |     delta_data: Option<grenad::Reader<BufReader<File>>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'i> FacetsUpdateBulk<'i> { | impl<'i> FacetsUpdateBulk<'i> { | ||||||
| @@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> { | |||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         field_ids: Vec<FieldId>, |         field_ids: Vec<FieldId>, | ||||||
|         facet_type: FacetType, |         facet_type: FacetType, | ||||||
|         new_data: grenad::Reader<BufReader<File>>, |         delta_data: grenad::Reader<BufReader<File>>, | ||||||
|         group_size: u8, |         group_size: u8, | ||||||
|         min_level_size: u8, |         min_level_size: u8, | ||||||
|     ) -> FacetsUpdateBulk<'i> { |     ) -> FacetsUpdateBulk<'i> { | ||||||
| @@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> { | |||||||
|             group_size, |             group_size, | ||||||
|             min_level_size, |             min_level_size, | ||||||
|             facet_type, |             facet_type, | ||||||
|             new_data: Some(new_data), |             delta_data: Some(delta_data), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -64,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> { | |||||||
|             group_size: FACET_GROUP_SIZE, |             group_size: FACET_GROUP_SIZE, | ||||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, |             min_level_size: FACET_MIN_LEVEL_SIZE, | ||||||
|             facet_type, |             facet_type, | ||||||
|             new_data: None, |             delta_data: None, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[logging_timer::time("FacetsUpdateBulk::{}")] |     #[logging_timer::time("FacetsUpdateBulk::{}")] | ||||||
|     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { |     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { | ||||||
|         let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; |         let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; | ||||||
|  |  | ||||||
|         let db = match facet_type { |         let db = match facet_type { | ||||||
|             FacetType::String => index |             FacetType::String => index | ||||||
| @@ -81,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> { | |||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; |         let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; | ||||||
|  |  | ||||||
|         inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { |         inner.update(wtxn, &field_ids)?; | ||||||
|             index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; |  | ||||||
|             Ok(()) |  | ||||||
|         })?; |  | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| @@ -95,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> { | |||||||
| /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type | /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type | ||||||
| pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> { | pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> { | ||||||
|     pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, |     pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||||
|     pub new_data: Option<grenad::Reader<R>>, |     pub delta_data: Option<grenad::Reader<R>>, | ||||||
|     pub group_size: u8, |     pub group_size: u8, | ||||||
|     pub min_level_size: u8, |     pub min_level_size: u8, | ||||||
| } | } | ||||||
| impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||||
|     pub fn update( |     pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> { | ||||||
|         mut self, |  | ||||||
|         wtxn: &mut RwTxn, |  | ||||||
|         field_ids: &[u16], |  | ||||||
|         mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         self.update_level0(wtxn)?; |         self.update_level0(wtxn)?; | ||||||
|         for &field_id in field_ids.iter() { |         for &field_id in field_ids.iter() { | ||||||
|             self.clear_levels(wtxn, field_id)?; |             self.clear_levels(wtxn, field_id)?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for &field_id in field_ids.iter() { |         for &field_id in field_ids.iter() { | ||||||
|             let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; |             let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; | ||||||
|  |  | ||||||
|             handle_all_docids(wtxn, field_id, all_docids)?; |  | ||||||
|  |  | ||||||
|             for level_reader in level_readers { |             for level_reader in level_readers { | ||||||
|                 let mut cursor = level_reader.into_cursor()?; |                 let mut cursor = level_reader.into_cursor()?; | ||||||
| @@ -133,19 +120,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|         self.db.delete_range(wtxn, &range).map(drop)?; |         self.db.delete_range(wtxn, &range).map(drop)?; | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { |     fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { | ||||||
|         let new_data = match self.new_data.take() { |         let delta_data = match self.delta_data.take() { | ||||||
|             Some(x) => x, |             Some(x) => x, | ||||||
|             None => return Ok(()), |             None => return Ok(()), | ||||||
|         }; |         }; | ||||||
|         if self.db.is_empty(wtxn)? { |         if self.db.is_empty(wtxn)? { | ||||||
|             let mut buffer = Vec::new(); |             let mut buffer = Vec::new(); | ||||||
|             let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); |             let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); | ||||||
|             let mut cursor = new_data.into_cursor()?; |             let mut cursor = delta_data.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|                 if !valid_lmdb_key(key) { |                 if !valid_lmdb_key(key) { | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|  |                 let value = KvReaderDelAdd::new(value); | ||||||
|  |  | ||||||
|  |                 // DB is empty, it is safe to ignore Del operations | ||||||
|  |                 let Some(value) = value.get(DelAdd::Addition) else { | ||||||
|  |                     continue; | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|                 buffer.clear(); |                 buffer.clear(); | ||||||
|                 // the group size for level 0 |                 // the group size for level 0 | ||||||
|                 buffer.push(1); |                 buffer.push(1); | ||||||
| @@ -157,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|             let mut buffer = Vec::new(); |             let mut buffer = Vec::new(); | ||||||
|             let database = self.db.remap_types::<ByteSlice, ByteSlice>(); |             let database = self.db.remap_types::<ByteSlice, ByteSlice>(); | ||||||
|  |  | ||||||
|             let mut cursor = new_data.into_cursor()?; |             let mut cursor = delta_data.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|                 if !valid_lmdb_key(key) { |                 if !valid_lmdb_key(key) { | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|  |                 let value = KvReaderDelAdd::new(value); | ||||||
|  |  | ||||||
|                 // the value is a CboRoaringBitmap, but I still need to prepend the |                 // the value is a CboRoaringBitmap, but I still need to prepend the | ||||||
|                 // group size for level 0 (= 1) to it |                 // group size for level 0 (= 1) to it | ||||||
|                 buffer.clear(); |                 buffer.clear(); | ||||||
| @@ -169,17 +167,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|                 // then we extend the buffer with the docids bitmap |                 // then we extend the buffer with the docids bitmap | ||||||
|                 match database.get(wtxn, key)? { |                 match database.get(wtxn, key)? { | ||||||
|                     Some(prev_value) => { |                     Some(prev_value) => { | ||||||
|  |                         // prev_value is the group size for level 0, followed by the previous bitmap. | ||||||
|                         let old_bitmap = &prev_value[1..]; |                         let old_bitmap = &prev_value[1..]; | ||||||
|                         CboRoaringBitmapCodec::merge_into( |                         CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; | ||||||
|                             &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], |  | ||||||
|                             &mut buffer, |  | ||||||
|                         )?; |  | ||||||
|                     } |                     } | ||||||
|                     None => { |                     None => { | ||||||
|  |                         // it is safe to ignore the del in that case. | ||||||
|  |                         let Some(value) = value.get(DelAdd::Addition) else { | ||||||
|  |                             // won't put the key in DB as the value would be empty | ||||||
|  |                             continue; | ||||||
|  |                         }; | ||||||
|  |  | ||||||
|                         buffer.extend_from_slice(value); |                         buffer.extend_from_slice(value); | ||||||
|                     } |                     } | ||||||
|                 }; |                 }; | ||||||
|                 database.put(wtxn, key, &buffer)?; |                 let new_bitmap = &buffer[1..]; | ||||||
|  |                 // if the new bitmap is empty, let's remove it | ||||||
|  |                 if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 { | ||||||
|  |                     database.delete(wtxn, key)?; | ||||||
|  |                 } else { | ||||||
|  |                     database.put(wtxn, key, &buffer)?; | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -188,16 +196,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|         &self, |         &self, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
|         txn: &RoTxn, |         txn: &RoTxn, | ||||||
|     ) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> { |     ) -> Result<Vec<grenad::Reader<BufReader<File>>>> { | ||||||
|         let mut all_docids = RoaringBitmap::new(); |         let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?; | ||||||
|         let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { |  | ||||||
|             for bitmap in bitmaps { |  | ||||||
|                 all_docids |= bitmap; |  | ||||||
|             } |  | ||||||
|             Ok(()) |  | ||||||
|         })?; |  | ||||||
|  |  | ||||||
|         Ok((subwriters, all_docids)) |         Ok(subwriters) | ||||||
|     } |     } | ||||||
|     #[allow(clippy::type_complexity)] |     #[allow(clippy::type_complexity)] | ||||||
|     fn read_level_0<'t>( |     fn read_level_0<'t>( | ||||||
| @@ -491,7 +493,6 @@ mod tests { | |||||||
|         index.add_documents(documents).unwrap(); |         index.add_documents(documents).unwrap(); | ||||||
|  |  | ||||||
|         db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); |         db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); | ||||||
|         db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|   | |||||||
| @@ -1,360 +0,0 @@ | |||||||
| use std::collections::{HashMap, HashSet}; |  | ||||||
|  |  | ||||||
| use heed::RwTxn; |  | ||||||
| use log::debug; |  | ||||||
| use roaring::RoaringBitmap; |  | ||||||
| use time::OffsetDateTime; |  | ||||||
|  |  | ||||||
| use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; |  | ||||||
| use crate::facet::FacetType; |  | ||||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; |  | ||||||
| use crate::heed_codec::ByteSliceRefCodec; |  | ||||||
| use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; |  | ||||||
| use crate::{FieldId, Index, Result}; |  | ||||||
|  |  | ||||||
| /// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. |  | ||||||
| /// |  | ||||||
| /// Depending on the number of removed elements and the existing size of the database, we use either |  | ||||||
| /// a bulk delete method or an incremental delete method. |  | ||||||
| pub struct FacetsDelete<'i, 'b> { |  | ||||||
|     index: &'i Index, |  | ||||||
|     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, |  | ||||||
|     facet_type: FacetType, |  | ||||||
|     affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>, |  | ||||||
|     docids_to_delete: &'b RoaringBitmap, |  | ||||||
|     group_size: u8, |  | ||||||
|     max_group_size: u8, |  | ||||||
|     min_level_size: u8, |  | ||||||
| } |  | ||||||
| impl<'i, 'b> FacetsDelete<'i, 'b> { |  | ||||||
|     pub fn new( |  | ||||||
|         index: &'i Index, |  | ||||||
|         facet_type: FacetType, |  | ||||||
|         affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>, |  | ||||||
|         docids_to_delete: &'b RoaringBitmap, |  | ||||||
|     ) -> Self { |  | ||||||
|         let database = match facet_type { |  | ||||||
|             FacetType::String => index |  | ||||||
|                 .facet_id_string_docids |  | ||||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), |  | ||||||
|             FacetType::Number => { |  | ||||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
|         Self { |  | ||||||
|             index, |  | ||||||
|             database, |  | ||||||
|             facet_type, |  | ||||||
|             affected_facet_values, |  | ||||||
|             docids_to_delete, |  | ||||||
|             group_size: FACET_GROUP_SIZE, |  | ||||||
|             max_group_size: FACET_MAX_GROUP_SIZE, |  | ||||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { |  | ||||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); |  | ||||||
|         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; |  | ||||||
|  |  | ||||||
|         for (field_id, affected_facet_values) in self.affected_facet_values { |  | ||||||
|             // This is an incorrect condition, since we assume that the length of the database is equal |  | ||||||
|             // to the number of facet values for the given field_id. It means that in some cases, we might |  | ||||||
|             // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could |  | ||||||
|             // really be a performance problem is when we fully delete a large ratio of all facet values for |  | ||||||
|             // each field id. This would almost never happen. Still, to be overly cautious, I have added a |  | ||||||
|             // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance |  | ||||||
|             // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. |  | ||||||
|             if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) { |  | ||||||
|                 // Bulk delete |  | ||||||
|                 let mut modified = false; |  | ||||||
|  |  | ||||||
|                 for facet_value in affected_facet_values { |  | ||||||
|                     let key = |  | ||||||
|                         FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; |  | ||||||
|                     let mut old = self.database.get(wtxn, &key)?.unwrap(); |  | ||||||
|                     let previous_len = old.bitmap.len(); |  | ||||||
|                     old.bitmap -= self.docids_to_delete; |  | ||||||
|                     if old.bitmap.is_empty() { |  | ||||||
|                         modified = true; |  | ||||||
|                         self.database.delete(wtxn, &key)?; |  | ||||||
|                     } else if old.bitmap.len() != previous_len { |  | ||||||
|                         modified = true; |  | ||||||
|                         self.database.put(wtxn, &key, &old)?; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 if modified { |  | ||||||
|                     let builder = FacetsUpdateBulk::new_not_updating_level_0( |  | ||||||
|                         self.index, |  | ||||||
|                         vec![field_id], |  | ||||||
|                         self.facet_type, |  | ||||||
|                     ); |  | ||||||
|                     builder.execute(wtxn)?; |  | ||||||
|                 } |  | ||||||
|             } else { |  | ||||||
|                 // Incremental |  | ||||||
|                 let inc = FacetsUpdateIncrementalInner { |  | ||||||
|                     db: self.database, |  | ||||||
|                     group_size: self.group_size, |  | ||||||
|                     min_level_size: self.min_level_size, |  | ||||||
|                     max_group_size: self.max_group_size, |  | ||||||
|                 }; |  | ||||||
|                 for facet_value in affected_facet_values { |  | ||||||
|                     inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use std::iter::FromIterator; |  | ||||||
|  |  | ||||||
|     use big_s::S; |  | ||||||
|     use maplit::hashset; |  | ||||||
|     use rand::seq::SliceRandom; |  | ||||||
|     use rand::SeedableRng; |  | ||||||
|     use roaring::RoaringBitmap; |  | ||||||
|  |  | ||||||
|     use crate::db_snap; |  | ||||||
|     use crate::documents::documents_batch_reader_from_objects; |  | ||||||
|     use crate::index::tests::TempIndex; |  | ||||||
|     use crate::update::facet::test_helpers::ordered_string; |  | ||||||
|     use crate::update::{DeleteDocuments, DeletionStrategy}; |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn delete_mixed_incremental_and_bulk() { |  | ||||||
|         // The point of this test is to create an index populated with documents |  | ||||||
|         // containing different filterable attributes. Then, we delete a bunch of documents |  | ||||||
|         // such that a mix of the incremental and bulk indexer is used (depending on the field id) |  | ||||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_filterable_fields( |  | ||||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, |  | ||||||
|                 ); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 0..1000 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "label": i / 10, |  | ||||||
|                         "colour": i / 100, |  | ||||||
|                         "timestamp": i / 2, |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); |  | ||||||
|         db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); |  | ||||||
|  |  | ||||||
|         let mut wtxn = index.env.write_txn().unwrap(); |  | ||||||
|  |  | ||||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|         builder.strategy(DeletionStrategy::AlwaysHard); |  | ||||||
|         builder.delete_documents(&RoaringBitmap::from_iter(0..100)); |  | ||||||
|         // by deleting the first 100 documents, we expect that: |  | ||||||
|         // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) |  | ||||||
|         // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 |  | ||||||
|         // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 |  | ||||||
|         // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 |  | ||||||
|         // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test |  | ||||||
|         builder.execute().unwrap(); |  | ||||||
|         wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); |  | ||||||
|         db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); |  | ||||||
|         db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Same test as above but working with string values for the facets |  | ||||||
|     #[test] |  | ||||||
|     fn delete_mixed_incremental_and_bulk_string() { |  | ||||||
|         // The point of this test is to create an index populated with documents |  | ||||||
|         // containing different filterable attributes. Then, we delete a bunch of documents |  | ||||||
|         // such that a mix of the incremental and bulk indexer is used (depending on the field id) |  | ||||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_filterable_fields( |  | ||||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, |  | ||||||
|                 ); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 0..1000 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "label": ordered_string(i / 10), |  | ||||||
|                         "colour": ordered_string(i / 100), |  | ||||||
|                         "timestamp": ordered_string(i / 2), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) |  | ||||||
|         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); |  | ||||||
|         db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); |  | ||||||
|  |  | ||||||
|         let mut wtxn = index.env.write_txn().unwrap(); |  | ||||||
|  |  | ||||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|         builder.strategy(DeletionStrategy::AlwaysHard); |  | ||||||
|         builder.delete_documents(&RoaringBitmap::from_iter(0..100)); |  | ||||||
|         // by deleting the first 100 documents, we expect that: |  | ||||||
|         // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) |  | ||||||
|         // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 |  | ||||||
|         // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 |  | ||||||
|         // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 |  | ||||||
|         // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test |  | ||||||
|         builder.execute().unwrap(); |  | ||||||
|         wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); |  | ||||||
|         db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); |  | ||||||
|         db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn delete_almost_all_incrementally_string() { |  | ||||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_filterable_fields( |  | ||||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, |  | ||||||
|                 ); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 0..1000 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "label": ordered_string(i / 10), |  | ||||||
|                         "colour": ordered_string(i / 100), |  | ||||||
|                         "timestamp": ordered_string(i / 2), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) |  | ||||||
|         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); |  | ||||||
|         db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); |  | ||||||
|  |  | ||||||
|         let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); |  | ||||||
|  |  | ||||||
|         let mut docids_to_delete = (0..1000).collect::<Vec<_>>(); |  | ||||||
|         docids_to_delete.shuffle(&mut rng); |  | ||||||
|         for docid in docids_to_delete.into_iter().take(990) { |  | ||||||
|             let mut wtxn = index.env.write_txn().unwrap(); |  | ||||||
|             let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |  | ||||||
|             builder.strategy(DeletionStrategy::AlwaysHard); |  | ||||||
|             builder.delete_documents(&RoaringBitmap::from_iter([docid])); |  | ||||||
|             builder.execute().unwrap(); |  | ||||||
|             wtxn.commit().unwrap(); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); |  | ||||||
|         db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); |  | ||||||
|         db_snap!(index, string_faceted_documents_ids, 2, @r###" |  | ||||||
|         0   [] |  | ||||||
|         1   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] |  | ||||||
|         2   [292, 324, 358, 381, 493, 839, 852, ] |  | ||||||
|         3   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] |  | ||||||
|         "###); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[allow(unused)] |  | ||||||
| #[cfg(test)] |  | ||||||
| mod comparison_bench { |  | ||||||
|     use std::iter::once; |  | ||||||
|  |  | ||||||
|     use rand::Rng; |  | ||||||
|     use roaring::RoaringBitmap; |  | ||||||
|  |  | ||||||
|     use crate::heed_codec::facet::OrderedF64Codec; |  | ||||||
|     use crate::update::facet::test_helpers::FacetIndex; |  | ||||||
|  |  | ||||||
|     // This is a simple test to get an intuition on the relative speed |  | ||||||
|     // of the incremental vs. bulk indexer. |  | ||||||
|     // |  | ||||||
|     // The benchmark shows the worst-case scenario for the incremental indexer, since |  | ||||||
|     // each facet value contains only one document ID. |  | ||||||
|     // |  | ||||||
|     // In that scenario, it appears that the incremental indexer is about 70 times slower than the |  | ||||||
|     // bulk indexer. |  | ||||||
|     // #[test] |  | ||||||
|     fn benchmark_facet_indexing_delete() { |  | ||||||
|         let mut r = rand::thread_rng(); |  | ||||||
|  |  | ||||||
|         for i in 1..=20 { |  | ||||||
|             let size = 50_000 * i; |  | ||||||
|             let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); |  | ||||||
|  |  | ||||||
|             let mut txn = index.env.write_txn().unwrap(); |  | ||||||
|             let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); |  | ||||||
|             for i in 0..size { |  | ||||||
|                 // field id = 0, left_bound = i, docids = [i] |  | ||||||
|                 elements.push(((0, i as f64), once(i).collect())); |  | ||||||
|             } |  | ||||||
|             let timer = std::time::Instant::now(); |  | ||||||
|             index.bulk_insert(&mut txn, &[0], elements.iter()); |  | ||||||
|             let time_spent = timer.elapsed().as_millis(); |  | ||||||
|             println!("bulk {size} : {time_spent}ms"); |  | ||||||
|  |  | ||||||
|             txn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|             for nbr_doc in [1, 100, 1000, 10_000] { |  | ||||||
|                 let mut txn = index.env.write_txn().unwrap(); |  | ||||||
|                 let timer = std::time::Instant::now(); |  | ||||||
|                 // |  | ||||||
|                 // delete one document |  | ||||||
|                 // |  | ||||||
|                 for _ in 0..nbr_doc { |  | ||||||
|                     let deleted_u32 = r.gen::<u32>() % size; |  | ||||||
|                     let deleted_f64 = deleted_u32 as f64; |  | ||||||
|                     index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) |  | ||||||
|                 } |  | ||||||
|                 let time_spent = timer.elapsed().as_millis(); |  | ||||||
|                 println!("    delete {nbr_doc} : {time_spent}ms"); |  | ||||||
|                 txn.abort().unwrap(); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,9 +1,9 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
|  |  | ||||||
| use heed::types::{ByteSlice, DecodeIgnore}; | use heed::types::{ByteSlice, DecodeIgnore}; | ||||||
| use heed::{BytesDecode, Error, RoTxn, RwTxn}; | use heed::{BytesDecode, Error, RoTxn, RwTxn}; | ||||||
|  | use obkv::KvReader; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::facet::FacetType; | use crate::facet::FacetType; | ||||||
| @@ -12,8 +12,9 @@ use crate::heed_codec::facet::{ | |||||||
| }; | }; | ||||||
| use crate::heed_codec::ByteSliceRefCodec; | use crate::heed_codec::ByteSliceRefCodec; | ||||||
| use crate::search::facet::get_highest_level; | use crate::search::facet::get_highest_level; | ||||||
|  | use crate::update::del_add::DelAdd; | ||||||
| use crate::update::index_documents::valid_lmdb_key; | use crate::update::index_documents::valid_lmdb_key; | ||||||
| use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; | use crate::{CboRoaringBitmapCodec, Index, Result}; | ||||||
|  |  | ||||||
| enum InsertionResult { | enum InsertionResult { | ||||||
|     InPlace, |     InPlace, | ||||||
| @@ -28,27 +29,21 @@ enum DeletionResult { | |||||||
|  |  | ||||||
| /// Algorithm to incrementally insert and delete elements into the | /// Algorithm to incrementally insert and delete elements into the | ||||||
| /// `facet_id_(string/f64)_docids` databases. | /// `facet_id_(string/f64)_docids` databases. | ||||||
| /// | pub struct FacetsUpdateIncremental { | ||||||
| /// Rhe `faceted_documents_ids` value in the main database of `Index` |  | ||||||
| /// is also updated to contain the new set of faceted documents. |  | ||||||
| pub struct FacetsUpdateIncremental<'i> { |  | ||||||
|     index: &'i Index, |  | ||||||
|     inner: FacetsUpdateIncrementalInner, |     inner: FacetsUpdateIncrementalInner, | ||||||
|     facet_type: FacetType, |     delta_data: grenad::Reader<BufReader<File>>, | ||||||
|     new_data: grenad::Reader<BufReader<File>>, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'i> FacetsUpdateIncremental<'i> { | impl FacetsUpdateIncremental { | ||||||
|     pub fn new( |     pub fn new( | ||||||
|         index: &'i Index, |         index: &Index, | ||||||
|         facet_type: FacetType, |         facet_type: FacetType, | ||||||
|         new_data: grenad::Reader<BufReader<File>>, |         delta_data: grenad::Reader<BufReader<File>>, | ||||||
|         group_size: u8, |         group_size: u8, | ||||||
|         min_level_size: u8, |         min_level_size: u8, | ||||||
|         max_group_size: u8, |         max_group_size: u8, | ||||||
|     ) -> Self { |     ) -> Self { | ||||||
|         FacetsUpdateIncremental { |         FacetsUpdateIncremental { | ||||||
|             index, |  | ||||||
|             inner: FacetsUpdateIncrementalInner { |             inner: FacetsUpdateIncrementalInner { | ||||||
|                 db: match facet_type { |                 db: match facet_type { | ||||||
|                     FacetType::String => index |                     FacetType::String => index | ||||||
| @@ -62,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> { | |||||||
|                 max_group_size, |                 max_group_size, | ||||||
|                 min_level_size, |                 min_level_size, | ||||||
|             }, |             }, | ||||||
|             facet_type, |             delta_data, | ||||||
|             new_data, |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { |     pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { | ||||||
|         let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); |         let mut cursor = self.delta_data.into_cursor()?; | ||||||
|  |  | ||||||
|         let mut cursor = self.new_data.into_cursor()?; |  | ||||||
|         while let Some((key, value)) = cursor.move_on_next()? { |         while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|             if !valid_lmdb_key(key) { |             if !valid_lmdb_key(key) { | ||||||
|                 continue; |                 continue; | ||||||
|             } |             } | ||||||
|             let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key) |             let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key) | ||||||
|                 .ok_or(heed::Error::Encoding)?; |                 .ok_or(heed::Error::Encoding)?; | ||||||
|             let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; |             let value = KvReader::new(value); | ||||||
|             self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; |  | ||||||
|             *new_faceted_docids.entry(key.field_id).or_default() |= docids; |             let docids_to_delete = value | ||||||
|  |                 .get(DelAdd::Deletion) | ||||||
|  |                 .map(CboRoaringBitmapCodec::bytes_decode) | ||||||
|  |                 .map(|o| o.ok_or(heed::Error::Encoding)); | ||||||
|  |  | ||||||
|  |             let docids_to_add = value | ||||||
|  |                 .get(DelAdd::Addition) | ||||||
|  |                 .map(CboRoaringBitmapCodec::bytes_decode) | ||||||
|  |                 .map(|o| o.ok_or(heed::Error::Encoding)); | ||||||
|  |  | ||||||
|  |             if let Some(docids_to_delete) = docids_to_delete { | ||||||
|  |                 let docids_to_delete = docids_to_delete?; | ||||||
|  |                 self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             if let Some(docids_to_add) = docids_to_add { | ||||||
|  |                 let docids_to_add = docids_to_add?; | ||||||
|  |                 self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for (field_id, new_docids) in new_faceted_docids { |  | ||||||
|             let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; |  | ||||||
|             docids |= new_docids; |  | ||||||
|             self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; |  | ||||||
|         } |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -14,7 +14,7 @@ The databases must be able to return results for queries such as: | |||||||
| The algorithms that implement these queries are found in the `src/search/facet` folder. | The algorithms that implement these queries are found in the `src/search/facet` folder. | ||||||
|  |  | ||||||
| To make these queries fast to compute, the database adopts a tree structure: | To make these queries fast to compute, the database adopts a tree structure: | ||||||
| ```ignore | ```text | ||||||
|             ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ |             ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ | ||||||
| ┌───────┐   │           "ab" (2)            │           "gaf" (2)           │   "woz" (1)   │ | ┌───────┐   │           "ab" (2)            │           "gaf" (2)           │   "woz" (1)   │ | ||||||
| │Level 2│   │                               │                               │               │ | │Level 2│   │                               │                               │               │ | ||||||
| @@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`. | |||||||
| In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a | In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a | ||||||
| [`FacetGroupValue`], which have the following format: | [`FacetGroupValue`], which have the following format: | ||||||
|  |  | ||||||
| ```ignore | ```text | ||||||
| FacetGroupKey: | FacetGroupKey: | ||||||
| - field id  : u16 | - field id  : u16 | ||||||
| - level     : u8 | - level     : u8 | ||||||
| @@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string; | |||||||
| use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; | use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; | ||||||
|  |  | ||||||
| pub mod bulk; | pub mod bulk; | ||||||
| pub mod delete; |  | ||||||
| pub mod incremental; | pub mod incremental; | ||||||
|  |  | ||||||
| /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. | /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. | ||||||
| @@ -109,7 +108,7 @@ pub struct FacetsUpdate<'i> { | |||||||
|     index: &'i Index, |     index: &'i Index, | ||||||
|     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, |     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||||
|     facet_type: FacetType, |     facet_type: FacetType, | ||||||
|     new_data: grenad::Reader<BufReader<File>>, |     delta_data: grenad::Reader<BufReader<File>>, | ||||||
|     group_size: u8, |     group_size: u8, | ||||||
|     max_group_size: u8, |     max_group_size: u8, | ||||||
|     min_level_size: u8, |     min_level_size: u8, | ||||||
| @@ -118,7 +117,7 @@ impl<'i> FacetsUpdate<'i> { | |||||||
|     pub fn new( |     pub fn new( | ||||||
|         index: &'i Index, |         index: &'i Index, | ||||||
|         facet_type: FacetType, |         facet_type: FacetType, | ||||||
|         new_data: grenad::Reader<BufReader<File>>, |         delta_data: grenad::Reader<BufReader<File>>, | ||||||
|     ) -> Self { |     ) -> Self { | ||||||
|         let database = match facet_type { |         let database = match facet_type { | ||||||
|             FacetType::String => index |             FacetType::String => index | ||||||
| @@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> { | |||||||
|             max_group_size: FACET_MAX_GROUP_SIZE, |             max_group_size: FACET_MAX_GROUP_SIZE, | ||||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, |             min_level_size: FACET_MIN_LEVEL_SIZE, | ||||||
|             facet_type, |             facet_type, | ||||||
|             new_data, |             delta_data, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { |     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { | ||||||
|         if self.new_data.is_empty() { |         if self.delta_data.is_empty() { | ||||||
|             return Ok(()); |             return Ok(()); | ||||||
|         } |         } | ||||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); |         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||||
|         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; |         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; | ||||||
|  |  | ||||||
|         // See self::comparison_bench::benchmark_facet_indexing |         // See self::comparison_bench::benchmark_facet_indexing | ||||||
|         if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { |         if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) { | ||||||
|             let field_ids = |             let field_ids = | ||||||
|                 self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>(); |                 self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>(); | ||||||
|             let bulk_update = FacetsUpdateBulk::new( |             let bulk_update = FacetsUpdateBulk::new( | ||||||
|                 self.index, |                 self.index, | ||||||
|                 field_ids, |                 field_ids, | ||||||
|                 self.facet_type, |                 self.facet_type, | ||||||
|                 self.new_data, |                 self.delta_data, | ||||||
|                 self.group_size, |                 self.group_size, | ||||||
|                 self.min_level_size, |                 self.min_level_size, | ||||||
|             ); |             ); | ||||||
| @@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { | |||||||
|             let incremental_update = FacetsUpdateIncremental::new( |             let incremental_update = FacetsUpdateIncremental::new( | ||||||
|                 self.index, |                 self.index, | ||||||
|                 self.facet_type, |                 self.facet_type, | ||||||
|                 self.new_data, |                 self.delta_data, | ||||||
|                 self.group_size, |                 self.group_size, | ||||||
|                 self.min_level_size, |                 self.min_level_size, | ||||||
|                 self.max_group_size, |                 self.max_group_size, | ||||||
| @@ -279,6 +278,7 @@ pub(crate) mod test_helpers { | |||||||
|     use crate::heed_codec::ByteSliceRefCodec; |     use crate::heed_codec::ByteSliceRefCodec; | ||||||
|     use crate::search::facet::get_highest_level; |     use crate::search::facet::get_highest_level; | ||||||
|     use crate::snapshot_tests::display_bitmap; |     use crate::snapshot_tests::display_bitmap; | ||||||
|  |     use crate::update::del_add::{DelAdd, KvWriterDelAdd}; | ||||||
|     use crate::update::FacetsUpdateIncrementalInner; |     use crate::update::FacetsUpdateIncrementalInner; | ||||||
|     use crate::CboRoaringBitmapCodec; |     use crate::CboRoaringBitmapCodec; | ||||||
|  |  | ||||||
| @@ -455,20 +455,22 @@ pub(crate) mod test_helpers { | |||||||
|                 let key: FacetGroupKey<&[u8]> = |                 let key: FacetGroupKey<&[u8]> = | ||||||
|                     FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; |                     FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; | ||||||
|                 let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap(); |                 let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap(); | ||||||
|  |                 let mut inner_writer = KvWriterDelAdd::memory(); | ||||||
|                 let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); |                 let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); | ||||||
|                 writer.insert(&key, &value).unwrap(); |                 inner_writer.insert(DelAdd::Addition, value).unwrap(); | ||||||
|  |                 writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap(); | ||||||
|             } |             } | ||||||
|             writer.finish().unwrap(); |             writer.finish().unwrap(); | ||||||
|             let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); |             let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); | ||||||
|  |  | ||||||
|             let update = FacetsUpdateBulkInner { |             let update = FacetsUpdateBulkInner { | ||||||
|                 db: self.content, |                 db: self.content, | ||||||
|                 new_data: Some(reader), |                 delta_data: Some(reader), | ||||||
|                 group_size: self.group_size.get(), |                 group_size: self.group_size.get(), | ||||||
|                 min_level_size: self.min_level_size.get(), |                 min_level_size: self.min_level_size.get(), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); |             update.update(wtxn, field_ids).unwrap(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { |         pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { | ||||||
| @@ -556,101 +558,6 @@ pub(crate) mod test_helpers { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use big_s::S; |  | ||||||
|     use maplit::hashset; |  | ||||||
|  |  | ||||||
|     use crate::db_snap; |  | ||||||
|     use crate::documents::documents_batch_reader_from_objects; |  | ||||||
|     use crate::index::tests::TempIndex; |  | ||||||
|     use crate::update::DeletionStrategy; |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn replace_all_identical_soft_deletion_then_hard_deletion() { |  | ||||||
|         let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); |  | ||||||
|  |  | ||||||
|         index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; |  | ||||||
|  |  | ||||||
|         index |  | ||||||
|             .update_settings(|settings| { |  | ||||||
|                 settings.set_primary_key("id".to_owned()); |  | ||||||
|                 settings.set_filterable_fields(hashset! { S("size") }); |  | ||||||
|             }) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 0..1000 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "size": i % 250, |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); |  | ||||||
|         db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); |  | ||||||
|  |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 0..999 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "size": i % 250, |  | ||||||
|                         "other": 0, |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); |  | ||||||
|         db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); |  | ||||||
|  |  | ||||||
|         // Then replace the last document while disabling soft_deletion |  | ||||||
|         index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; |  | ||||||
|         let mut documents = vec![]; |  | ||||||
|         for i in 999..1000 { |  | ||||||
|             documents.push( |  | ||||||
|                 serde_json::json! { |  | ||||||
|                     { |  | ||||||
|                         "id": i, |  | ||||||
|                         "size": i % 250, |  | ||||||
|                         "other": 0, |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 .as_object() |  | ||||||
|                 .unwrap() |  | ||||||
|                 .clone(), |  | ||||||
|             ); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let documents = documents_batch_reader_from_objects(documents); |  | ||||||
|         index.add_documents(documents).unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); |  | ||||||
|         db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[allow(unused)] | #[allow(unused)] | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod comparison_bench { | mod comparison_bench { | ||||||
|   | |||||||
| @@ -1,20 +1,17 @@ | |||||||
|  | use std::fmt; | ||||||
| use std::io::{BufWriter, Read, Seek}; | use std::io::{BufWriter, Read, Seek}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
| use std::{fmt, iter}; |  | ||||||
|  |  | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; | use crate::documents::{ | ||||||
|  |     DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, | ||||||
|  |     EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, | ||||||
|  | }; | ||||||
| use crate::error::{GeoError, InternalError, UserError}; | use crate::error::{GeoError, InternalError, UserError}; | ||||||
| use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | ||||||
| use crate::{FieldId, Index, Object, Result}; | use crate::{FieldId, Index, Result}; | ||||||
|  |  | ||||||
| /// The symbol used to define levels in a nested primary key. |  | ||||||
| const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; |  | ||||||
|  |  | ||||||
| /// The default primary that is used when not specified. |  | ||||||
| const DEFAULT_PRIMARY_KEY: &str = "id"; |  | ||||||
|  |  | ||||||
| /// This function validates and enrich the documents by checking that: | /// This function validates and enrich the documents by checking that: | ||||||
| ///  - we can infer a primary key, | ///  - we can infer a primary key, | ||||||
| @@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>( | |||||||
|     // The primary key *field id* that has already been set for this index or the one |     // The primary key *field id* that has already been set for this index or the one | ||||||
|     // we will guess by searching for the first key that contains "id" as a substring. |     // we will guess by searching for the first key that contains "id" as a substring. | ||||||
|     let primary_key = match index.primary_key(rtxn)? { |     let primary_key = match index.primary_key(rtxn)? { | ||||||
|         Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { |         Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { | ||||||
|             PrimaryKey::nested(primary_key) |             Some(primary_key) => primary_key, | ||||||
|         } |             None if autogenerate_docids => PrimaryKey::Flat { | ||||||
|         Some(primary_key) => match documents_batch_index.id(primary_key) { |                 name: primary_key, | ||||||
|             Some(id) => PrimaryKey::flat(primary_key, id), |                 field_id: documents_batch_index.insert(primary_key), | ||||||
|             None if autogenerate_docids => { |             }, | ||||||
|                 PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) |  | ||||||
|             } |  | ||||||
|             None => { |             None => { | ||||||
|                 return match cursor.next_document()? { |                 return match cursor.next_document()? { | ||||||
|                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { |                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { | ||||||
| @@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>( | |||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             match guesses.as_slice() { |             match guesses.as_slice() { | ||||||
|                 [] if autogenerate_docids => PrimaryKey::flat( |                 [] if autogenerate_docids => PrimaryKey::Flat { | ||||||
|                     DEFAULT_PRIMARY_KEY, |                     name: DEFAULT_PRIMARY_KEY, | ||||||
|                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY), |                     field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), | ||||||
|                 ), |                 }, | ||||||
|                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), |                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), | ||||||
|                 [(field_id, name)] => { |                 [(field_id, name)] => { | ||||||
|                     log::info!("Primary key was not specified in index. Inferred to '{name}'"); |                     log::info!("Primary key was not specified in index. Inferred to '{name}'"); | ||||||
|                     PrimaryKey::flat(name, *field_id) |                     PrimaryKey::Flat { name, field_id: *field_id } | ||||||
|                 } |                 } | ||||||
|                 multiple => { |                 multiple => { | ||||||
|                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { |                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { | ||||||
| @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( | |||||||
|     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], |     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], | ||||||
|     count: u32, |     count: u32, | ||||||
| ) -> Result<StdResult<DocumentId, UserError>> { | ) -> Result<StdResult<DocumentId, UserError>> { | ||||||
|     match primary_key { |     Ok(match primary_key.document_id(document, documents_batch_index)? { | ||||||
|         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { |         Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), | ||||||
|             match document.get(primary_key_id) { |         Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), | ||||||
|                 Some(document_id_bytes) => { |         Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { | ||||||
|                     let document_id = serde_json::from_slice(document_id_bytes) |             let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); | ||||||
|                         .map_err(InternalError::SerdeJson)?; |             Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) | ||||||
|                     match validate_document_id_value(document_id)? { |  | ||||||
|                         Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), |  | ||||||
|                         Err(user_error) => Ok(Err(user_error)), |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 None if autogenerate_docids => { |  | ||||||
|                     let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); |  | ||||||
|                     Ok(Ok(DocumentId::generated(uuid.to_string(), count))) |  | ||||||
|                 } |  | ||||||
|                 None => Ok(Err(UserError::MissingDocumentId { |  | ||||||
|                     primary_key: primary_key.to_string(), |  | ||||||
|                     document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                 })), |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|         nested @ PrimaryKey::Nested { .. } => { |         Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { | ||||||
|             let mut matching_documents_ids = Vec::new(); |             primary_key: primary_key.name().to_string(), | ||||||
|             for (first_level_name, right) in nested.possible_level_names() { |             document: obkv_to_object(document, documents_batch_index)?, | ||||||
|                 if let Some(field_id) = documents_batch_index.id(first_level_name) { |         }), | ||||||
|                     if let Some(value_bytes) = document.get(field_id) { |         Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { | ||||||
|                         let object = serde_json::from_slice(value_bytes) |             Err(UserError::TooManyDocumentIds { | ||||||
|                             .map_err(InternalError::SerdeJson)?; |                 primary_key: primary_key.name().to_string(), | ||||||
|                         fetch_matching_values(object, right, &mut matching_documents_ids); |                 document: obkv_to_object(document, documents_batch_index)?, | ||||||
|  |             }) | ||||||
|                         if matching_documents_ids.len() >= 2 { |  | ||||||
|                             return Ok(Err(UserError::TooManyDocumentIds { |  | ||||||
|                                 primary_key: nested.name().to_string(), |  | ||||||
|                                 document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                             })); |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             match matching_documents_ids.pop() { |  | ||||||
|                 Some(document_id) => match validate_document_id_value(document_id)? { |  | ||||||
|                     Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), |  | ||||||
|                     Err(user_error) => Ok(Err(user_error)), |  | ||||||
|                 }, |  | ||||||
|                 None => Ok(Err(UserError::MissingDocumentId { |  | ||||||
|                     primary_key: nested.name().to_string(), |  | ||||||
|                     document: obkv_to_object(document, documents_batch_index)?, |  | ||||||
|                 })), |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     }) | ||||||
| } |  | ||||||
|  |  | ||||||
| /// A type that represent the type of primary key that has been set |  | ||||||
| /// for this index, a classic flat one or a nested one. |  | ||||||
| #[derive(Debug, Clone, Copy)] |  | ||||||
| enum PrimaryKey<'a> { |  | ||||||
|     Flat { name: &'a str, field_id: FieldId }, |  | ||||||
|     Nested { name: &'a str }, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl PrimaryKey<'_> { |  | ||||||
|     fn flat(name: &str, field_id: FieldId) -> PrimaryKey { |  | ||||||
|         PrimaryKey::Flat { name, field_id } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn nested(name: &str) -> PrimaryKey { |  | ||||||
|         PrimaryKey::Nested { name } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn name(&self) -> &str { |  | ||||||
|         match self { |  | ||||||
|             PrimaryKey::Flat { name, .. } => name, |  | ||||||
|             PrimaryKey::Nested { name } => name, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Returns an `Iterator` that gives all the possible fields names the primary key |  | ||||||
|     /// can have depending of the first level name and deepnes of the objects. |  | ||||||
|     fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { |  | ||||||
|         let name = self.name(); |  | ||||||
|         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) |  | ||||||
|             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) |  | ||||||
|             .chain(iter::once((name, ""))) |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// A type that represents a document id that has been retrieved from a document or auto-generated. | /// A type that represents a document id that has been retrieved from a document or auto-generated. | ||||||
| @@ -255,14 +182,6 @@ pub enum DocumentId { | |||||||
| } | } | ||||||
|  |  | ||||||
| impl DocumentId { | impl DocumentId { | ||||||
|     fn retrieved(value: String) -> DocumentId { |  | ||||||
|         DocumentId::Retrieved { value } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn generated(value: String, document_nth: u32) -> DocumentId { |  | ||||||
|         DocumentId::Generated { value, document_nth } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn debug(&self) -> String { |     fn debug(&self) -> String { | ||||||
|         format!("{:?}", self) |         format!("{:?}", self) | ||||||
|     } |     } | ||||||
| @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn starts_with(selector: &str, key: &str) -> bool { |  | ||||||
|     selector.strip_prefix(key).map_or(false, |tail| { |  | ||||||
|         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) |  | ||||||
|     }) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { |  | ||||||
|     match value { |  | ||||||
|         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), |  | ||||||
|         otherwise => output.push(otherwise), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn fetch_matching_values_in_object( |  | ||||||
|     object: Object, |  | ||||||
|     selector: &str, |  | ||||||
|     base_key: &str, |  | ||||||
|     output: &mut Vec<Value>, |  | ||||||
| ) { |  | ||||||
|     for (key, value) in object { |  | ||||||
|         let base_key = if base_key.is_empty() { |  | ||||||
|             key.to_string() |  | ||||||
|         } else { |  | ||||||
|             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         if starts_with(selector, &base_key) { |  | ||||||
|             match value { |  | ||||||
|                 Value::Object(object) => { |  | ||||||
|                     fetch_matching_values_in_object(object, selector, &base_key, output) |  | ||||||
|                 } |  | ||||||
|                 value => output.push(value), |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn validate_document_id(document_id: &str) -> Option<&str> { |  | ||||||
|     if !document_id.is_empty() |  | ||||||
|         && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) |  | ||||||
|     { |  | ||||||
|         Some(document_id) |  | ||||||
|     } else { |  | ||||||
|         None |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Parses a Json encoded document id and validate it, returning a user error when it is one. |  | ||||||
| pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { |  | ||||||
|     match document_id { |  | ||||||
|         Value::String(string) => match validate_document_id(&string) { |  | ||||||
|             Some(s) if s.len() == string.len() => Ok(Ok(string)), |  | ||||||
|             Some(s) => Ok(Ok(s.to_string())), |  | ||||||
|             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), |  | ||||||
|         }, |  | ||||||
|         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), |  | ||||||
|         content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Try to extract an `f64` from a JSON `Value` and return the `Value` | /// Try to extract an `f64` from a JSON `Value` and return the `Value` | ||||||
| /// in the `Err` variant if it failed. | /// in the `Err` variant if it failed. | ||||||
| pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | ||||||
|   | |||||||
| @@ -5,18 +5,16 @@ use std::io::BufReader; | |||||||
| use std::{io, mem, str}; | use std::{io, mem, str}; | ||||||
|  |  | ||||||
| use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | ||||||
| use obkv::KvReader; | use obkv::{KvReader, KvWriterU16}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; | use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; | ||||||
| use crate::error::{InternalError, SerializationError}; | use crate::error::{InternalError, SerializationError}; | ||||||
| use crate::update::index_documents::MergeFn; | use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; | ||||||
| use crate::{ | use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | ||||||
|     absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; | pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; | ||||||
|  |  | ||||||
| /// Extracts the word and positions where this word appears and | /// Extracts the word and positions where this word appears and | ||||||
| /// prefixes it by the document id. | /// prefixes it by the document id. | ||||||
| @@ -32,25 +30,162 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     allowed_separators: Option<&[&str]>, |     allowed_separators: Option<&[&str]>, | ||||||
|     dictionary: Option<&[&str]>, |     dictionary: Option<&[&str]>, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
|  |  | ||||||
|     let max_positions_per_attributes = max_positions_per_attributes |     let max_positions_per_attributes = max_positions_per_attributes | ||||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); |         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |  | ||||||
|  |     // initialize destination values. | ||||||
|     let mut documents_ids = RoaringBitmap::new(); |     let mut documents_ids = RoaringBitmap::new(); | ||||||
|     let mut script_language_docids = HashMap::new(); |     let mut script_language_docids = HashMap::new(); | ||||||
|     let mut docid_word_positions_sorter = create_sorter( |     let mut docid_word_positions_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
|         concat_u32s_array, |         keep_latest_obkv, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     let mut buffers = Buffers::default(); |     // initialize buffers. | ||||||
|  |     let mut del_buffers = Buffers::default(); | ||||||
|  |     let mut add_buffers = Buffers::default(); | ||||||
|  |     let mut key_buffer = Vec::new(); | ||||||
|  |     let mut value_buffer = Vec::new(); | ||||||
|  |  | ||||||
|  |     // initialize tokenizer. | ||||||
|  |     let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); | ||||||
|  |     let tokenizer = builder.build(); | ||||||
|  |  | ||||||
|  |     // iterate over documents. | ||||||
|  |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|  |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |         let document_id = key | ||||||
|  |             .try_into() | ||||||
|  |             .map(u32::from_be_bytes) | ||||||
|  |             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||||
|  |         let obkv = KvReader::<FieldId>::new(value); | ||||||
|  |  | ||||||
|  |         // if the searchable fields didn't change, skip the searchable indexing for this document. | ||||||
|  |         if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) { | ||||||
|  |             continue; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         documents_ids.push(document_id); | ||||||
|  |  | ||||||
|  |         // Update key buffer prefix. | ||||||
|  |         key_buffer.clear(); | ||||||
|  |         key_buffer.extend_from_slice(&document_id.to_be_bytes()); | ||||||
|  |  | ||||||
|  |         // Tokenize deletions and additions in 2 different threads. | ||||||
|  |         let (del, add): (Result<_>, Result<_>) = rayon::join( | ||||||
|  |             || { | ||||||
|  |                 // deletions | ||||||
|  |                 lang_safe_tokens_from_document( | ||||||
|  |                     &obkv, | ||||||
|  |                     searchable_fields, | ||||||
|  |                     &tokenizer, | ||||||
|  |                     stop_words, | ||||||
|  |                     allowed_separators, | ||||||
|  |                     dictionary, | ||||||
|  |                     max_positions_per_attributes, | ||||||
|  |                     DelAdd::Deletion, | ||||||
|  |                     &mut del_buffers, | ||||||
|  |                 ) | ||||||
|  |             }, | ||||||
|  |             || { | ||||||
|  |                 // additions | ||||||
|  |                 lang_safe_tokens_from_document( | ||||||
|  |                     &obkv, | ||||||
|  |                     searchable_fields, | ||||||
|  |                     &tokenizer, | ||||||
|  |                     stop_words, | ||||||
|  |                     allowed_separators, | ||||||
|  |                     dictionary, | ||||||
|  |                     max_positions_per_attributes, | ||||||
|  |                     DelAdd::Addition, | ||||||
|  |                     &mut add_buffers, | ||||||
|  |                 ) | ||||||
|  |             }, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         let (del_obkv, del_script_language_word_count) = del?; | ||||||
|  |         let (add_obkv, add_script_language_word_count) = add?; | ||||||
|  |  | ||||||
|  |         // merge deletions and additions. | ||||||
|  |         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> | ||||||
|  |         value_buffer.clear(); | ||||||
|  |         del_add_from_two_obkvs( | ||||||
|  |             KvReader::<FieldId>::new(del_obkv), | ||||||
|  |             KvReader::<FieldId>::new(add_obkv), | ||||||
|  |             &mut value_buffer, | ||||||
|  |         )?; | ||||||
|  |  | ||||||
|  |         // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field. | ||||||
|  |         let obkv = KvReader::<FieldId>::new(&value_buffer); | ||||||
|  |         for (field_id, value) in obkv.iter() { | ||||||
|  |             key_buffer.truncate(mem::size_of::<u32>()); | ||||||
|  |             key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||||
|  |             docid_word_positions_sorter.insert(&key_buffer, value)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // update script_language_docids deletions. | ||||||
|  |         for (script, languages_frequency) in del_script_language_word_count { | ||||||
|  |             for (language, _) in languages_frequency { | ||||||
|  |                 let entry = script_language_docids | ||||||
|  |                     .entry((script, language)) | ||||||
|  |                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); | ||||||
|  |                 entry.0.push(document_id); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // update script_language_docids additions. | ||||||
|  |         for (script, languages_frequency) in add_script_language_word_count { | ||||||
|  |             for (language, _) in languages_frequency { | ||||||
|  |                 let entry = script_language_docids | ||||||
|  |                     .entry((script, language)) | ||||||
|  |                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); | ||||||
|  |                 entry.1.push(document_id); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>. | ||||||
|  |     sorter_into_reader(docid_word_positions_sorter, indexer) | ||||||
|  |         .map(|reader| (reader, script_language_docids)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Check if any searchable fields of a document changed. | ||||||
|  | fn searchable_fields_changed( | ||||||
|  |     obkv: &KvReader<FieldId>, | ||||||
|  |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|  | ) -> bool { | ||||||
|  |     for (field_id, field_bytes) in obkv.iter() { | ||||||
|  |         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { | ||||||
|  |             let del_add = KvReaderDelAdd::new(field_bytes); | ||||||
|  |             match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { | ||||||
|  |                 // if both fields are None, check the next field. | ||||||
|  |                 (None, None) => (), | ||||||
|  |                 // if both contain a value and values are the same, check the next field. | ||||||
|  |                 (Some(del), Some(add)) if del == add => (), | ||||||
|  |                 // otherwise the fields are different, return true. | ||||||
|  |                 _otherwise => return true, | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     false | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Factorize tokenizer building. | ||||||
|  | fn tokenizer_builder<'a>( | ||||||
|  |     stop_words: Option<&'a fst::Set<&[u8]>>, | ||||||
|  |     allowed_separators: Option<&'a [&str]>, | ||||||
|  |     dictionary: Option<&'a [&str]>, | ||||||
|  |     script_language: Option<&'a HashMap<Script, Vec<Language>>>, | ||||||
|  | ) -> TokenizerBuilder<'a, &'a [u8]> { | ||||||
|     let mut tokenizer_builder = TokenizerBuilder::new(); |     let mut tokenizer_builder = TokenizerBuilder::new(); | ||||||
|     if let Some(stop_words) = stop_words { |     if let Some(stop_words) = stop_words { | ||||||
|         tokenizer_builder.stop_words(stop_words); |         tokenizer_builder.stop_words(stop_words); | ||||||
| @@ -61,130 +196,147 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     if let Some(separators) = allowed_separators { |     if let Some(separators) = allowed_separators { | ||||||
|         tokenizer_builder.separators(separators); |         tokenizer_builder.separators(separators); | ||||||
|     } |     } | ||||||
|     let tokenizer = tokenizer_builder.build(); |  | ||||||
|  |  | ||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     if let Some(script_language) = script_language { | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |         tokenizer_builder.allow_list(script_language); | ||||||
|         let document_id = key |     } | ||||||
|             .try_into() |  | ||||||
|             .map(u32::from_be_bytes) |  | ||||||
|             .map_err(|_| SerializationError::InvalidNumberSerialization)?; |  | ||||||
|         let obkv = KvReader::<FieldId>::new(value); |  | ||||||
|  |  | ||||||
|         documents_ids.push(document_id); |     tokenizer_builder | ||||||
|         buffers.key_buffer.clear(); | } | ||||||
|         buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); |  | ||||||
|  |  | ||||||
|         let mut script_language_word_count = HashMap::new(); | /// Extract words mapped with their positions of a document, | ||||||
|  | /// ensuring no Language detection mistakes were made. | ||||||
|  | #[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct | ||||||
|  | fn lang_safe_tokens_from_document<'a>( | ||||||
|  |     obkv: &KvReader<FieldId>, | ||||||
|  |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|  |     tokenizer: &Tokenizer, | ||||||
|  |     stop_words: Option<&fst::Set<&[u8]>>, | ||||||
|  |     allowed_separators: Option<&[&str]>, | ||||||
|  |     dictionary: Option<&[&str]>, | ||||||
|  |     max_positions_per_attributes: u32, | ||||||
|  |     del_add: DelAdd, | ||||||
|  |     buffers: &'a mut Buffers, | ||||||
|  | ) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> { | ||||||
|  |     let mut script_language_word_count = HashMap::new(); | ||||||
|  |  | ||||||
|         extract_tokens_from_document( |     tokens_from_document( | ||||||
|             &obkv, |         obkv, | ||||||
|             searchable_fields, |         searchable_fields, | ||||||
|             &tokenizer, |         tokenizer, | ||||||
|             max_positions_per_attributes, |         max_positions_per_attributes, | ||||||
|             &mut buffers, |         del_add, | ||||||
|             &mut script_language_word_count, |         buffers, | ||||||
|             &mut docid_word_positions_sorter, |         &mut script_language_word_count, | ||||||
|         )?; |     )?; | ||||||
|  |  | ||||||
|         // if we detect a potential mistake in the language detection, |     // if we detect a potential mistake in the language detection, | ||||||
|         // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. |     // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. | ||||||
|         // context: https://github.com/meilisearch/meilisearch/issues/3565 |     // context: https://github.com/meilisearch/meilisearch/issues/3565 | ||||||
|         if script_language_word_count |     if script_language_word_count | ||||||
|             .values() |         .values() | ||||||
|             .map(Vec::as_slice) |         .map(Vec::as_slice) | ||||||
|             .any(potential_language_detection_error) |         .any(potential_language_detection_error) | ||||||
|         { |     { | ||||||
|             // build an allow list with the most frequent detected languages in the document. |         // build an allow list with the most frequent detected languages in the document. | ||||||
|             let script_language: HashMap<_, _> = |         let script_language: HashMap<_, _> = | ||||||
|                 script_language_word_count.iter().filter_map(most_frequent_languages).collect(); |             script_language_word_count.iter().filter_map(most_frequent_languages).collect(); | ||||||
|  |  | ||||||
|             // if the allow list is empty, meaning that no Language is considered frequent, |         // if the allow list is empty, meaning that no Language is considered frequent, | ||||||
|             // then we don't rerun the extraction. |         // then we don't rerun the extraction. | ||||||
|             if !script_language.is_empty() { |         if !script_language.is_empty() { | ||||||
|                 // build a new temporary tokenizer including the allow list. |             // build a new temporary tokenizer including the allow list. | ||||||
|                 let mut tokenizer_builder = TokenizerBuilder::new(); |             let mut builder = tokenizer_builder( | ||||||
|                 if let Some(stop_words) = stop_words { |                 stop_words, | ||||||
|                     tokenizer_builder.stop_words(stop_words); |                 allowed_separators, | ||||||
|                 } |                 dictionary, | ||||||
|                 tokenizer_builder.allow_list(&script_language); |                 Some(&script_language), | ||||||
|                 let tokenizer = tokenizer_builder.build(); |             ); | ||||||
|  |             let tokenizer = builder.build(); | ||||||
|  |  | ||||||
|                 script_language_word_count.clear(); |             script_language_word_count.clear(); | ||||||
|  |  | ||||||
|                 // rerun the extraction. |             // rerun the extraction. | ||||||
|                 extract_tokens_from_document( |             tokens_from_document( | ||||||
|                     &obkv, |                 obkv, | ||||||
|                     searchable_fields, |                 searchable_fields, | ||||||
|                     &tokenizer, |                 &tokenizer, | ||||||
|                     max_positions_per_attributes, |                 max_positions_per_attributes, | ||||||
|                     &mut buffers, |                 del_add, | ||||||
|                     &mut script_language_word_count, |                 buffers, | ||||||
|                     &mut docid_word_positions_sorter, |                 &mut script_language_word_count, | ||||||
|                 )?; |             )?; | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         for (script, languages_frequency) in script_language_word_count { |  | ||||||
|             for (language, _) in languages_frequency { |  | ||||||
|                 let entry = script_language_docids |  | ||||||
|                     .entry((script, language)) |  | ||||||
|                     .or_insert_with(RoaringBitmap::new); |  | ||||||
|                 entry.push(document_id); |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(docid_word_positions_sorter, indexer) |     // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>) | ||||||
|         .map(|reader| (documents_ids, reader, script_language_docids)) |     Ok((&buffers.obkv_buffer, script_language_word_count)) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn extract_tokens_from_document( | /// Extract words mapped with their positions of a document. | ||||||
|  | fn tokens_from_document<'a>( | ||||||
|     obkv: &KvReader<FieldId>, |     obkv: &KvReader<FieldId>, | ||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|     tokenizer: &Tokenizer, |     tokenizer: &Tokenizer, | ||||||
|     max_positions_per_attributes: u32, |     max_positions_per_attributes: u32, | ||||||
|     buffers: &mut Buffers, |     del_add: DelAdd, | ||||||
|  |     buffers: &'a mut Buffers, | ||||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, |     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, | ||||||
|     docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>, | ) -> Result<&'a [u8]> { | ||||||
| ) -> Result<()> { |     buffers.obkv_buffer.clear(); | ||||||
|  |     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); | ||||||
|     for (field_id, field_bytes) in obkv.iter() { |     for (field_id, field_bytes) in obkv.iter() { | ||||||
|  |         // if field is searchable. | ||||||
|         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { |         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { | ||||||
|             let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; |             // extract deletion or addition only. | ||||||
|             buffers.field_buffer.clear(); |             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { | ||||||
|             if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { |                 // parse json. | ||||||
|                 let tokens = process_tokens(tokenizer.tokenize(field)) |                 let value = | ||||||
|                     .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); |                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||||
|  |  | ||||||
|                 for (index, token) in tokens { |                 // prepare writing destination. | ||||||
|                     // if a language has been detected for the token, we update the counter. |                 buffers.obkv_positions_buffer.clear(); | ||||||
|                     if let Some(language) = token.language { |                 let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); | ||||||
|                         let script = token.script; |  | ||||||
|                         let entry = |                 // convert json into a unique string. | ||||||
|                             script_language_word_count.entry(script).or_insert_with(Vec::new); |                 buffers.field_buffer.clear(); | ||||||
|                         match entry.iter_mut().find(|(l, _)| *l == language) { |                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { | ||||||
|                             Some((_, n)) => *n += 1, |                     // create an iterator of token with their positions. | ||||||
|                             None => entry.push((language, 1)), |                     let tokens = process_tokens(tokenizer.tokenize(field)) | ||||||
|  |                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||||
|  |  | ||||||
|  |                     for (index, token) in tokens { | ||||||
|  |                         // if a language has been detected for the token, we update the counter. | ||||||
|  |                         if let Some(language) = token.language { | ||||||
|  |                             let script = token.script; | ||||||
|  |                             let entry = | ||||||
|  |                                 script_language_word_count.entry(script).or_insert_with(Vec::new); | ||||||
|  |                             match entry.iter_mut().find(|(l, _)| *l == language) { | ||||||
|  |                                 Some((_, n)) => *n += 1, | ||||||
|  |                                 None => entry.push((language, 1)), | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // keep a word only if it is not empty and fit in a LMDB key. | ||||||
|  |                         let token = token.lemma().trim(); | ||||||
|  |                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||||
|  |                             let position: u16 = index | ||||||
|  |                                 .try_into() | ||||||
|  |                                 .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||||
|  |                             writer.insert(position, token.as_bytes())?; | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                     let token = token.lemma().trim(); |  | ||||||
|                     if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { |  | ||||||
|                         buffers.key_buffer.truncate(mem::size_of::<u32>()); |  | ||||||
|                         buffers.key_buffer.extend_from_slice(token.as_bytes()); |  | ||||||
|  |  | ||||||
|                         let position: u16 = index |                     // write positions into document. | ||||||
|                             .try_into() |                     let positions = writer.into_inner()?; | ||||||
|                             .map_err(|_| SerializationError::InvalidNumberSerialization)?; |                     document_writer.insert(field_id, positions)?; | ||||||
|                         let position = absolute_from_relative_position(field_id, position); |  | ||||||
|                         docid_word_positions_sorter |  | ||||||
|                             .insert(&buffers.key_buffer, position.to_ne_bytes())?; |  | ||||||
|                     } |  | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(()) |     // returns a KV<FieldId, KV<u16, String>> | ||||||
|  |     Ok(document_writer.into_inner().map(|v| v.as_slice())?) | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Transform a JSON value into a string that can be indexed. | /// Transform a JSON value into a string that can be indexed. | ||||||
| @@ -287,10 +439,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize) | |||||||
|  |  | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| struct Buffers { | struct Buffers { | ||||||
|     // the key buffer is the concatenation of the internal document id with the field id. |  | ||||||
|     // The buffer has to be completelly cleared between documents, |  | ||||||
|     // and the field id part must be cleared between each field. |  | ||||||
|     key_buffer: Vec<u8>, |  | ||||||
|     // the field buffer used for each field's deserialization; it must be cleared between each field. |     // the field buffer used for each field's deserialization; it must be cleared between each field. | ||||||
|     field_buffer: String, |     field_buffer: String, | ||||||
|  |     // buffer used to store the value data containing an obkv. | ||||||
|  |     obkv_buffer: Vec<u8>, | ||||||
|  |     // buffer used to store the value data containing an obkv of tokens with their positions. | ||||||
|  |     obkv_positions_buffer: Vec<u8>, | ||||||
| } | } | ||||||
|   | |||||||
| @@ -4,11 +4,12 @@ use std::io::{self, BufReader}; | |||||||
| use heed::{BytesDecode, BytesEncode}; | use heed::{BytesDecode, BytesEncode}; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, |     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, | ||||||
| }; | }; | ||||||
| use crate::heed_codec::facet::{ | use crate::heed_codec::facet::{ | ||||||
|     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, |     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, | ||||||
| }; | }; | ||||||
|  | use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::Result; | use crate::Result; | ||||||
|  |  | ||||||
| /// Extracts the facet number and the documents ids where this facet number appears. | /// Extracts the facet number and the documents ids where this facet number appears. | ||||||
| @@ -17,7 +18,7 @@ use crate::Result; | |||||||
| /// documents ids from the given chunk of docid facet number positions. | /// documents ids from the given chunk of docid facet number positions. | ||||||
| #[logging_timer::time] | #[logging_timer::time] | ||||||
| pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | ||||||
|     docid_fid_facet_number: grenad::Reader<R>, |     fid_docid_facet_number: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ) -> Result<grenad::Reader<BufReader<File>>> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
| @@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut facet_number_docids_sorter = create_sorter( |     let mut facet_number_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Unstable, |         grenad::SortAlgorithm::Unstable, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     let mut cursor = docid_fid_facet_number.into_cursor()?; |     let mut buffer = Vec::new(); | ||||||
|     while let Some((key_bytes, _)) = cursor.move_on_next()? { |     let mut cursor = fid_docid_facet_number.into_cursor()?; | ||||||
|  |     while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? { | ||||||
|         let (field_id, document_id, number) = |         let (field_id, document_id, number) = | ||||||
|             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); |             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); | ||||||
|  |  | ||||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: number }; |         let key = FacetGroupKey { field_id, level: 0, left_bound: number }; | ||||||
|         let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap(); |         let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap(); | ||||||
|         facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; |  | ||||||
|  |         buffer.clear(); | ||||||
|  |         let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() { | ||||||
|  |             obkv.insert(deladd_key, document_id.to_ne_bytes())?; | ||||||
|  |         } | ||||||
|  |         obkv.finish()?; | ||||||
|  |  | ||||||
|  |         facet_number_docids_sorter.insert(key_bytes, &buffer)?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(facet_number_docids_sorter, indexer) |     sorter_into_reader(facet_number_docids_sorter, indexer) | ||||||
|   | |||||||
| @@ -1,13 +1,15 @@ | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::BufReader; | ||||||
|  | use std::{io, str}; | ||||||
|  |  | ||||||
| use heed::BytesEncode; | use heed::BytesEncode; | ||||||
|  |  | ||||||
| use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | ||||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; | use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; | ||||||
| use crate::heed_codec::StrRefCodec; | use crate::heed_codec::StrRefCodec; | ||||||
| use crate::update::index_documents::merge_cbo_roaring_bitmaps; | use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; | use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; | ||||||
|  | use crate::{FieldId, Result}; | ||||||
|  |  | ||||||
| /// Extracts the facet string and the documents ids where this facet string appear. | /// Extracts the facet string and the documents ids where this facet string appear. | ||||||
| /// | /// | ||||||
| @@ -24,15 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut facet_string_docids_sorter = create_sorter( |     let mut facet_string_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|     let mut cursor = docid_fid_facet_string.into_cursor()?; |     let mut cursor = docid_fid_facet_string.into_cursor()?; | ||||||
|     while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { |     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { | ||||||
|         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); |         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); | ||||||
|         let field_id = FieldId::from_be_bytes(field_id_bytes); |         let field_id = FieldId::from_be_bytes(field_id_bytes); | ||||||
|  |  | ||||||
| @@ -40,21 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|             try_split_array_at::<_, 4>(bytes).unwrap(); |             try_split_array_at::<_, 4>(bytes).unwrap(); | ||||||
|         let document_id = u32::from_be_bytes(document_id_bytes); |         let document_id = u32::from_be_bytes(document_id_bytes); | ||||||
|  |  | ||||||
|         let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; |         let normalized_value = str::from_utf8(normalized_value_bytes)?; | ||||||
|  |         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; | ||||||
|         let normalised_truncated_value: String; |  | ||||||
|         if normalised_value.len() > MAX_FACET_VALUE_LENGTH { |  | ||||||
|             normalised_truncated_value = normalised_value |  | ||||||
|                 .char_indices() |  | ||||||
|                 .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) |  | ||||||
|                 .map(|(_, c)| c) |  | ||||||
|                 .collect(); |  | ||||||
|             normalised_value = normalised_truncated_value.as_str(); |  | ||||||
|         } |  | ||||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; |  | ||||||
|         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); |         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); | ||||||
|         // document id is encoded in native-endian because of the CBO roaring bitmap codec |  | ||||||
|         facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; |         buffer.clear(); | ||||||
|  |         let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { | ||||||
|  |             obkv.insert(deladd_key, document_id.to_ne_bytes())?; | ||||||
|  |         } | ||||||
|  |         obkv.finish()?; | ||||||
|  |         facet_string_docids_sorter.insert(&key_bytes, &buffer)?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(facet_string_docids_sorter, indexer) |     sorter_into_reader(facet_string_docids_sorter, indexer) | ||||||
|   | |||||||
| @@ -1,24 +1,36 @@ | |||||||
|  | use std::borrow::Cow; | ||||||
| use std::collections::{BTreeMap, HashSet}; | use std::collections::{BTreeMap, HashSet}; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
| use std::mem::size_of; | use std::mem::size_of; | ||||||
|  | use std::result::Result as StdResult; | ||||||
|  |  | ||||||
|  | use grenad::Sorter; | ||||||
| use heed::zerocopy::AsBytes; | use heed::zerocopy::AsBytes; | ||||||
| use heed::BytesEncode; | use heed::BytesEncode; | ||||||
|  | use itertools::EitherOrBoth; | ||||||
|  | use ordered_float::OrderedFloat; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::{from_slice, Value}; | use serde_json::{from_slice, Value}; | ||||||
|  | use FilterableValues::{Empty, Null, Values}; | ||||||
|  |  | ||||||
| use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; | use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; | ||||||
| use crate::error::InternalError; | use crate::error::InternalError; | ||||||
| use crate::facet::value_encoding::f64_into_bytes; | use crate::facet::value_encoding::f64_into_bytes; | ||||||
|  | use crate::update::del_add::{DelAdd, KvWriterDelAdd}; | ||||||
| use crate::update::index_documents::{create_writer, writer_into_reader}; | use crate::update::index_documents::{create_writer, writer_into_reader}; | ||||||
| use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; | use crate::{ | ||||||
|  |     CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH, | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | /// The length of the elements that are always in the buffer when inserting new values. | ||||||
|  | const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>(); | ||||||
|  |  | ||||||
| /// The extracted facet values stored in grenad files by type. | /// The extracted facet values stored in grenad files by type. | ||||||
| pub struct ExtractedFacetValues { | pub struct ExtractedFacetValues { | ||||||
|     pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>, |     pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>, | ||||||
|     pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>, |     pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>, | ||||||
|     pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>, |     pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>, | ||||||
|     pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>, |     pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>, | ||||||
|     pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>, |     pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>, | ||||||
| @@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | |||||||
|         max_memory.map(|m| m / 2), |         max_memory.map(|m| m / 2), | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); |     // The tuples represents the Del and Add side for a bitmap | ||||||
|     let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); |     let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||||
|     let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); |     let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||||
|  |     let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||||
|  |  | ||||||
|  |     // We create two buffers for mutable ref issues with closures. | ||||||
|  |     let mut numbers_key_buffer = Vec::new(); | ||||||
|  |     let mut strings_key_buffer = Vec::new(); | ||||||
|  |  | ||||||
|     let mut key_buffer = Vec::new(); |  | ||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { |     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||||
|         let obkv = obkv::KvReader::new(value); |         let obkv = obkv::KvReader::new(value); | ||||||
|  |  | ||||||
|         for (field_id, field_bytes) in obkv.iter() { |         for (field_id, field_bytes) in obkv.iter() { | ||||||
|             if faceted_fields.contains(&field_id) { |             if faceted_fields.contains(&field_id) { | ||||||
|                 key_buffer.clear(); |                 numbers_key_buffer.clear(); | ||||||
|  |                 strings_key_buffer.clear(); | ||||||
|  |  | ||||||
|                 // Set key to the field_id |                 // Set key to the field_id | ||||||
|                 // Note: this encoding is consistent with FieldIdCodec |                 // Note: this encoding is consistent with FieldIdCodec | ||||||
|                 key_buffer.extend_from_slice(&field_id.to_be_bytes()); |                 numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||||
|  |                 strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||||
|  |  | ||||||
|                 // Here, we know already that the document must be added to the “field id exists” database |  | ||||||
|                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); |                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); | ||||||
|                 let document = BEU32::from(document).get(); |                 let document = BEU32::from(document).get(); | ||||||
|  |  | ||||||
|                 facet_exists_docids.entry(field_id).or_default().insert(document); |  | ||||||
|  |  | ||||||
|                 // For the other extraction tasks, prefix the key with the field_id and the document_id |                 // For the other extraction tasks, prefix the key with the field_id and the document_id | ||||||
|                 key_buffer.extend_from_slice(docid_bytes); |                 numbers_key_buffer.extend_from_slice(docid_bytes); | ||||||
|  |                 strings_key_buffer.extend_from_slice(docid_bytes); | ||||||
|  |  | ||||||
|                 let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; |                 let del_add_obkv = obkv::KvReader::new(field_bytes); | ||||||
|  |                 let del_value = match del_add_obkv.get(DelAdd::Deletion) { | ||||||
|  |                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), | ||||||
|  |                     None => None, | ||||||
|  |                 }; | ||||||
|  |                 let add_value = match del_add_obkv.get(DelAdd::Addition) { | ||||||
|  |                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), | ||||||
|  |                     None => None, | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|                 match extract_facet_values( |                 // We insert the document id on the Del and the Add side if the field exists. | ||||||
|                     &value, |                 let (ref mut del_exists, ref mut add_exists) = | ||||||
|                     geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng), |                     facet_exists_docids.entry(field_id).or_default(); | ||||||
|                 ) { |                 let (ref mut del_is_null, ref mut add_is_null) = | ||||||
|                     FilterableValues::Null => { |                     facet_is_null_docids.entry(field_id).or_default(); | ||||||
|                         facet_is_null_docids.entry(field_id).or_default().insert(document); |                 let (ref mut del_is_empty, ref mut add_is_empty) = | ||||||
|                     } |                     facet_is_empty_docids.entry(field_id).or_default(); | ||||||
|                     FilterableValues::Empty => { |  | ||||||
|                         facet_is_empty_docids.entry(field_id).or_default().insert(document); |  | ||||||
|                     } |  | ||||||
|                     FilterableValues::Values { numbers, strings } => { |  | ||||||
|                         // insert facet numbers in sorter |  | ||||||
|                         for number in numbers { |  | ||||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); |  | ||||||
|                             if let Some(value_bytes) = f64_into_bytes(number) { |  | ||||||
|                                 key_buffer.extend_from_slice(&value_bytes); |  | ||||||
|                                 key_buffer.extend_from_slice(&number.to_be_bytes()); |  | ||||||
|  |  | ||||||
|                                 fid_docid_facet_numbers_sorter |                 if del_value.is_some() { | ||||||
|                                     .insert(&key_buffer, ().as_bytes())?; |                     del_exists.insert(document); | ||||||
|                             } |                 } | ||||||
|  |                 if add_value.is_some() { | ||||||
|  |                     add_exists.insert(document); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 let geo_support = | ||||||
|  |                     geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); | ||||||
|  |                 let del_filterable_values = | ||||||
|  |                     del_value.map(|value| extract_facet_values(&value, geo_support)); | ||||||
|  |                 let add_filterable_values = | ||||||
|  |                     add_value.map(|value| extract_facet_values(&value, geo_support)); | ||||||
|  |  | ||||||
|  |                 // Those closures are just here to simplify things a bit. | ||||||
|  |                 let mut insert_numbers_diff = |del_numbers, add_numbers| { | ||||||
|  |                     insert_numbers_diff( | ||||||
|  |                         &mut fid_docid_facet_numbers_sorter, | ||||||
|  |                         &mut numbers_key_buffer, | ||||||
|  |                         del_numbers, | ||||||
|  |                         add_numbers, | ||||||
|  |                     ) | ||||||
|  |                 }; | ||||||
|  |                 let mut insert_strings_diff = |del_strings, add_strings| { | ||||||
|  |                     insert_strings_diff( | ||||||
|  |                         &mut fid_docid_facet_strings_sorter, | ||||||
|  |                         &mut strings_key_buffer, | ||||||
|  |                         del_strings, | ||||||
|  |                         add_strings, | ||||||
|  |                     ) | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |                 match (del_filterable_values, add_filterable_values) { | ||||||
|  |                     (None, None) => (), | ||||||
|  |                     (Some(del_filterable_values), None) => match del_filterable_values { | ||||||
|  |                         Null => { | ||||||
|  |                             del_is_null.insert(document); | ||||||
|                         } |                         } | ||||||
|  |                         Empty => { | ||||||
|                         // insert normalized and original facet string in sorter |                             del_is_empty.insert(document); | ||||||
|                         for (normalized, original) in |                         } | ||||||
|                             strings.into_iter().filter(|(n, _)| !n.is_empty()) |                         Values { numbers, strings } => { | ||||||
|                         { |                             insert_numbers_diff(numbers, vec![])?; | ||||||
|                             let normalized_truncated_value: String = normalized |                             insert_strings_diff(strings, vec![])?; | ||||||
|                                 .char_indices() |                         } | ||||||
|                                 .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) |                     }, | ||||||
|                                 .map(|(_, c)| c) |                     (None, Some(add_filterable_values)) => match add_filterable_values { | ||||||
|                                 .collect(); |                         Null => { | ||||||
|  |                             add_is_null.insert(document); | ||||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); |                         } | ||||||
|                             key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); |                         Empty => { | ||||||
|                             fid_docid_facet_strings_sorter |                             add_is_empty.insert(document); | ||||||
|                                 .insert(&key_buffer, original.as_bytes())?; |                         } | ||||||
|  |                         Values { numbers, strings } => { | ||||||
|  |                             insert_numbers_diff(vec![], numbers)?; | ||||||
|  |                             insert_strings_diff(vec![], strings)?; | ||||||
|  |                         } | ||||||
|  |                     }, | ||||||
|  |                     (Some(del_filterable_values), Some(add_filterable_values)) => { | ||||||
|  |                         match (del_filterable_values, add_filterable_values) { | ||||||
|  |                             (Null, Null) | (Empty, Empty) => (), | ||||||
|  |                             (Null, Empty) => { | ||||||
|  |                                 del_is_null.insert(document); | ||||||
|  |                                 add_is_empty.insert(document); | ||||||
|  |                             } | ||||||
|  |                             (Empty, Null) => { | ||||||
|  |                                 del_is_empty.insert(document); | ||||||
|  |                                 add_is_null.insert(document); | ||||||
|  |                             } | ||||||
|  |                             (Null, Values { numbers, strings }) => { | ||||||
|  |                                 insert_numbers_diff(vec![], numbers)?; | ||||||
|  |                                 insert_strings_diff(vec![], strings)?; | ||||||
|  |                                 del_is_null.insert(document); | ||||||
|  |                             } | ||||||
|  |                             (Empty, Values { numbers, strings }) => { | ||||||
|  |                                 insert_numbers_diff(vec![], numbers)?; | ||||||
|  |                                 insert_strings_diff(vec![], strings)?; | ||||||
|  |                                 del_is_empty.insert(document); | ||||||
|  |                             } | ||||||
|  |                             (Values { numbers, strings }, Null) => { | ||||||
|  |                                 add_is_null.insert(document); | ||||||
|  |                                 insert_numbers_diff(numbers, vec![])?; | ||||||
|  |                                 insert_strings_diff(strings, vec![])?; | ||||||
|  |                             } | ||||||
|  |                             (Values { numbers, strings }, Empty) => { | ||||||
|  |                                 add_is_empty.insert(document); | ||||||
|  |                                 insert_numbers_diff(numbers, vec![])?; | ||||||
|  |                                 insert_strings_diff(strings, vec![])?; | ||||||
|  |                             } | ||||||
|  |                             ( | ||||||
|  |                                 Values { numbers: del_numbers, strings: del_strings }, | ||||||
|  |                                 Values { numbers: add_numbers, strings: add_strings }, | ||||||
|  |                             ) => { | ||||||
|  |                                 insert_numbers_diff(del_numbers, add_numbers)?; | ||||||
|  |                                 insert_strings_diff(del_strings, add_strings)?; | ||||||
|  |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|     let mut facet_exists_docids_writer = create_writer( |     let mut facet_exists_docids_writer = create_writer( | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         tempfile::tempfile()?, |         tempfile::tempfile()?, | ||||||
|     ); |     ); | ||||||
|     for (fid, bitmap) in facet_exists_docids.into_iter() { |     for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() { | ||||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); |         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||||
|         facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; |         facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||||
|     } |     } | ||||||
|     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; |     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; | ||||||
|  |  | ||||||
| @@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | |||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         tempfile::tempfile()?, |         tempfile::tempfile()?, | ||||||
|     ); |     ); | ||||||
|     for (fid, bitmap) in facet_is_null_docids.into_iter() { |     for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() { | ||||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); |         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||||
|         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; |         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||||
|     } |     } | ||||||
|     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; |     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; | ||||||
|  |  | ||||||
| @@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | |||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         tempfile::tempfile()?, |         tempfile::tempfile()?, | ||||||
|     ); |     ); | ||||||
|     for (fid, bitmap) in facet_is_empty_docids.into_iter() { |     for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() { | ||||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); |         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||||
|         facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; |         facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||||
|     } |     } | ||||||
|     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; |     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; | ||||||
|  |  | ||||||
|     Ok(ExtractedFacetValues { |     Ok(ExtractedFacetValues { | ||||||
|         docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, |         fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, | ||||||
|         docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, |         fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||||
|         fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, |         fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, | ||||||
|         fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, |         fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, | ||||||
|         fid_facet_exists_docids_chunk: facet_exists_docids_reader, |         fid_facet_exists_docids_chunk: facet_exists_docids_reader, | ||||||
|     }) |     }) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Generates a vector of bytes containing a DelAdd obkv with two bitmaps. | ||||||
|  | fn deladd_obkv_cbo_roaring_bitmaps( | ||||||
|  |     buffer: &mut Vec<u8>, | ||||||
|  |     del_bitmap: &RoaringBitmap, | ||||||
|  |     add_bitmap: &RoaringBitmap, | ||||||
|  | ) -> io::Result<()> { | ||||||
|  |     buffer.clear(); | ||||||
|  |     let mut obkv = KvWriterDelAdd::new(buffer); | ||||||
|  |     let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap(); | ||||||
|  |     let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap(); | ||||||
|  |     obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; | ||||||
|  |     obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; | ||||||
|  |     obkv.finish() | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Truncates a string to the biggest valid LMDB key size. | ||||||
|  | fn truncate_string(s: String) -> String { | ||||||
|  |     s.char_indices() | ||||||
|  |         .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) | ||||||
|  |         .map(|(_, c)| c) | ||||||
|  |         .collect() | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Computes the diff between both Del and Add numbers and | ||||||
|  | /// only inserts the parts that differ in the sorter. | ||||||
|  | fn insert_numbers_diff<MF>( | ||||||
|  |     fid_docid_facet_numbers_sorter: &mut Sorter<MF>, | ||||||
|  |     key_buffer: &mut Vec<u8>, | ||||||
|  |     mut del_numbers: Vec<f64>, | ||||||
|  |     mut add_numbers: Vec<f64>, | ||||||
|  | ) -> Result<()> | ||||||
|  | where | ||||||
|  |     MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>, | ||||||
|  | { | ||||||
|  |     // We sort and dedup the float numbers | ||||||
|  |     del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); | ||||||
|  |     add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); | ||||||
|  |     del_numbers.dedup_by_key(|f| OrderedFloat(*f)); | ||||||
|  |     add_numbers.dedup_by_key(|f| OrderedFloat(*f)); | ||||||
|  |  | ||||||
|  |     let merged_numbers_iter = itertools::merge_join_by( | ||||||
|  |         del_numbers.into_iter().map(OrderedFloat), | ||||||
|  |         add_numbers.into_iter().map(OrderedFloat), | ||||||
|  |         |del, add| del.cmp(add), | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     // insert facet numbers in sorter | ||||||
|  |     for eob in merged_numbers_iter { | ||||||
|  |         key_buffer.truncate(TRUNCATE_SIZE); | ||||||
|  |         match eob { | ||||||
|  |             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||||
|  |             EitherOrBoth::Left(OrderedFloat(number)) => { | ||||||
|  |                 if let Some(value_bytes) = f64_into_bytes(number) { | ||||||
|  |                     key_buffer.extend_from_slice(&value_bytes); | ||||||
|  |                     key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||||
|  |  | ||||||
|  |                     // We insert only the Del part of the Obkv to inform | ||||||
|  |                     // that we only want to remove all those numbers. | ||||||
|  |                     let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                     obkv.insert(DelAdd::Deletion, ().as_bytes())?; | ||||||
|  |                     let bytes = obkv.into_inner()?; | ||||||
|  |                     fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             EitherOrBoth::Right(OrderedFloat(number)) => { | ||||||
|  |                 if let Some(value_bytes) = f64_into_bytes(number) { | ||||||
|  |                     key_buffer.extend_from_slice(&value_bytes); | ||||||
|  |                     key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||||
|  |  | ||||||
|  |                     // We insert only the Add part of the Obkv to inform | ||||||
|  |                     // that we only want to remove all those numbers. | ||||||
|  |                     let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                     obkv.insert(DelAdd::Addition, ().as_bytes())?; | ||||||
|  |                     let bytes = obkv.into_inner()?; | ||||||
|  |                     fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Computes the diff between both Del and Add strings and | ||||||
|  | /// only inserts the parts that differ in the sorter. | ||||||
|  | fn insert_strings_diff<MF>( | ||||||
|  |     fid_docid_facet_strings_sorter: &mut Sorter<MF>, | ||||||
|  |     key_buffer: &mut Vec<u8>, | ||||||
|  |     mut del_strings: Vec<(String, String)>, | ||||||
|  |     mut add_strings: Vec<(String, String)>, | ||||||
|  | ) -> Result<()> | ||||||
|  | where | ||||||
|  |     MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>, | ||||||
|  | { | ||||||
|  |     // We sort and dedup the normalized and original strings | ||||||
|  |     del_strings.sort_unstable(); | ||||||
|  |     add_strings.sort_unstable(); | ||||||
|  |     del_strings.dedup(); | ||||||
|  |     add_strings.dedup(); | ||||||
|  |  | ||||||
|  |     let merged_strings_iter = itertools::merge_join_by( | ||||||
|  |         del_strings.into_iter().filter(|(n, _)| !n.is_empty()), | ||||||
|  |         add_strings.into_iter().filter(|(n, _)| !n.is_empty()), | ||||||
|  |         |del, add| del.cmp(add), | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     // insert normalized and original facet string in sorter | ||||||
|  |     for eob in merged_strings_iter { | ||||||
|  |         key_buffer.truncate(TRUNCATE_SIZE); | ||||||
|  |         match eob { | ||||||
|  |             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||||
|  |             EitherOrBoth::Left((normalized, original)) => { | ||||||
|  |                 let truncated = truncate_string(normalized); | ||||||
|  |                 key_buffer.extend_from_slice(truncated.as_bytes()); | ||||||
|  |  | ||||||
|  |                 let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                 obkv.insert(DelAdd::Deletion, original)?; | ||||||
|  |                 let bytes = obkv.into_inner()?; | ||||||
|  |                 fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; | ||||||
|  |             } | ||||||
|  |             EitherOrBoth::Right((normalized, original)) => { | ||||||
|  |                 let truncated = truncate_string(normalized); | ||||||
|  |                 key_buffer.extend_from_slice(truncated.as_bytes()); | ||||||
|  |  | ||||||
|  |                 let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                 obkv.insert(DelAdd::Addition, original)?; | ||||||
|  |                 let bytes = obkv.into_inner()?; | ||||||
|  |                 fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
| /// Represent what a document field contains. | /// Represent what a document field contains. | ||||||
| enum FilterableValues { | enum FilterableValues { | ||||||
|     /// Corresponds to the JSON `null` value. |     /// Corresponds to the JSON `null` value. | ||||||
| @@ -182,6 +409,7 @@ enum FilterableValues { | |||||||
|     Values { numbers: Vec<f64>, strings: Vec<(String, String)> }, |     Values { numbers: Vec<f64>, strings: Vec<(String, String)> }, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Extracts the facet values of a JSON field. | ||||||
| fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { | fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { | ||||||
|     fn inner_extract_facet_values( |     fn inner_extract_facet_values( | ||||||
|         value: &Value, |         value: &Value, | ||||||
|   | |||||||
| @@ -1,16 +1,18 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
|  |  | ||||||
| use grenad::Sorter; | use obkv::KvReaderU16; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, |     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, | ||||||
|     try_split_array_at, GrenadParameters, MergeFn, |     GrenadParameters, | ||||||
| }; | }; | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||||
| use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
|  | use crate::Result; | ||||||
|  |  | ||||||
|  | const MAX_COUNTED_WORDS: usize = 30; | ||||||
|  |  | ||||||
| /// Extracts the field id word count and the documents ids where | /// Extracts the field id word count and the documents ids where | ||||||
| /// this field id with this amount of words appear. | /// this field id with this amount of words appear. | ||||||
| @@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut fid_word_count_docids_sorter = create_sorter( |     let mut fid_word_count_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Unstable, |         grenad::SortAlgorithm::Unstable, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     // This map is assumed to not consume a lot of memory. |     let mut key_buffer = Vec::new(); | ||||||
|     let mut document_fid_wordcount = HashMap::new(); |     let mut value_buffer = Vec::new(); | ||||||
|     let mut current_document_id = None; |  | ||||||
|  |  | ||||||
|     let mut cursor = docid_word_positions.into_cursor()?; |     let mut cursor = docid_word_positions.into_cursor()?; | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|         let (document_id_bytes, _word_bytes) = try_split_array_at(key) |         let (document_id_bytes, fid_bytes) = try_split_array_at(key) | ||||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|         let document_id = u32::from_be_bytes(document_id_bytes); |         let document_id = u32::from_be_bytes(document_id_bytes); | ||||||
|  |  | ||||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); |         let del_add_reader = KvReaderDelAdd::new(value); | ||||||
|         if curr_document_id != document_id { |         let deletion = del_add_reader | ||||||
|             drain_document_fid_wordcount_into_sorter( |             // get deleted words | ||||||
|                 &mut fid_word_count_docids_sorter, |             .get(DelAdd::Deletion) | ||||||
|                 &mut document_fid_wordcount, |             // count deleted words | ||||||
|                 curr_document_id, |             .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()) | ||||||
|             )?; |             // keep the count if under or equal to MAX_COUNTED_WORDS | ||||||
|             current_document_id = Some(document_id); |             .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); | ||||||
|  |         let addition = del_add_reader | ||||||
|  |             // get added words | ||||||
|  |             .get(DelAdd::Addition) | ||||||
|  |             // count added words | ||||||
|  |             .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count()) | ||||||
|  |             // keep the count if under or equal to MAX_COUNTED_WORDS | ||||||
|  |             .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); | ||||||
|  |  | ||||||
|  |         if deletion != addition { | ||||||
|  |             // Insert the deleted word count in the sorter if it exists. | ||||||
|  |             if let Some(word_count) = deletion { | ||||||
|  |                 value_buffer.clear(); | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key_buffer.clear(); | ||||||
|  |                 key_buffer.extend_from_slice(fid_bytes); | ||||||
|  |                 key_buffer.push(word_count as u8); | ||||||
|  |                 fid_word_count_docids_sorter | ||||||
|  |                     .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||||
|  |             } | ||||||
|  |             // Insert the added word count in the sorter if it exists. | ||||||
|  |             if let Some(word_count) = addition { | ||||||
|  |                 value_buffer.clear(); | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key_buffer.clear(); | ||||||
|  |                 key_buffer.extend_from_slice(fid_bytes); | ||||||
|  |                 key_buffer.push(word_count as u8); | ||||||
|  |                 fid_word_count_docids_sorter | ||||||
|  |                     .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for position in read_u32_ne_bytes(value) { |  | ||||||
|             let (field_id, _) = relative_from_absolute_position(position); |  | ||||||
|  |  | ||||||
|             let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); |  | ||||||
|             *value += 1; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if let Some(document_id) = current_document_id { |  | ||||||
|         // We must make sure that don't lose the current document field id |  | ||||||
|         // word count map if we break because we reached the end of the chunk. |  | ||||||
|         drain_document_fid_wordcount_into_sorter( |  | ||||||
|             &mut fid_word_count_docids_sorter, |  | ||||||
|             &mut document_fid_wordcount, |  | ||||||
|             document_id, |  | ||||||
|         )?; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     sorter_into_reader(fid_word_count_docids_sorter, indexer) |     sorter_into_reader(fid_word_count_docids_sorter, indexer) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn drain_document_fid_wordcount_into_sorter( |  | ||||||
|     fid_word_count_docids_sorter: &mut Sorter<MergeFn>, |  | ||||||
|     document_fid_wordcount: &mut HashMap<FieldId, u32>, |  | ||||||
|     document_id: DocumentId, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut key_buffer = Vec::new(); |  | ||||||
|  |  | ||||||
|     for (fid, count) in document_fid_wordcount.drain() { |  | ||||||
|         if count <= 30 { |  | ||||||
|             key_buffer.clear(); |  | ||||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); |  | ||||||
|             key_buffer.push(count as u8); |  | ||||||
|  |  | ||||||
|             fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use serde_json::Value; | |||||||
|  |  | ||||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||||
| use crate::error::GeoError; | use crate::error::GeoError; | ||||||
|  | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::update::index_documents::extract_finite_float_from_value; | use crate::update::index_documents::extract_finite_float_from_value; | ||||||
| use crate::{FieldId, InternalError, Result}; | use crate::{FieldId, InternalError, Result}; | ||||||
|  |  | ||||||
| @@ -30,39 +31,71 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | |||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { |     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||||
|         let obkv = obkv::KvReader::new(value); |         let obkv = obkv::KvReader::new(value); | ||||||
|         // since we only needs the primary key when we throw an error we create this getter to |         // since we only need the primary key when we throw an error | ||||||
|         // lazily get it when needed |         // we create this getter to lazily get it when needed | ||||||
|         let document_id = || -> Value { |         let document_id = || -> Value { | ||||||
|             let document_id = obkv.get(primary_key_id).unwrap(); |             let document_id = obkv.get(primary_key_id).unwrap(); | ||||||
|             serde_json::from_slice(document_id).unwrap() |             serde_json::from_slice(document_id).unwrap() | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // first we get the two fields |         // first we get the two fields | ||||||
|         let lat = obkv.get(lat_fid); |         match (obkv.get(lat_fid), obkv.get(lng_fid)) { | ||||||
|         let lng = obkv.get(lng_fid); |             (Some(lat), Some(lng)) => { | ||||||
|  |                 let deladd_lat_obkv = KvReaderDelAdd::new(lat); | ||||||
|  |                 let deladd_lng_obkv = KvReaderDelAdd::new(lng); | ||||||
|  |  | ||||||
|         if let Some((lat, lng)) = lat.zip(lng) { |                 // then we extract the values | ||||||
|             // then we extract the values |                 let del_lat_lng = deladd_lat_obkv | ||||||
|             let lat = extract_finite_float_from_value( |                     .get(DelAdd::Deletion) | ||||||
|                 serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, |                     .zip(deladd_lng_obkv.get(DelAdd::Deletion)) | ||||||
|             ) |                     .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) | ||||||
|             .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; |                     .transpose()?; | ||||||
|  |                 let add_lat_lng = deladd_lat_obkv | ||||||
|  |                     .get(DelAdd::Addition) | ||||||
|  |                     .zip(deladd_lng_obkv.get(DelAdd::Addition)) | ||||||
|  |                     .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) | ||||||
|  |                     .transpose()?; | ||||||
|  |  | ||||||
|             let lng = extract_finite_float_from_value( |                 if del_lat_lng != add_lat_lng { | ||||||
|                 serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, |                     let mut obkv = KvWriterDelAdd::memory(); | ||||||
|             ) |                     if let Some([lat, lng]) = del_lat_lng { | ||||||
|             .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; |                         #[allow(clippy::drop_non_drop)] | ||||||
|  |                         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||||
|             #[allow(clippy::drop_non_drop)] |                         obkv.insert(DelAdd::Deletion, bytes)?; | ||||||
|             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; |                     } | ||||||
|             writer.insert(docid_bytes, bytes)?; |                     if let Some([lat, lng]) = add_lat_lng { | ||||||
|         } else if lat.is_none() && lng.is_some() { |                         #[allow(clippy::drop_non_drop)] | ||||||
|             return Err(GeoError::MissingLatitude { document_id: document_id() })?; |                         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||||
|         } else if lat.is_some() && lng.is_none() { |                         obkv.insert(DelAdd::Addition, bytes)?; | ||||||
|             return Err(GeoError::MissingLongitude { document_id: document_id() })?; |                     } | ||||||
|  |                     let bytes = obkv.into_inner()?; | ||||||
|  |                     writer.insert(docid_bytes, bytes)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             (None, Some(_)) => { | ||||||
|  |                 return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) | ||||||
|  |             } | ||||||
|  |             (Some(_), None) => { | ||||||
|  |                 return Err(GeoError::MissingLongitude { document_id: document_id() }.into()) | ||||||
|  |             } | ||||||
|  |             (None, None) => (), | ||||||
|         } |         } | ||||||
|         // else => the _geo object was `null`, there is nothing to do |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     writer_into_reader(writer) |     writer_into_reader(writer) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Extract the finite floats lat and lng from two bytes slices. | ||||||
|  | fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> { | ||||||
|  |     let lat = extract_finite_float_from_value( | ||||||
|  |         serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||||
|  |     ) | ||||||
|  |     .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; | ||||||
|  |  | ||||||
|  |     let lng = extract_finite_float_from_value( | ||||||
|  |         serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||||
|  |     ) | ||||||
|  |     .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; | ||||||
|  |  | ||||||
|  |     Ok([lat, lng]) | ||||||
|  | } | ||||||
|   | |||||||
| @@ -1,13 +1,24 @@ | |||||||
|  | use std::cmp::Ordering; | ||||||
| use std::convert::TryFrom; | use std::convert::TryFrom; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader, BufWriter}; | ||||||
|  | use std::mem::size_of; | ||||||
|  | use std::str::from_utf8; | ||||||
|  |  | ||||||
| use bytemuck::cast_slice; | use bytemuck::cast_slice; | ||||||
|  | use grenad::Writer; | ||||||
|  | use itertools::EitherOrBoth; | ||||||
|  | use ordered_float::OrderedFloat; | ||||||
| use serde_json::{from_slice, Value}; | use serde_json::{from_slice, Value}; | ||||||
|  |  | ||||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||||
| use crate::error::UserError; | use crate::error::UserError; | ||||||
| use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
|  | use crate::update::index_documents::helpers::try_split_at; | ||||||
|  | use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; | ||||||
|  |  | ||||||
|  | /// The length of the elements that are always in the buffer when inserting new values. | ||||||
|  | const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); | ||||||
|  |  | ||||||
| /// Extracts the embedding vector contained in each document under the `_vectors` field. | /// Extracts the embedding vector contained in each document under the `_vectors` field. | ||||||
| /// | /// | ||||||
| @@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; | |||||||
| pub fn extract_vector_points<R: io::Read + io::Seek>( | pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||||
|     obkv_documents: grenad::Reader<R>, |     obkv_documents: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     primary_key_id: FieldId, |  | ||||||
|     vectors_fid: FieldId, |     vectors_fid: FieldId, | ||||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ) -> Result<grenad::Reader<BufReader<File>>> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
| @@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | |||||||
|         tempfile::tempfile()?, |         tempfile::tempfile()?, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|  |     let mut key_buffer = Vec::new(); | ||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |         // this must always be serialized as (docid, external_docid); | ||||||
|  |         let (docid_bytes, external_id_bytes) = | ||||||
|  |             try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap(); | ||||||
|  |         debug_assert!(from_utf8(external_id_bytes).is_ok()); | ||||||
|  |  | ||||||
|         let obkv = obkv::KvReader::new(value); |         let obkv = obkv::KvReader::new(value); | ||||||
|  |         key_buffer.clear(); | ||||||
|  |         key_buffer.extend_from_slice(docid_bytes); | ||||||
|  |  | ||||||
|         // since we only need the primary key when we throw an error we create this getter to |         // since we only need the primary key when we throw an error we create this getter to | ||||||
|         // lazily get it when needed |         // lazily get it when needed | ||||||
|         let document_id = || -> Value { |         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; | ||||||
|             let document_id = obkv.get(primary_key_id).unwrap(); |  | ||||||
|             from_slice(document_id).unwrap() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         // first we retrieve the _vectors field |         // first we retrieve the _vectors field | ||||||
|         if let Some(vectors) = obkv.get(vectors_fid) { |         if let Some(value) = obkv.get(vectors_fid) { | ||||||
|             // extract the vectors |             let vectors_obkv = KvReaderDelAdd::new(value); | ||||||
|             let vectors = match from_slice(vectors) { |  | ||||||
|                 Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors), |  | ||||||
|                 Err(_) => { |  | ||||||
|                     return Err(UserError::InvalidVectorsType { |  | ||||||
|                         document_id: document_id(), |  | ||||||
|                         value: from_slice(vectors).map_err(InternalError::SerdeJson)?, |  | ||||||
|                     } |  | ||||||
|                     .into()) |  | ||||||
|                 } |  | ||||||
|             }; |  | ||||||
|  |  | ||||||
|             if let Some(vectors) = vectors { |             // then we extract the values | ||||||
|                 for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) { |             let del_vectors = vectors_obkv | ||||||
|                     let index = u16::try_from(i).unwrap(); |                 .get(DelAdd::Deletion) | ||||||
|                     let mut key = docid_bytes.to_vec(); |                 .map(|vectors| extract_vectors(vectors, document_id)) | ||||||
|                     key.extend_from_slice(&index.to_be_bytes()); |                 .transpose()? | ||||||
|                     let bytes = cast_slice(&vector); |                 .flatten(); | ||||||
|                     writer.insert(key, bytes)?; |             let add_vectors = vectors_obkv | ||||||
|                 } |                 .get(DelAdd::Addition) | ||||||
|             } |                 .map(|vectors| extract_vectors(vectors, document_id)) | ||||||
|  |                 .transpose()? | ||||||
|  |                 .flatten(); | ||||||
|  |  | ||||||
|  |             // and we finally push the unique vectors into the writer | ||||||
|  |             push_vectors_diff( | ||||||
|  |                 &mut writer, | ||||||
|  |                 &mut key_buffer, | ||||||
|  |                 del_vectors.unwrap_or_default(), | ||||||
|  |                 add_vectors.unwrap_or_default(), | ||||||
|  |             )?; | ||||||
|         } |         } | ||||||
|         // else => the `_vectors` object was `null`, there is nothing to do |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     writer_into_reader(writer) |     writer_into_reader(writer) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Computes the diff between both Del and Add vectors and | ||||||
|  | /// only inserts the parts that differ in the writer. | ||||||
|  | fn push_vectors_diff( | ||||||
|  |     writer: &mut Writer<BufWriter<File>>, | ||||||
|  |     key_buffer: &mut Vec<u8>, | ||||||
|  |     mut del_vectors: Vec<Vec<f32>>, | ||||||
|  |     mut add_vectors: Vec<Vec<f32>>, | ||||||
|  | ) -> Result<()> { | ||||||
|  |     // We sort and dedup the vectors | ||||||
|  |     del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||||
|  |     add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||||
|  |     del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||||
|  |     add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||||
|  |  | ||||||
|  |     let merged_vectors_iter = | ||||||
|  |         itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); | ||||||
|  |  | ||||||
|  |     // insert vectors into the writer | ||||||
|  |     for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { | ||||||
|  |         // Generate the key by extending the unique index to it. | ||||||
|  |         key_buffer.truncate(TRUNCATE_SIZE); | ||||||
|  |         let index = u16::try_from(i).unwrap(); | ||||||
|  |         key_buffer.extend_from_slice(&index.to_be_bytes()); | ||||||
|  |  | ||||||
|  |         match eob { | ||||||
|  |             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||||
|  |             EitherOrBoth::Left(vector) => { | ||||||
|  |                 // We insert only the Del part of the Obkv to inform | ||||||
|  |                 // that we only want to remove all those vectors. | ||||||
|  |                 let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                 obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; | ||||||
|  |                 let bytes = obkv.into_inner()?; | ||||||
|  |                 writer.insert(&key_buffer, bytes)?; | ||||||
|  |             } | ||||||
|  |             EitherOrBoth::Right(vector) => { | ||||||
|  |                 // We insert only the Add part of the Obkv to inform | ||||||
|  |                 // that we only want to add all those vectors. | ||||||
|  |                 let mut obkv = KvWriterDelAdd::memory(); | ||||||
|  |                 obkv.insert(DelAdd::Addition, cast_slice(&vector))?; | ||||||
|  |                 let bytes = obkv.into_inner()?; | ||||||
|  |                 writer.insert(&key_buffer, bytes)?; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Compares two vectors by using the OrderedFloat helper. | ||||||
|  | fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { | ||||||
|  |     a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Extracts the vectors from a JSON value. | ||||||
|  | fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> { | ||||||
|  |     match from_slice(value) { | ||||||
|  |         Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)), | ||||||
|  |         Err(_) => Err(UserError::InvalidVectorsType { | ||||||
|  |             document_id: document_id(), | ||||||
|  |             value: from_slice(value).map_err(InternalError::SerdeJson)?, | ||||||
|  |         } | ||||||
|  |         .into()), | ||||||
|  |     } | ||||||
|  | } | ||||||
|   | |||||||
| @@ -1,18 +1,20 @@ | |||||||
| use std::collections::HashSet; | use std::collections::{BTreeSet, HashSet}; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
| use std::iter::FromIterator; |  | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use heed::BytesDecode; | ||||||
|  | use obkv::KvReaderU16; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, |     create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, | ||||||
|     try_split_array_at, GrenadParameters, |     try_split_array_at, writer_into_reader, GrenadParameters, | ||||||
| }; | }; | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
|  | use crate::heed_codec::StrBEU16Codec; | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||||
| use crate::update::index_documents::helpers::read_u32_ne_bytes; | use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::{relative_from_absolute_position, FieldId, Result}; | use crate::update::MergeFn; | ||||||
|  | use crate::{DocumentId, FieldId, Result}; | ||||||
|  |  | ||||||
| /// Extracts the word and the documents ids where this word appear. | /// Extracts the word and the documents ids where this word appear. | ||||||
| /// | /// | ||||||
| @@ -26,65 +28,152 @@ pub fn extract_word_docids<R: io::Read + io::Seek>( | |||||||
|     docid_word_positions: grenad::Reader<R>, |     docid_word_positions: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     exact_attributes: &HashSet<FieldId>, |     exact_attributes: &HashSet<FieldId>, | ||||||
| ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { | ) -> Result<( | ||||||
|  |     grenad::Reader<BufReader<File>>, | ||||||
|  |     grenad::Reader<BufReader<File>>, | ||||||
|  |     grenad::Reader<BufReader<File>>, | ||||||
|  | )> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
|  |  | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |  | ||||||
|     let mut word_docids_sorter = create_sorter( |     let mut word_fid_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Unstable, |         grenad::SortAlgorithm::Unstable, | ||||||
|         merge_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory.map(|x| x / 2), |         max_memory.map(|x| x / 3), | ||||||
|  |     ); | ||||||
|  |     let mut key_buffer = Vec::new(); | ||||||
|  |     let mut del_words = BTreeSet::new(); | ||||||
|  |     let mut add_words = BTreeSet::new(); | ||||||
|  |     let mut cursor = docid_word_positions.into_cursor()?; | ||||||
|  |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |         let (document_id_bytes, fid_bytes) = try_split_array_at(key) | ||||||
|  |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|  |         let (fid_bytes, _) = try_split_array_at(fid_bytes) | ||||||
|  |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|  |         let document_id = u32::from_be_bytes(document_id_bytes); | ||||||
|  |         let fid = u16::from_be_bytes(fid_bytes); | ||||||
|  |  | ||||||
|  |         let del_add_reader = KvReaderDelAdd::new(value); | ||||||
|  |         // extract all unique words to remove. | ||||||
|  |         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { | ||||||
|  |             for (_pos, word) in KvReaderU16::new(deletion).iter() { | ||||||
|  |                 del_words.insert(word.to_vec()); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // extract all unique additional words. | ||||||
|  |         if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||||
|  |             for (_pos, word) in KvReaderU16::new(addition).iter() { | ||||||
|  |                 add_words.insert(word.to_vec()); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         words_into_sorter( | ||||||
|  |             document_id, | ||||||
|  |             fid, | ||||||
|  |             &mut key_buffer, | ||||||
|  |             &del_words, | ||||||
|  |             &add_words, | ||||||
|  |             &mut word_fid_docids_sorter, | ||||||
|  |         )?; | ||||||
|  |  | ||||||
|  |         del_words.clear(); | ||||||
|  |         add_words.clear(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     let mut word_docids_sorter = create_sorter( | ||||||
|  |         grenad::SortAlgorithm::Unstable, | ||||||
|  |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|  |         indexer.chunk_compression_type, | ||||||
|  |         indexer.chunk_compression_level, | ||||||
|  |         indexer.max_nb_chunks, | ||||||
|  |         max_memory.map(|x| x / 3), | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     let mut exact_word_docids_sorter = create_sorter( |     let mut exact_word_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Unstable, |         grenad::SortAlgorithm::Unstable, | ||||||
|         merge_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory.map(|x| x / 2), |         max_memory.map(|x| x / 3), | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     let mut value_buffer = Vec::new(); |     let mut word_fid_docids_writer = create_writer( | ||||||
|     let mut cursor = docid_word_positions.into_cursor()?; |         indexer.chunk_compression_type, | ||||||
|     while let Some((key, positions)) = cursor.move_on_next()? { |         indexer.chunk_compression_level, | ||||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) |         tempfile::tempfile()?, | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; | ||||||
|  |     // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. | ||||||
|  |     while let Some((key, value)) = iter.next()? { | ||||||
|  |         // only keep the value if there is a change to apply in the DB. | ||||||
|  |         if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { | ||||||
|  |             word_fid_docids_writer.insert(key, value)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let (word, fid) = StrBEU16Codec::bytes_decode(key) | ||||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|         let document_id = u32::from_be_bytes(document_id_bytes); |  | ||||||
|  |  | ||||||
|         let bitmap = RoaringBitmap::from_iter(Some(document_id)); |         // every word contained in an attribute set to exact must be pushed in the exact_words list. | ||||||
|         serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; |         if exact_attributes.contains(&fid) { | ||||||
|  |             exact_word_docids_sorter.insert(word.as_bytes(), value)?; | ||||||
|         // If there are no exact attributes, we do not need to iterate over positions. |  | ||||||
|         if exact_attributes.is_empty() { |  | ||||||
|             word_docids_sorter.insert(word_bytes, &value_buffer)?; |  | ||||||
|         } else { |         } else { | ||||||
|             let mut added_to_exact = false; |             word_docids_sorter.insert(word.as_bytes(), value)?; | ||||||
|             let mut added_to_word_docids = false; |  | ||||||
|             for position in read_u32_ne_bytes(positions) { |  | ||||||
|                 // as soon as we know that this word had been to both readers, we don't need to |  | ||||||
|                 // iterate over the positions. |  | ||||||
|                 if added_to_exact && added_to_word_docids { |  | ||||||
|                     break; |  | ||||||
|                 } |  | ||||||
|                 let (fid, _) = relative_from_absolute_position(position); |  | ||||||
|                 if exact_attributes.contains(&fid) && !added_to_exact { |  | ||||||
|                     exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; |  | ||||||
|                     added_to_exact = true; |  | ||||||
|                 } else if !added_to_word_docids { |  | ||||||
|                     word_docids_sorter.insert(word_bytes, &value_buffer)?; |  | ||||||
|                     added_to_word_docids = true; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(( |     Ok(( | ||||||
|         sorter_into_reader(word_docids_sorter, indexer)?, |         sorter_into_reader(word_docids_sorter, indexer)?, | ||||||
|         sorter_into_reader(exact_word_docids_sorter, indexer)?, |         sorter_into_reader(exact_word_docids_sorter, indexer)?, | ||||||
|  |         writer_into_reader(word_fid_docids_writer)?, | ||||||
|     )) |     )) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn words_into_sorter( | ||||||
|  |     document_id: DocumentId, | ||||||
|  |     fid: FieldId, | ||||||
|  |     key_buffer: &mut Vec<u8>, | ||||||
|  |     del_words: &BTreeSet<Vec<u8>>, | ||||||
|  |     add_words: &BTreeSet<Vec<u8>>, | ||||||
|  |     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||||
|  | ) -> Result<()> { | ||||||
|  |     puffin::profile_function!(); | ||||||
|  |  | ||||||
|  |     use itertools::merge_join_by; | ||||||
|  |     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|  |     for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) { | ||||||
|  |         buffer.clear(); | ||||||
|  |         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         let word_bytes = match eob { | ||||||
|  |             Left(word_bytes) => { | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 word_bytes | ||||||
|  |             } | ||||||
|  |             Right(word_bytes) => { | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 word_bytes | ||||||
|  |             } | ||||||
|  |             Both(word_bytes, _) => { | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 word_bytes | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         key_buffer.clear(); | ||||||
|  |         key_buffer.extend_from_slice(word_bytes); | ||||||
|  |         key_buffer.push(0); | ||||||
|  |         key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||||
|  |         word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|   | |||||||
| @@ -1,51 +0,0 @@ | |||||||
| use std::fs::File; |  | ||||||
| use std::io::{self, BufReader}; |  | ||||||
|  |  | ||||||
| use super::helpers::{ |  | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, |  | ||||||
|     try_split_array_at, GrenadParameters, |  | ||||||
| }; |  | ||||||
| use crate::error::SerializationError; |  | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; |  | ||||||
| use crate::{relative_from_absolute_position, DocumentId, Result}; |  | ||||||
|  |  | ||||||
| /// Extracts the word, field id, and the documents ids where this word appear at this field id. |  | ||||||
| #[logging_timer::time] |  | ||||||
| pub fn extract_word_fid_docids<R: io::Read + io::Seek>( |  | ||||||
|     docid_word_positions: grenad::Reader<R>, |  | ||||||
|     indexer: GrenadParameters, |  | ||||||
| ) -> Result<grenad::Reader<BufReader<File>>> { |  | ||||||
|     puffin::profile_function!(); |  | ||||||
|  |  | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |  | ||||||
|  |  | ||||||
|     let mut word_fid_docids_sorter = create_sorter( |  | ||||||
|         grenad::SortAlgorithm::Unstable, |  | ||||||
|         merge_cbo_roaring_bitmaps, |  | ||||||
|         indexer.chunk_compression_type, |  | ||||||
|         indexer.chunk_compression_level, |  | ||||||
|         indexer.max_nb_chunks, |  | ||||||
|         max_memory, |  | ||||||
|     ); |  | ||||||
|  |  | ||||||
|     let mut key_buffer = Vec::new(); |  | ||||||
|     let mut cursor = docid_word_positions.into_cursor()?; |  | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |  | ||||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) |  | ||||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; |  | ||||||
|         let document_id = DocumentId::from_be_bytes(document_id_bytes); |  | ||||||
|  |  | ||||||
|         for position in read_u32_ne_bytes(value) { |  | ||||||
|             key_buffer.clear(); |  | ||||||
|             key_buffer.extend_from_slice(word_bytes); |  | ||||||
|             key_buffer.push(0); |  | ||||||
|             let (fid, _) = relative_from_absolute_position(position); |  | ||||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); |  | ||||||
|             word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; |  | ||||||
|  |  | ||||||
|     Ok(word_fid_docids_reader) |  | ||||||
| } |  | ||||||
| @@ -1,16 +1,18 @@ | |||||||
| use std::cmp::Ordering; | use std::collections::{BTreeMap, VecDeque}; | ||||||
| use std::collections::{BinaryHeap, HashMap}; |  | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
| use std::{cmp, io, mem, str, vec}; | use std::{cmp, io}; | ||||||
|  |  | ||||||
|  | use obkv::KvReaderU16; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, |     create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, | ||||||
|     try_split_array_at, GrenadParameters, MergeFn, |     writer_into_reader, GrenadParameters, MergeFn, | ||||||
| }; | }; | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||||
| use crate::proximity::{positions_proximity, MAX_DISTANCE}; | use crate::proximity::{index_proximity, MAX_DISTANCE}; | ||||||
|  | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::{DocumentId, Result}; | use crate::{DocumentId, Result}; | ||||||
|  |  | ||||||
| /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. | /// Extracts the best proximity between pairs of words and the documents ids where this pair appear. | ||||||
| @@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |  | ||||||
|     let mut word_pair_proximity_docids_sorter = create_sorter( |     let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) | ||||||
|         grenad::SortAlgorithm::Unstable, |         .map(|_| { | ||||||
|         merge_cbo_roaring_bitmaps, |             create_sorter( | ||||||
|         indexer.chunk_compression_type, |                 grenad::SortAlgorithm::Unstable, | ||||||
|         indexer.chunk_compression_level, |                 merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.max_nb_chunks, |                 indexer.chunk_compression_type, | ||||||
|         max_memory.map(|m| m / 2), |                 indexer.chunk_compression_level, | ||||||
|     ); |                 indexer.max_nb_chunks, | ||||||
|  |                 max_memory.map(|m| m / MAX_DISTANCE as usize), | ||||||
|  |             ) | ||||||
|  |         }) | ||||||
|  |         .collect(); | ||||||
|  |  | ||||||
|     // This map is assumed to not consume a lot of memory. |     let mut del_word_positions: VecDeque<(String, u16)> = | ||||||
|     let mut document_word_positions_heap = BinaryHeap::new(); |         VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||||
|  |     let mut add_word_positions: VecDeque<(String, u16)> = | ||||||
|  |         VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||||
|  |     let mut del_word_pair_proximity = BTreeMap::new(); | ||||||
|  |     let mut add_word_pair_proximity = BTreeMap::new(); | ||||||
|     let mut current_document_id = None; |     let mut current_document_id = None; | ||||||
|  |  | ||||||
|     let mut cursor = docid_word_positions.into_cursor()?; |     let mut cursor = docid_word_positions.into_cursor()?; | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) |         let (document_id_bytes, _fid_bytes) = try_split_array_at(key) | ||||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|         let document_id = u32::from_be_bytes(document_id_bytes); |         let document_id = u32::from_be_bytes(document_id_bytes); | ||||||
|         let word = str::from_utf8(word_bytes)?; |  | ||||||
|  |  | ||||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); |         // if we change document, we fill the sorter | ||||||
|         if curr_document_id != document_id { |         if current_document_id.map_or(false, |id| id != document_id) { | ||||||
|             let document_word_positions_heap = mem::take(&mut document_word_positions_heap); |             puffin::profile_scope!("Document into sorter"); | ||||||
|  |  | ||||||
|             document_word_positions_into_sorter( |             document_word_positions_into_sorter( | ||||||
|                 curr_document_id, |                 current_document_id.unwrap(), | ||||||
|                 document_word_positions_heap, |                 &del_word_pair_proximity, | ||||||
|                 &mut word_pair_proximity_docids_sorter, |                 &add_word_pair_proximity, | ||||||
|  |                 &mut word_pair_proximity_docids_sorters, | ||||||
|             )?; |             )?; | ||||||
|             current_document_id = Some(document_id); |             del_word_pair_proximity.clear(); | ||||||
|  |             add_word_pair_proximity.clear(); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let word = word.to_string(); |         current_document_id = Some(document_id); | ||||||
|         let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); |  | ||||||
|         positions.sort_unstable(); |         let (del, add): (Result<_>, Result<_>) = rayon::join( | ||||||
|         let mut iter = positions.into_iter(); |             || { | ||||||
|         if let Some(position) = iter.next() { |                 // deletions | ||||||
|             document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); |                 if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { | ||||||
|         } |                     for (position, word) in KvReaderU16::new(deletion).iter() { | ||||||
|  |                         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||||
|  |                         while del_word_positions.get(0).map_or(false, |(_w, p)| { | ||||||
|  |                             index_proximity(*p as u32, position as u32) >= MAX_DISTANCE | ||||||
|  |                         }) { | ||||||
|  |                             word_positions_into_word_pair_proximity( | ||||||
|  |                                 &mut del_word_positions, | ||||||
|  |                                 &mut del_word_pair_proximity, | ||||||
|  |                             )?; | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // insert the new word. | ||||||
|  |                         let word = std::str::from_utf8(word)?; | ||||||
|  |                         del_word_positions.push_back((word.to_string(), position)); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     while !del_word_positions.is_empty() { | ||||||
|  |                         word_positions_into_word_pair_proximity( | ||||||
|  |                             &mut del_word_positions, | ||||||
|  |                             &mut del_word_pair_proximity, | ||||||
|  |                         )?; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 Ok(()) | ||||||
|  |             }, | ||||||
|  |             || { | ||||||
|  |                 // additions | ||||||
|  |                 if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { | ||||||
|  |                     for (position, word) in KvReaderU16::new(addition).iter() { | ||||||
|  |                         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||||
|  |                         while add_word_positions.get(0).map_or(false, |(_w, p)| { | ||||||
|  |                             index_proximity(*p as u32, position as u32) >= MAX_DISTANCE | ||||||
|  |                         }) { | ||||||
|  |                             word_positions_into_word_pair_proximity( | ||||||
|  |                                 &mut add_word_positions, | ||||||
|  |                                 &mut add_word_pair_proximity, | ||||||
|  |                             )?; | ||||||
|  |                         } | ||||||
|  |  | ||||||
|  |                         // insert the new word. | ||||||
|  |                         let word = std::str::from_utf8(word)?; | ||||||
|  |                         add_word_positions.push_back((word.to_string(), position)); | ||||||
|  |                     } | ||||||
|  |  | ||||||
|  |                     while !add_word_positions.is_empty() { | ||||||
|  |                         word_positions_into_word_pair_proximity( | ||||||
|  |                             &mut add_word_positions, | ||||||
|  |                             &mut add_word_pair_proximity, | ||||||
|  |                         )?; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 Ok(()) | ||||||
|  |             }, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         del?; | ||||||
|  |         add?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if let Some(document_id) = current_document_id { |     if let Some(document_id) = current_document_id { | ||||||
|         // We must make sure that don't lose the current document field id |         puffin::profile_scope!("Final document into sorter"); | ||||||
|         // word count map if we break because we reached the end of the chunk. |  | ||||||
|         let document_word_positions_heap = mem::take(&mut document_word_positions_heap); |  | ||||||
|         document_word_positions_into_sorter( |         document_word_positions_into_sorter( | ||||||
|             document_id, |             document_id, | ||||||
|             document_word_positions_heap, |             &del_word_pair_proximity, | ||||||
|             &mut word_pair_proximity_docids_sorter, |             &add_word_pair_proximity, | ||||||
|  |             &mut word_pair_proximity_docids_sorters, | ||||||
|         )?; |         )?; | ||||||
|     } |     } | ||||||
|  |     { | ||||||
|  |         puffin::profile_scope!("sorter_into_reader"); | ||||||
|  |         let mut writer = create_writer( | ||||||
|  |             indexer.chunk_compression_type, | ||||||
|  |             indexer.chunk_compression_level, | ||||||
|  |             tempfile::tempfile()?, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|     sorter_into_reader(word_pair_proximity_docids_sorter, indexer) |         for sorter in word_pair_proximity_docids_sorters { | ||||||
|  |             sorter.write_into_stream_writer(&mut writer)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         writer_into_reader(writer) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. | /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. | ||||||
| @@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>( | |||||||
| /// close to each other. | /// close to each other. | ||||||
| fn document_word_positions_into_sorter( | fn document_word_positions_into_sorter( | ||||||
|     document_id: DocumentId, |     document_id: DocumentId, | ||||||
|     mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>, |     del_word_pair_proximity: &BTreeMap<(String, String), u8>, | ||||||
|     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>, |     add_word_pair_proximity: &BTreeMap<(String, String), u8>, | ||||||
|  |     word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>], | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let mut word_pair_proximity = HashMap::new(); |     use itertools::merge_join_by; | ||||||
|     let mut ordered_peeked_word_positions = Vec::new(); |     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||||
|     while !word_positions_heap.is_empty() { |  | ||||||
|         while let Some(peeked_word_position) = word_positions_heap.pop() { |  | ||||||
|             ordered_peeked_word_positions.push(peeked_word_position); |  | ||||||
|             if ordered_peeked_word_positions.len() == 7 { |  | ||||||
|                 break; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { |  | ||||||
|             for PeekedWordPosition { word, position, .. } in tail { |  | ||||||
|                 let prox = positions_proximity(head.position, *position); |  | ||||||
|                 if prox > 0 && prox < MAX_DISTANCE { |  | ||||||
|                     word_pair_proximity |  | ||||||
|                         .entry((head.word.clone(), word.clone())) |  | ||||||
|                         .and_modify(|p| { |  | ||||||
|                             *p = cmp::min(*p, prox); |  | ||||||
|                         }) |  | ||||||
|                         .or_insert(prox); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             // Push the tail in the heap. |  | ||||||
|             let tail_iter = ordered_peeked_word_positions.drain(1..); |  | ||||||
|             word_positions_heap.extend(tail_iter); |  | ||||||
|  |  | ||||||
|             // Advance the head and push it in the heap. |  | ||||||
|             if let Some(mut head) = ordered_peeked_word_positions.pop() { |  | ||||||
|                 if let Some(next_position) = head.iter.next() { |  | ||||||
|                     let prox = positions_proximity(head.position, next_position); |  | ||||||
|  |  | ||||||
|                     if prox > 0 && prox < MAX_DISTANCE { |  | ||||||
|                         word_pair_proximity |  | ||||||
|                             .entry((head.word.clone(), head.word.clone())) |  | ||||||
|                             .and_modify(|p| { |  | ||||||
|                                 *p = cmp::min(*p, prox); |  | ||||||
|                             }) |  | ||||||
|                             .or_insert(prox); |  | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     word_positions_heap.push(PeekedWordPosition { |  | ||||||
|                         word: head.word, |  | ||||||
|                         position: next_position, |  | ||||||
|                         iter: head.iter, |  | ||||||
|                     }); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|     let mut key_buffer = Vec::new(); |     let mut key_buffer = Vec::new(); | ||||||
|     for ((w1, w2), prox) in word_pair_proximity { |     for eob in | ||||||
|  |         merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { | ||||||
|  |             d.cmp(a) | ||||||
|  |         }) | ||||||
|  |     { | ||||||
|  |         buffer.clear(); | ||||||
|  |         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         let ((w1, w2), prox) = match eob { | ||||||
|  |             Left(key_value) => { | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key_value | ||||||
|  |             } | ||||||
|  |             Right(key_value) => { | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key_value | ||||||
|  |             } | ||||||
|  |             Both(key_value, _) => { | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key_value | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         key_buffer.clear(); |         key_buffer.clear(); | ||||||
|         key_buffer.push(prox as u8); |         key_buffer.push(*prox); | ||||||
|         key_buffer.extend_from_slice(w1.as_bytes()); |         key_buffer.extend_from_slice(w1.as_bytes()); | ||||||
|         key_buffer.push(0); |         key_buffer.push(0); | ||||||
|         key_buffer.extend_from_slice(w2.as_bytes()); |         key_buffer.extend_from_slice(w2.as_bytes()); | ||||||
|  |  | ||||||
|         word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; |         word_pair_proximity_docids_sorters[*prox as usize - 1] | ||||||
|  |             .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| struct PeekedWordPosition<I> { | fn word_positions_into_word_pair_proximity( | ||||||
|     word: String, |     word_positions: &mut VecDeque<(String, u16)>, | ||||||
|     position: u32, |     word_pair_proximity: &mut BTreeMap<(String, String), u8>, | ||||||
|     iter: I, | ) -> Result<()> { | ||||||
| } |     let (head_word, head_position) = word_positions.pop_front().unwrap(); | ||||||
|  |     for (word, position) in word_positions.iter() { | ||||||
| impl<I> Ord for PeekedWordPosition<I> { |         let prox = index_proximity(head_position as u32, *position as u32) as u8; | ||||||
|     fn cmp(&self, other: &Self) -> Ordering { |         if prox > 0 && prox < MAX_DISTANCE as u8 { | ||||||
|         self.position.cmp(&other.position).reverse() |             word_pair_proximity | ||||||
|     } |                 .entry((head_word.clone(), word.clone())) | ||||||
| } |                 .and_modify(|p| { | ||||||
|  |                     *p = cmp::min(*p, prox); | ||||||
| impl<I> PartialOrd for PeekedWordPosition<I> { |                 }) | ||||||
|     fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |                 .or_insert(prox); | ||||||
|         Some(self.cmp(other)) |         } | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<I> Eq for PeekedWordPosition<I> {} |  | ||||||
|  |  | ||||||
| impl<I> PartialEq for PeekedWordPosition<I> { |  | ||||||
|     fn eq(&self, other: &Self) -> bool { |  | ||||||
|         self.position == other.position |  | ||||||
|     } |     } | ||||||
|  |     Ok(()) | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,13 +1,18 @@ | |||||||
|  | use std::collections::BTreeSet; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
|  |  | ||||||
|  | use obkv::KvReaderU16; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, |     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, | ||||||
|     try_split_array_at, GrenadParameters, |     GrenadParameters, | ||||||
| }; | }; | ||||||
| use crate::error::SerializationError; | use crate::error::SerializationError; | ||||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||||
| use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
|  | use crate::update::MergeFn; | ||||||
|  | use crate::{bucketed_position, DocumentId, Result}; | ||||||
|  |  | ||||||
| /// Extracts the word positions and the documents ids where this word appear. | /// Extracts the word positions and the documents ids where this word appear. | ||||||
| /// | /// | ||||||
| @@ -24,32 +29,111 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     let mut word_position_docids_sorter = create_sorter( |     let mut word_position_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Unstable, |         grenad::SortAlgorithm::Unstable, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
|         indexer.max_nb_chunks, |         indexer.max_nb_chunks, | ||||||
|         max_memory, |         max_memory, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|  |     let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new(); | ||||||
|  |     let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new(); | ||||||
|  |     let mut current_document_id: Option<u32> = None; | ||||||
|     let mut key_buffer = Vec::new(); |     let mut key_buffer = Vec::new(); | ||||||
|     let mut cursor = docid_word_positions.into_cursor()?; |     let mut cursor = docid_word_positions.into_cursor()?; | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) |         let (document_id_bytes, _fid_bytes) = try_split_array_at(key) | ||||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; |             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||||
|         let document_id = DocumentId::from_be_bytes(document_id_bytes); |         let document_id = DocumentId::from_be_bytes(document_id_bytes); | ||||||
|  |  | ||||||
|         for position in read_u32_ne_bytes(value) { |         if current_document_id.map_or(false, |id| document_id != id) { | ||||||
|             key_buffer.clear(); |             words_position_into_sorter( | ||||||
|             key_buffer.extend_from_slice(word_bytes); |                 current_document_id.unwrap(), | ||||||
|             key_buffer.push(0); |                 &mut key_buffer, | ||||||
|             let (_, position) = relative_from_absolute_position(position); |                 &del_word_positions, | ||||||
|             let position = bucketed_position(position); |                 &add_word_positions, | ||||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); |                 &mut word_position_docids_sorter, | ||||||
|             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; |             )?; | ||||||
|  |             del_word_positions.clear(); | ||||||
|  |             add_word_positions.clear(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         current_document_id = Some(document_id); | ||||||
|  |  | ||||||
|  |         let del_add_reader = KvReaderDelAdd::new(value); | ||||||
|  |         // extract all unique words to remove. | ||||||
|  |         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { | ||||||
|  |             for (position, word_bytes) in KvReaderU16::new(deletion).iter() { | ||||||
|  |                 let position = bucketed_position(position); | ||||||
|  |                 del_word_positions.insert((position, word_bytes.to_vec())); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // extract all unique additional words. | ||||||
|  |         if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||||
|  |             for (position, word_bytes) in KvReaderU16::new(addition).iter() { | ||||||
|  |                 let position = bucketed_position(position); | ||||||
|  |                 add_word_positions.insert((position, word_bytes.to_vec())); | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     if let Some(document_id) = current_document_id { | ||||||
|  |         words_position_into_sorter( | ||||||
|  |             document_id, | ||||||
|  |             &mut key_buffer, | ||||||
|  |             &del_word_positions, | ||||||
|  |             &add_word_positions, | ||||||
|  |             &mut word_position_docids_sorter, | ||||||
|  |         )?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // TODO remove noop DelAdd OBKV | ||||||
|     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; |     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; | ||||||
|  |  | ||||||
|     Ok(word_position_docids_reader) |     Ok(word_position_docids_reader) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn words_position_into_sorter( | ||||||
|  |     document_id: DocumentId, | ||||||
|  |     key_buffer: &mut Vec<u8>, | ||||||
|  |     del_word_positions: &BTreeSet<(u16, Vec<u8>)>, | ||||||
|  |     add_word_positions: &BTreeSet<(u16, Vec<u8>)>, | ||||||
|  |     word_position_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||||
|  | ) -> Result<()> { | ||||||
|  |     puffin::profile_function!(); | ||||||
|  |  | ||||||
|  |     use itertools::merge_join_by; | ||||||
|  |     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||||
|  |  | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|  |     for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a)) | ||||||
|  |     { | ||||||
|  |         buffer.clear(); | ||||||
|  |         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |         let (position, word_bytes) = match eob { | ||||||
|  |             Left(key) => { | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key | ||||||
|  |             } | ||||||
|  |             Right(key) => { | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key | ||||||
|  |             } | ||||||
|  |             Both(key, _) => { | ||||||
|  |                 // both values needs to be kept because it will be used in other extractors. | ||||||
|  |                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||||
|  |                 key | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         key_buffer.clear(); | ||||||
|  |         key_buffer.extend_from_slice(word_bytes); | ||||||
|  |         key_buffer.push(0); | ||||||
|  |         key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||||
|  |         word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|   | |||||||
| @@ -6,7 +6,6 @@ mod extract_fid_word_count_docids; | |||||||
| mod extract_geo_points; | mod extract_geo_points; | ||||||
| mod extract_vector_points; | mod extract_vector_points; | ||||||
| mod extract_word_docids; | mod extract_word_docids; | ||||||
| mod extract_word_fid_docids; |  | ||||||
| mod extract_word_pair_proximity_docids; | mod extract_word_pair_proximity_docids; | ||||||
| mod extract_word_position_docids; | mod extract_word_position_docids; | ||||||
|  |  | ||||||
| @@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; | |||||||
| use self::extract_geo_points::extract_geo_points; | use self::extract_geo_points::extract_geo_points; | ||||||
| use self::extract_vector_points::extract_vector_points; | use self::extract_vector_points::extract_vector_points; | ||||||
| use self::extract_word_docids::extract_word_docids; | use self::extract_word_docids::extract_word_docids; | ||||||
| use self::extract_word_fid_docids::extract_word_fid_docids; |  | ||||||
| use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | ||||||
| use self::extract_word_position_docids::extract_word_position_docids; | use self::extract_word_position_docids::extract_word_position_docids; | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, |     as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, | ||||||
|     GrenadParameters, MergeFn, MergeableReader, |     MergeFn, MergeableReader, | ||||||
| }; | }; | ||||||
| use super::{helpers, TypedChunk}; | use super::{helpers, TypedChunk}; | ||||||
| use crate::{FieldId, Result}; | use crate::{FieldId, Result}; | ||||||
| @@ -65,7 +63,6 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                 indexer, |                 indexer, | ||||||
|                 lmdb_writer_sx.clone(), |                 lmdb_writer_sx.clone(), | ||||||
|                 vectors_field_id, |                 vectors_field_id, | ||||||
|                 primary_key_id, |  | ||||||
|             ) |             ) | ||||||
|         }) |         }) | ||||||
|         .collect::<Result<()>>()?; |         .collect::<Result<()>>()?; | ||||||
| @@ -94,9 +91,9 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|     let ( |     let ( | ||||||
|         docid_word_positions_chunks, |         docid_word_positions_chunks, | ||||||
|         ( |         ( | ||||||
|             docid_fid_facet_numbers_chunks, |             fid_docid_facet_numbers_chunks, | ||||||
|             ( |             ( | ||||||
|                 docid_fid_facet_strings_chunks, |                 fid_docid_facet_strings_chunks, | ||||||
|                 ( |                 ( | ||||||
|                     facet_is_null_docids_chunks, |                     facet_is_null_docids_chunks, | ||||||
|                     (facet_is_empty_docids_chunks, facet_exists_docids_chunks), |                     (facet_is_empty_docids_chunks, facet_exists_docids_chunks), | ||||||
| @@ -110,7 +107,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); |         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|             debug!("merge {} database", "facet-id-exists-docids"); |             debug!("merge {} database", "facet-id-exists-docids"); | ||||||
|             match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { |             match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||||
|                 Ok(reader) => { |                 Ok(reader) => { | ||||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); |                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); | ||||||
|                 } |                 } | ||||||
| @@ -126,7 +123,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); |         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|             debug!("merge {} database", "facet-id-is-null-docids"); |             debug!("merge {} database", "facet-id-is-null-docids"); | ||||||
|             match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { |             match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||||
|                 Ok(reader) => { |                 Ok(reader) => { | ||||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); |                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); | ||||||
|                 } |                 } | ||||||
| @@ -142,7 +139,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); |         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|             debug!("merge {} database", "facet-id-is-empty-docids"); |             debug!("merge {} database", "facet-id-is-empty-docids"); | ||||||
|             match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { |             match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||||
|                 Ok(reader) => { |                 Ok(reader) => { | ||||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); |                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); | ||||||
|                 } |                 } | ||||||
| @@ -158,7 +155,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         extract_word_pair_proximity_docids, |         extract_word_pair_proximity_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::WordPairProximityDocids, |         TypedChunk::WordPairProximityDocids, | ||||||
|         "word-pair-proximity-docids", |         "word-pair-proximity-docids", | ||||||
|     ); |     ); | ||||||
| @@ -168,24 +165,31 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         extract_fid_word_count_docids, |         extract_fid_word_count_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::FieldIdWordcountDocids, |         TypedChunk::FieldIdWordCountDocids, | ||||||
|         "field-id-wordcount-docids", |         "field-id-wordcount-docids", | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     spawn_extraction_task::< |     spawn_extraction_task::< | ||||||
|         _, |         _, | ||||||
|         _, |         _, | ||||||
|         Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>, |         Vec<( | ||||||
|  |             grenad::Reader<BufReader<File>>, | ||||||
|  |             grenad::Reader<BufReader<File>>, | ||||||
|  |             grenad::Reader<BufReader<File>>, | ||||||
|  |         )>, | ||||||
|     >( |     >( | ||||||
|         docid_word_positions_chunks.clone(), |         docid_word_positions_chunks.clone(), | ||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), |         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), | ||||||
|         merge_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { |         |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { | ||||||
|             word_docids_reader, |             TypedChunk::WordDocids { | ||||||
|             exact_word_docids_reader, |                 word_docids_reader, | ||||||
|  |                 exact_word_docids_reader, | ||||||
|  |                 word_fid_docids_reader, | ||||||
|  |             } | ||||||
|         }, |         }, | ||||||
|         "word-docids", |         "word-docids", | ||||||
|     ); |     ); | ||||||
| @@ -195,36 +199,27 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         extract_word_position_docids, |         extract_word_position_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::WordPositionDocids, |         TypedChunk::WordPositionDocids, | ||||||
|         "word-position-docids", |         "word-position-docids", | ||||||
|     ); |     ); | ||||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( |  | ||||||
|         docid_word_positions_chunks, |  | ||||||
|         indexer, |  | ||||||
|         lmdb_writer_sx.clone(), |  | ||||||
|         extract_word_fid_docids, |  | ||||||
|         merge_cbo_roaring_bitmaps, |  | ||||||
|         TypedChunk::WordFidDocids, |  | ||||||
|         "word-fid-docids", |  | ||||||
|     ); |  | ||||||
|  |  | ||||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( |     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||||
|         docid_fid_facet_strings_chunks, |         fid_docid_facet_strings_chunks, | ||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx.clone(), |         lmdb_writer_sx.clone(), | ||||||
|         extract_facet_string_docids, |         extract_facet_string_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::FieldIdFacetStringDocids, |         TypedChunk::FieldIdFacetStringDocids, | ||||||
|         "field-id-facet-string-docids", |         "field-id-facet-string-docids", | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( |     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||||
|         docid_fid_facet_numbers_chunks, |         fid_docid_facet_numbers_chunks, | ||||||
|         indexer, |         indexer, | ||||||
|         lmdb_writer_sx, |         lmdb_writer_sx, | ||||||
|         extract_facet_number_docids, |         extract_facet_number_docids, | ||||||
|         merge_cbo_roaring_bitmaps, |         merge_deladd_cbo_roaring_bitmaps, | ||||||
|         TypedChunk::FieldIdFacetNumberDocids, |         TypedChunk::FieldIdFacetNumberDocids, | ||||||
|         "field-id-facet-number-docids", |         "field-id-facet-number-docids", | ||||||
|     ); |     ); | ||||||
| @@ -278,7 +273,6 @@ fn send_original_documents_data( | |||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     vectors_field_id: Option<FieldId>, |     vectors_field_id: Option<FieldId>, | ||||||
|     primary_key_id: FieldId, |  | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let original_documents_chunk = |     let original_documents_chunk = | ||||||
|         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; |         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||||
| @@ -287,12 +281,7 @@ fn send_original_documents_data( | |||||||
|         let documents_chunk_cloned = original_documents_chunk.clone(); |         let documents_chunk_cloned = original_documents_chunk.clone(); | ||||||
|         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); |         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|             let result = extract_vector_points( |             let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id); | ||||||
|                 documents_chunk_cloned, |  | ||||||
|                 indexer, |  | ||||||
|                 primary_key_id, |  | ||||||
|                 vectors_field_id, |  | ||||||
|             ); |  | ||||||
|             let _ = match result { |             let _ = match result { | ||||||
|                 Ok(vector_points) => { |                 Ok(vector_points) => { | ||||||
|                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) |                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) | ||||||
| @@ -356,10 +345,10 @@ fn send_and_extract_flattened_documents_data( | |||||||
|         }); |         }); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = |     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = | ||||||
|         rayon::join( |         rayon::join( | ||||||
|             || { |             || { | ||||||
|                 let (documents_ids, docid_word_positions_chunk, script_language_pair) = |                 let (docid_word_positions_chunk, script_language_pair) = | ||||||
|                     extract_docid_word_positions( |                     extract_docid_word_positions( | ||||||
|                         flattened_documents_chunk.clone(), |                         flattened_documents_chunk.clone(), | ||||||
|                         indexer, |                         indexer, | ||||||
| @@ -370,9 +359,6 @@ fn send_and_extract_flattened_documents_data( | |||||||
|                         max_positions_per_attributes, |                         max_positions_per_attributes, | ||||||
|                     )?; |                     )?; | ||||||
|  |  | ||||||
|                 // send documents_ids to DB writer |  | ||||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); |  | ||||||
|  |  | ||||||
|                 // send docid_word_positions_chunk to DB writer |                 // send docid_word_positions_chunk to DB writer | ||||||
|                 let docid_word_positions_chunk = |                 let docid_word_positions_chunk = | ||||||
|                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; |                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; | ||||||
| @@ -384,8 +370,8 @@ fn send_and_extract_flattened_documents_data( | |||||||
|             }, |             }, | ||||||
|             || { |             || { | ||||||
|                 let ExtractedFacetValues { |                 let ExtractedFacetValues { | ||||||
|                     docid_fid_facet_numbers_chunk, |                     fid_docid_facet_numbers_chunk, | ||||||
|                     docid_fid_facet_strings_chunk, |                     fid_docid_facet_strings_chunk, | ||||||
|                     fid_facet_is_null_docids_chunk, |                     fid_facet_is_null_docids_chunk, | ||||||
|                     fid_facet_is_empty_docids_chunk, |                     fid_facet_is_empty_docids_chunk, | ||||||
|                     fid_facet_exists_docids_chunk, |                     fid_facet_exists_docids_chunk, | ||||||
| @@ -396,26 +382,26 @@ fn send_and_extract_flattened_documents_data( | |||||||
|                     geo_fields_ids, |                     geo_fields_ids, | ||||||
|                 )?; |                 )?; | ||||||
|  |  | ||||||
|                 // send docid_fid_facet_numbers_chunk to DB writer |                 // send fid_docid_facet_numbers_chunk to DB writer | ||||||
|                 let docid_fid_facet_numbers_chunk = |                 let fid_docid_facet_numbers_chunk = | ||||||
|                     unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; |                     unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? }; | ||||||
|  |  | ||||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( |                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( | ||||||
|                     docid_fid_facet_numbers_chunk.clone(), |                     fid_docid_facet_numbers_chunk.clone(), | ||||||
|                 ))); |                 ))); | ||||||
|  |  | ||||||
|                 // send docid_fid_facet_strings_chunk to DB writer |                 // send fid_docid_facet_strings_chunk to DB writer | ||||||
|                 let docid_fid_facet_strings_chunk = |                 let fid_docid_facet_strings_chunk = | ||||||
|                     unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; |                     unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? }; | ||||||
|  |  | ||||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( |                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( | ||||||
|                     docid_fid_facet_strings_chunk.clone(), |                     fid_docid_facet_strings_chunk.clone(), | ||||||
|                 ))); |                 ))); | ||||||
|  |  | ||||||
|                 Ok(( |                 Ok(( | ||||||
|                     docid_fid_facet_numbers_chunk, |                     fid_docid_facet_numbers_chunk, | ||||||
|                     ( |                     ( | ||||||
|                         docid_fid_facet_strings_chunk, |                         fid_docid_facet_strings_chunk, | ||||||
|                         ( |                         ( | ||||||
|                             fid_facet_is_null_docids_chunk, |                             fid_facet_is_null_docids_chunk, | ||||||
|                             (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), |                             (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), | ||||||
| @@ -425,5 +411,5 @@ fn send_and_extract_flattened_documents_data( | |||||||
|             }, |             }, | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|     Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) |     Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?)) | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,14 +1,12 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader, BufWriter, Seek}; | use std::io::{self, BufReader, BufWriter, Seek}; | ||||||
| use std::time::Instant; |  | ||||||
|  |  | ||||||
| use grenad::{CompressionType, Sorter}; | use grenad::{CompressionType, Sorter}; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use log::debug; |  | ||||||
|  |  | ||||||
| use super::{ClonableMmap, MergeFn}; | use super::{ClonableMmap, MergeFn}; | ||||||
| use crate::error::InternalError; | use crate::update::index_documents::valid_lmdb_key; | ||||||
| use crate::Result; | use crate::Result; | ||||||
|  |  | ||||||
| pub type CursorClonableMmap = io::Cursor<ClonableMmap>; | pub type CursorClonableMmap = io::Cursor<ClonableMmap>; | ||||||
| @@ -47,6 +45,7 @@ pub fn create_sorter( | |||||||
|         builder.allow_realloc(false); |         builder.allow_realloc(false); | ||||||
|     } |     } | ||||||
|     builder.sort_algorithm(sort_algorithm); |     builder.sort_algorithm(sort_algorithm); | ||||||
|  |     builder.sort_in_parallel(true); | ||||||
|     builder.build() |     builder.build() | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -54,6 +53,7 @@ pub fn sorter_into_reader( | |||||||
|     sorter: grenad::Sorter<MergeFn>, |     sorter: grenad::Sorter<MergeFn>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ) -> Result<grenad::Reader<BufReader<File>>> { | ||||||
|  |     puffin::profile_function!(); | ||||||
|     let mut writer = create_writer( |     let mut writer = create_writer( | ||||||
|         indexer.chunk_compression_type, |         indexer.chunk_compression_type, | ||||||
|         indexer.chunk_compression_level, |         indexer.chunk_compression_level, | ||||||
| @@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl MergeableReader | ||||||
|  |     for Vec<( | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |     )> | ||||||
|  | { | ||||||
|  |     type Output = ( | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |         grenad::Reader<BufReader<File>>, | ||||||
|  |     ); | ||||||
|  |  | ||||||
|  |     fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> { | ||||||
|  |         let mut m1 = MergerBuilder::new(merge_fn); | ||||||
|  |         let mut m2 = MergerBuilder::new(merge_fn); | ||||||
|  |         let mut m3 = MergerBuilder::new(merge_fn); | ||||||
|  |         for (r1, r2, r3) in self.into_iter() { | ||||||
|  |             m1.push(r1)?; | ||||||
|  |             m2.push(r2)?; | ||||||
|  |             m3.push(r3)?; | ||||||
|  |         } | ||||||
|  |         Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>); | struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>); | ||||||
|  |  | ||||||
| impl<R: io::Read + io::Seek> MergerBuilder<R> { | impl<R: io::Read + io::Seek> MergerBuilder<R> { | ||||||
| @@ -195,11 +221,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>( | |||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         while let Some((document_id, obkv)) = cursor.move_on_next()? { |         while let Some((document_id, obkv)) = cursor.move_on_next()? { | ||||||
|             obkv_documents.insert(document_id, obkv)?; |             if !obkv.is_empty() { | ||||||
|             current_chunk_size += document_id.len() as u64 + obkv.len() as u64; |                 obkv_documents.insert(document_id, obkv)?; | ||||||
|  |                 current_chunk_size += document_id.len() as u64 + obkv.len() as u64; | ||||||
|  |  | ||||||
|             if current_chunk_size >= documents_chunk_size as u64 { |                 if current_chunk_size >= documents_chunk_size as u64 { | ||||||
|                 return writer_into_reader(obkv_documents).map(Some); |                     return writer_into_reader(obkv_documents).map(Some); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>( | |||||||
|     Ok(std::iter::from_fn(move || transposer().transpose())) |     Ok(std::iter::from_fn(move || transposer().transpose())) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn sorter_into_lmdb_database( | /// Write provided sorter in database using serialize_value function. | ||||||
|     wtxn: &mut heed::RwTxn, | /// merge_values function is used if an entry already exist in the database. | ||||||
|     database: heed::PolyDatabase, | pub fn write_sorter_into_database<K, V, FS, FM>( | ||||||
|     sorter: Sorter<MergeFn>, |     sorter: Sorter<MergeFn>, | ||||||
|     merge: MergeFn, |     database: &heed::Database<K, V>, | ||||||
| ) -> Result<()> { |     wtxn: &mut heed::RwTxn, | ||||||
|  |     index_is_empty: bool, | ||||||
|  |     serialize_value: FS, | ||||||
|  |     merge_values: FM, | ||||||
|  | ) -> Result<()> | ||||||
|  | where | ||||||
|  |     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||||
|  |     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||||
|  | { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
|     debug!("Writing MTBL sorter..."); |  | ||||||
|     let before = Instant::now(); |     let mut buffer = Vec::new(); | ||||||
|  |     let database = database.remap_types::<ByteSlice, ByteSlice>(); | ||||||
|  |  | ||||||
|     let mut merger_iter = sorter.into_stream_merger_iter()?; |     let mut merger_iter = sorter.into_stream_merger_iter()?; | ||||||
|     if database.is_empty(wtxn)? { |     while let Some((key, value)) = merger_iter.next()? { | ||||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; |         if valid_lmdb_key(key) { | ||||||
|         while let Some((k, v)) = merger_iter.next()? { |             buffer.clear(); | ||||||
|             // safety: we don't keep references from inside the LMDB database. |             let value = if index_is_empty { | ||||||
|             unsafe { out_iter.append(k, v)? }; |                 Some(serialize_value(value, &mut buffer)?) | ||||||
|         } |             } else { | ||||||
|     } else { |                 match database.get(wtxn, key)? { | ||||||
|         while let Some((k, v)) = merger_iter.next()? { |                     Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, | ||||||
|             let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; |                     None => Some(serialize_value(value, &mut buffer)?), | ||||||
|             match iter.next().transpose()? { |  | ||||||
|                 Some((key, old_val)) if key == k => { |  | ||||||
|                     let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; |  | ||||||
|                     let val = merge(k, &vals).map_err(|_| { |  | ||||||
|                         // TODO just wrap this error? |  | ||||||
|                         InternalError::IndexingMergingKeys { process: "get-put-merge" } |  | ||||||
|                     })?; |  | ||||||
|                     // safety: we don't keep references from inside the LMDB database. |  | ||||||
|                     unsafe { iter.put_current(k, &val)? }; |  | ||||||
|                 } |                 } | ||||||
|                 _ => { |             }; | ||||||
|                     drop(iter); |             match value { | ||||||
|                     database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; |                 Some(value) => database.put(wtxn, key, value)?, | ||||||
|  |                 None => { | ||||||
|  |                     database.delete(wtxn, key)?; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); |  | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -6,22 +6,12 @@ use std::result::Result as StdResult; | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::heed_codec::CboRoaringBitmapCodec; | use crate::heed_codec::CboRoaringBitmapCodec; | ||||||
|  | use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||||
| use crate::update::index_documents::transform::Operation; | use crate::update::index_documents::transform::Operation; | ||||||
| use crate::Result; | use crate::Result; | ||||||
|  |  | ||||||
| pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>; | pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>; | ||||||
|  |  | ||||||
| pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> { |  | ||||||
|     if values.len() == 1 { |  | ||||||
|         Ok(values[0].clone()) |  | ||||||
|     } else { |  | ||||||
|         let capacity = values.iter().map(|v| v.len()).sum::<usize>(); |  | ||||||
|         let mut output = Vec::with_capacity(capacity); |  | ||||||
|         values.iter().for_each(|integers| output.extend_from_slice(integers)); |  | ||||||
|         Ok(Cow::Owned(output)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> { | pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> { | ||||||
|     buffer.clear(); |     buffer.clear(); | ||||||
|     buffer.reserve(bitmap.serialized_size()); |     buffer.reserve(bitmap.serialized_size()); | ||||||
| @@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow< | |||||||
|     Ok(obkvs.last().unwrap().clone()) |     Ok(obkvs.last().unwrap().clone()) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) { | pub fn merge_two_del_add_obkvs( | ||||||
|  |     base: obkv::KvReaderU16, | ||||||
|  |     update: obkv::KvReaderU16, | ||||||
|  |     merge_additions: bool, | ||||||
|  |     buffer: &mut Vec<u8>, | ||||||
|  | ) { | ||||||
|     use itertools::merge_join_by; |     use itertools::merge_join_by; | ||||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; |     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||||
|  |  | ||||||
|     buffer.clear(); |     buffer.clear(); | ||||||
|  |  | ||||||
|     let mut writer = obkv::KvWriter::new(buffer); |     let mut writer = obkv::KvWriter::new(buffer); | ||||||
|  |     let mut value_buffer = Vec::new(); | ||||||
|     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { |     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||||
|         match eob { |         match eob { | ||||||
|             Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), |             Left((k, v)) => { | ||||||
|  |                 if merge_additions { | ||||||
|  |                     writer.insert(k, v).unwrap() | ||||||
|  |                 } else { | ||||||
|  |                     // If merge_additions is false, recreate an obkv keeping the deletions only. | ||||||
|  |                     value_buffer.clear(); | ||||||
|  |                     let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                     let base_reader = KvReaderDelAdd::new(v); | ||||||
|  |  | ||||||
|  |                     if let Some(deletion) = base_reader.get(DelAdd::Deletion) { | ||||||
|  |                         value_writer.insert(DelAdd::Deletion, deletion).unwrap(); | ||||||
|  |                         value_writer.finish().unwrap(); | ||||||
|  |                         writer.insert(k, &value_buffer).unwrap() | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             Right((k, v)) => writer.insert(k, v).unwrap(), | ||||||
|  |             Both((k, base), (_, update)) => { | ||||||
|  |                 // merge deletions and additions. | ||||||
|  |                 value_buffer.clear(); | ||||||
|  |                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||||
|  |                 let base_reader = KvReaderDelAdd::new(base); | ||||||
|  |                 let update_reader = KvReaderDelAdd::new(update); | ||||||
|  |  | ||||||
|  |                 // keep newest deletion. | ||||||
|  |                 if let Some(deletion) = update_reader | ||||||
|  |                     .get(DelAdd::Deletion) | ||||||
|  |                     .or_else(|| base_reader.get(DelAdd::Deletion)) | ||||||
|  |                 { | ||||||
|  |                     value_writer.insert(DelAdd::Deletion, deletion).unwrap(); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // keep base addition only if merge_additions is true. | ||||||
|  |                 let base_addition = | ||||||
|  |                     merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); | ||||||
|  |                 // keep newest addition. | ||||||
|  |                 // TODO use or_else | ||||||
|  |                 if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { | ||||||
|  |                     value_writer.insert(DelAdd::Addition, addition).unwrap(); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 value_writer.finish().unwrap(); | ||||||
|  |                 writer.insert(k, &value_buffer).unwrap() | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     writer.finish().unwrap(); |     writer.finish().unwrap(); | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Merge all the obks in the order we see them. | /// Merge all the obkvs from the newest to the oldest. | ||||||
| pub fn merge_obkvs_and_operations<'a>( | fn inner_merge_del_add_obkvs<'a>( | ||||||
|  |     obkvs: &[Cow<'a, [u8]>], | ||||||
|  |     merge_additions: bool, | ||||||
|  | ) -> Result<Cow<'a, [u8]>> { | ||||||
|  |     // pop the newest operation from the list. | ||||||
|  |     let (newest, obkvs) = obkvs.split_last().unwrap(); | ||||||
|  |     // keep the operation type for the returned value. | ||||||
|  |     let newest_operation_type = newest[0]; | ||||||
|  |  | ||||||
|  |     // treat the newest obkv as the starting point of the merge. | ||||||
|  |     let mut acc_operation_type = newest_operation_type; | ||||||
|  |     let mut acc = newest[1..].to_vec(); | ||||||
|  |     let mut buffer = Vec::new(); | ||||||
|  |     // reverse iter from the most recent to the oldest. | ||||||
|  |     for current in obkvs.iter().rev() { | ||||||
|  |         // if in the previous iteration there was a complete deletion, | ||||||
|  |         // stop the merge process. | ||||||
|  |         if acc_operation_type == Operation::Deletion as u8 { | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let newest = obkv::KvReader::new(&acc); | ||||||
|  |         let oldest = obkv::KvReader::new(&current[1..]); | ||||||
|  |         merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); | ||||||
|  |  | ||||||
|  |         // we want the result of the merge into our accumulator. | ||||||
|  |         std::mem::swap(&mut acc, &mut buffer); | ||||||
|  |         acc_operation_type = current[0]; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     acc.insert(0, newest_operation_type); | ||||||
|  |     Ok(Cow::from(acc)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Merge all the obkvs from the newest to the oldest. | ||||||
|  | pub fn obkvs_merge_additions_and_deletions<'a>( | ||||||
|     _key: &[u8], |     _key: &[u8], | ||||||
|     obkvs: &[Cow<'a, [u8]>], |     obkvs: &[Cow<'a, [u8]>], | ||||||
| ) -> Result<Cow<'a, [u8]>> { | ) -> Result<Cow<'a, [u8]>> { | ||||||
|     // [add, add, delete, add, add] |     inner_merge_del_add_obkvs(obkvs, true) | ||||||
|     // we can ignore everything that happened before the last delete. |  | ||||||
|     let starting_position = |  | ||||||
|         obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0); |  | ||||||
|  |  | ||||||
|     // [add, add, delete] |  | ||||||
|     // if the last operation was a deletion then we simply return the deletion |  | ||||||
|     if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8 |  | ||||||
|     { |  | ||||||
|         return Ok(obkvs[obkvs.len() - 1].clone()); |  | ||||||
|     } |  | ||||||
|     let mut buffer = Vec::new(); |  | ||||||
|  |  | ||||||
|     // (add, add, delete) [add, add] |  | ||||||
|     // in the other case, no deletion will be encountered during the merge |  | ||||||
|     let mut ret = |  | ||||||
|         obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| { |  | ||||||
|             let first = obkv::KvReader::new(&acc); |  | ||||||
|             let second = obkv::KvReader::new(&current[1..]); |  | ||||||
|             merge_two_obkvs(first, second, &mut buffer); |  | ||||||
|  |  | ||||||
|             // we want the result of the merge into our accumulator |  | ||||||
|             std::mem::swap(&mut acc, &mut buffer); |  | ||||||
|             acc |  | ||||||
|         }); |  | ||||||
|  |  | ||||||
|     ret.insert(0, Operation::Addition as u8); |  | ||||||
|     Ok(Cow::from(ret)) |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions. | ||||||
|  | pub fn obkvs_keep_last_addition_merge_deletions<'a>( | ||||||
|  |     _key: &[u8], | ||||||
|  |     obkvs: &[Cow<'a, [u8]>], | ||||||
|  | ) -> Result<Cow<'a, [u8]>> { | ||||||
|  |     inner_merge_del_add_obkvs(obkvs, false) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Do a union of all the CboRoaringBitmaps in the values. | ||||||
| pub fn merge_cbo_roaring_bitmaps<'a>( | pub fn merge_cbo_roaring_bitmaps<'a>( | ||||||
|     _key: &[u8], |     _key: &[u8], | ||||||
|     values: &[Cow<'a, [u8]>], |     values: &[Cow<'a, [u8]>], | ||||||
| @@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>( | |||||||
|         Ok(Cow::from(vec)) |         Ok(Cow::from(vec)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv | ||||||
|  | /// separately and outputs a new DelAdd with both unions. | ||||||
|  | pub fn merge_deladd_cbo_roaring_bitmaps<'a>( | ||||||
|  |     _key: &[u8], | ||||||
|  |     values: &[Cow<'a, [u8]>], | ||||||
|  | ) -> Result<Cow<'a, [u8]>> { | ||||||
|  |     if values.len() == 1 { | ||||||
|  |         Ok(values[0].clone()) | ||||||
|  |     } else { | ||||||
|  |         // Retrieve the bitmaps from both sides | ||||||
|  |         let mut del_bitmaps_bytes = Vec::new(); | ||||||
|  |         let mut add_bitmaps_bytes = Vec::new(); | ||||||
|  |         for value in values { | ||||||
|  |             let obkv = KvReaderDelAdd::new(value); | ||||||
|  |             if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) { | ||||||
|  |                 del_bitmaps_bytes.push(bitmap_bytes); | ||||||
|  |             } | ||||||
|  |             if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) { | ||||||
|  |                 add_bitmaps_bytes.push(bitmap_bytes); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let mut output_deladd_obkv = KvWriterDelAdd::memory(); | ||||||
|  |         let mut buffer = Vec::new(); | ||||||
|  |         CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?; | ||||||
|  |         output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?; | ||||||
|  |         buffer.clear(); | ||||||
|  |         CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?; | ||||||
|  |         output_deladd_obkv.insert(DelAdd::Addition, &buffer)?; | ||||||
|  |         output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// A function that merges a DelAdd of bitmaps into an already existing bitmap. | ||||||
|  | /// | ||||||
|  | /// The first argument is the DelAdd obkv of CboRoaringBitmaps and | ||||||
|  | /// the second one is the CboRoaringBitmap to merge into. | ||||||
|  | pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( | ||||||
|  |     deladd_obkv: &[u8], | ||||||
|  |     previous: &[u8], | ||||||
|  |     buffer: &'a mut Vec<u8>, | ||||||
|  | ) -> Result<Option<&'a [u8]>> { | ||||||
|  |     Ok(CboRoaringBitmapCodec::merge_deladd_into( | ||||||
|  |         KvReaderDelAdd::new(deladd_obkv), | ||||||
|  |         previous, | ||||||
|  |         buffer, | ||||||
|  |     )?) | ||||||
|  | } | ||||||
|   | |||||||
| @@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; | |||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| pub use grenad_helpers::{ | pub use grenad_helpers::{ | ||||||
|     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, |     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, | ||||||
|     merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, |     merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, | ||||||
|     GrenadParameters, MergeableReader, |     GrenadParameters, MergeableReader, | ||||||
| }; | }; | ||||||
| pub use merge_functions::{ | pub use merge_functions::{ | ||||||
|     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, |     keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, | ||||||
|     merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, |     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|     serialize_roaring_bitmap, MergeFn, |     merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, | ||||||
|  |     obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| use crate::MAX_WORD_LENGTH; | use crate::MAX_WORD_LENGTH; | ||||||
| @@ -44,10 +45,6 @@ where | |||||||
|     Some((head, tail)) |     Some((head, tail)) | ||||||
| } | } | ||||||
|  |  | ||||||
| pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ { |  | ||||||
|     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Converts an fst Stream into an HashSet of Strings. | /// Converts an fst Stream into an HashSet of Strings. | ||||||
| pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> | pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> | ||||||
| where | where | ||||||
|   | |||||||
| @@ -20,11 +20,13 @@ use slice_group_by::GroupBy; | |||||||
| use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; | ||||||
|  |  | ||||||
| use self::enrich::enrich_documents_batch; | use self::enrich::enrich_documents_batch; | ||||||
| pub use self::enrich::{extract_finite_float_from_value, DocumentId}; | pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; | ||||||
| pub use self::helpers::{ | pub use self::helpers::{ | ||||||
|     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, |     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, | ||||||
|     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, |     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, | ||||||
|     sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, |     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|  |     merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, | ||||||
|  |     ClonableMmap, MergeFn, | ||||||
| }; | }; | ||||||
| use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | ||||||
| pub use self::transform::{Transform, TransformOutput}; | pub use self::transform::{Transform, TransformOutput}; | ||||||
| @@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; | |||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||||
| use crate::update::{ | use crate::update::{ | ||||||
|     self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, |     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, | ||||||
|     WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, |  | ||||||
| }; | }; | ||||||
| use crate::{Index, Result, RoaringBitmapCodec}; | use crate::{CboRoaringBitmapCodec, Index, Result}; | ||||||
|  |  | ||||||
| static MERGED_DATABASE_COUNT: usize = 7; | static MERGED_DATABASE_COUNT: usize = 7; | ||||||
| static PREFIX_DATABASE_COUNT: usize = 5; | static PREFIX_DATABASE_COUNT: usize = 4; | ||||||
| static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; | static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | ||||||
| @@ -86,7 +87,6 @@ pub struct IndexDocumentsConfig { | |||||||
|     pub words_positions_level_group_size: Option<NonZeroU32>, |     pub words_positions_level_group_size: Option<NonZeroU32>, | ||||||
|     pub words_positions_min_level_size: Option<NonZeroU32>, |     pub words_positions_min_level_size: Option<NonZeroU32>, | ||||||
|     pub update_method: IndexDocumentsMethod, |     pub update_method: IndexDocumentsMethod, | ||||||
|     pub deletion_strategy: DeletionStrategy, |  | ||||||
|     pub autogenerate_docids: bool, |     pub autogenerate_docids: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -178,6 +178,7 @@ where | |||||||
|  |  | ||||||
|         // Early return when there is no document to add |         // Early return when there is no document to add | ||||||
|         if to_delete.is_empty() { |         if to_delete.is_empty() { | ||||||
|  |             // Maintains Invariant: remove documents actually always returns Ok for the inner result | ||||||
|             return Ok((self, Ok(0))); |             return Ok((self, Ok(0))); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -190,14 +191,48 @@ where | |||||||
|  |  | ||||||
|         self.deleted_documents += deleted_documents; |         self.deleted_documents += deleted_documents; | ||||||
|  |  | ||||||
|  |         // Maintains Invariant: remove documents actually always returns Ok for the inner result | ||||||
|         Ok((self, Ok(deleted_documents))) |         Ok((self, Ok(deleted_documents))) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Removes documents from db using their internal document ids. | ||||||
|  |     /// | ||||||
|  |     /// # Warning | ||||||
|  |     /// | ||||||
|  |     /// This function is dangerous and will only work correctly if: | ||||||
|  |     /// | ||||||
|  |     /// - All the passed ids currently exist in the database | ||||||
|  |     /// - No batching using the standard `remove_documents` and `add_documents` took place | ||||||
|  |     /// | ||||||
|  |     /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. | ||||||
|  |     pub fn remove_documents_from_db_no_batch( | ||||||
|  |         mut self, | ||||||
|  |         to_delete: &RoaringBitmap, | ||||||
|  |     ) -> Result<(Self, u64)> { | ||||||
|  |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|  |         // Early return when there is no document to delete | ||||||
|  |         if to_delete.is_empty() { | ||||||
|  |             return Ok((self, 0)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let deleted_documents = self | ||||||
|  |             .transform | ||||||
|  |             .as_mut() | ||||||
|  |             .expect("Invalid document deletion state") | ||||||
|  |             .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)? | ||||||
|  |             as u64; | ||||||
|  |  | ||||||
|  |         self.deleted_documents += deleted_documents; | ||||||
|  |  | ||||||
|  |         Ok((self, deleted_documents)) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     #[logging_timer::time("IndexDocuments::{}")] |     #[logging_timer::time("IndexDocuments::{}")] | ||||||
|     pub fn execute(mut self) -> Result<DocumentAdditionResult> { |     pub fn execute(mut self) -> Result<DocumentAdditionResult> { | ||||||
|         puffin::profile_function!(); |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|         if self.added_documents == 0 { |         if self.added_documents == 0 && self.deleted_documents == 0 { | ||||||
|             let number_of_documents = self.index.number_of_documents(self.wtxn)?; |             let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||||
|             return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); |             return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); | ||||||
|         } |         } | ||||||
| @@ -241,9 +276,6 @@ where | |||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map, |             fields_ids_map, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             new_external_documents_ids, |  | ||||||
|             new_documents_ids, |  | ||||||
|             replaced_documents_ids, |  | ||||||
|             documents_count, |             documents_count, | ||||||
|             original_documents, |             original_documents, | ||||||
|             flattened_documents, |             flattened_documents, | ||||||
| @@ -367,29 +399,12 @@ where | |||||||
|                 let _ = lmdb_writer_sx.send(Err(e)); |                 let _ = lmdb_writer_sx.send(Err(e)); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             // needs to be droped to avoid channel waiting lock. |             // needs to be dropped to avoid channel waiting lock. | ||||||
|             drop(lmdb_writer_sx) |             drop(lmdb_writer_sx) | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         // We delete the documents that this document addition replaces. This way we are |         let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0; | ||||||
|         // able to simply insert all the documents even if they already exist in the database. |  | ||||||
|         if !replaced_documents_ids.is_empty() { |  | ||||||
|             let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; |  | ||||||
|             deletion_builder.strategy(self.config.deletion_strategy); |  | ||||||
|             debug!("documents to delete {:?}", replaced_documents_ids); |  | ||||||
|             deletion_builder.delete_documents(&replaced_documents_ids); |  | ||||||
|             let deleted_documents_result = deletion_builder.execute_inner()?; |  | ||||||
|             debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let index_documents_ids = self.index.documents_ids(self.wtxn)?; |  | ||||||
|         let index_is_empty = index_documents_ids.is_empty(); |  | ||||||
|         let mut final_documents_ids = RoaringBitmap::new(); |         let mut final_documents_ids = RoaringBitmap::new(); | ||||||
|         let mut word_pair_proximity_docids = None; |  | ||||||
|         let mut word_position_docids = None; |  | ||||||
|         let mut word_fid_docids = None; |  | ||||||
|         let mut word_docids = None; |  | ||||||
|         let mut exact_word_docids = None; |  | ||||||
|  |  | ||||||
|         let mut databases_seen = 0; |         let mut databases_seen = 0; | ||||||
|         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { |         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||||
| @@ -397,35 +412,40 @@ where | |||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|  |         let mut word_position_docids = None; | ||||||
|  |         let mut word_fid_docids = None; | ||||||
|  |         let mut word_docids = None; | ||||||
|  |         let mut exact_word_docids = None; | ||||||
|  |  | ||||||
|         for result in lmdb_writer_rx { |         for result in lmdb_writer_rx { | ||||||
|             if (self.should_abort)() { |             if (self.should_abort)() { | ||||||
|                 return Err(Error::InternalError(InternalError::AbortedIndexation)); |                 return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let typed_chunk = match result? { |             let typed_chunk = match result? { | ||||||
|                 TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { |                 TypedChunk::WordDocids { | ||||||
|  |                     word_docids_reader, | ||||||
|  |                     exact_word_docids_reader, | ||||||
|  |                     word_fid_docids_reader, | ||||||
|  |                 } => { | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; |                     let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; | ||||||
|                     word_docids = Some(cloneable_chunk); |                     word_docids = Some(cloneable_chunk); | ||||||
|                     let cloneable_chunk = |                     let cloneable_chunk = | ||||||
|                         unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; |                         unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; | ||||||
|                     exact_word_docids = Some(cloneable_chunk); |                     exact_word_docids = Some(cloneable_chunk); | ||||||
|                     TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } |                     let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; | ||||||
|                 } |                     word_fid_docids = Some(cloneable_chunk); | ||||||
|                 TypedChunk::WordPairProximityDocids(chunk) => { |                     TypedChunk::WordDocids { | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; |                         word_docids_reader, | ||||||
|                     word_pair_proximity_docids = Some(cloneable_chunk); |                         exact_word_docids_reader, | ||||||
|                     TypedChunk::WordPairProximityDocids(chunk) |                         word_fid_docids_reader, | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|                 TypedChunk::WordPositionDocids(chunk) => { |                 TypedChunk::WordPositionDocids(chunk) => { | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; |                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; | ||||||
|                     word_position_docids = Some(cloneable_chunk); |                     word_position_docids = Some(cloneable_chunk); | ||||||
|                     TypedChunk::WordPositionDocids(chunk) |                     TypedChunk::WordPositionDocids(chunk) | ||||||
|                 } |                 } | ||||||
|                 TypedChunk::WordFidDocids(chunk) => { |  | ||||||
|                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; |  | ||||||
|                     word_fid_docids = Some(cloneable_chunk); |  | ||||||
|                     TypedChunk::WordFidDocids(chunk) |  | ||||||
|                 } |  | ||||||
|                 otherwise => otherwise, |                 otherwise => otherwise, | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
| @@ -457,25 +477,16 @@ where | |||||||
|  |  | ||||||
|         // We write the primary key field id into the main database |         // We write the primary key field id into the main database | ||||||
|         self.index.put_primary_key(self.wtxn, &primary_key)?; |         self.index.put_primary_key(self.wtxn, &primary_key)?; | ||||||
|  |         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||||
|         // We write the external documents ids into the main database. |  | ||||||
|         let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?; |  | ||||||
|         external_documents_ids.insert_ids(&new_external_documents_ids)?; |  | ||||||
|         let external_documents_ids = external_documents_ids.into_static(); |  | ||||||
|         self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; |  | ||||||
|  |  | ||||||
|         let all_documents_ids = index_documents_ids | new_documents_ids; |  | ||||||
|         self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; |  | ||||||
|  |  | ||||||
|         self.execute_prefix_databases( |         self.execute_prefix_databases( | ||||||
|             word_docids, |             word_docids, | ||||||
|             exact_word_docids, |             exact_word_docids, | ||||||
|             word_pair_proximity_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         Ok(all_documents_ids.len()) |         Ok(number_of_documents) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[logging_timer::time("IndexDocuments::{}")] |     #[logging_timer::time("IndexDocuments::{}")] | ||||||
| @@ -483,7 +494,6 @@ where | |||||||
|         self, |         self, | ||||||
|         word_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>, |         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>, |  | ||||||
|         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>, |         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>, | ||||||
|     ) -> Result<()> |     ) -> Result<()> | ||||||
| @@ -604,32 +614,6 @@ where | |||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |             total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|         if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { |  | ||||||
|             // Run the word prefix pair proximity docids update operation. |  | ||||||
|             PrefixWordPairsProximityDocids::new( |  | ||||||
|                 self.wtxn, |  | ||||||
|                 self.index, |  | ||||||
|                 self.indexer_config.chunk_compression_type, |  | ||||||
|                 self.indexer_config.chunk_compression_level, |  | ||||||
|             ) |  | ||||||
|             .execute( |  | ||||||
|                 word_pair_proximity_docids, |  | ||||||
|                 &new_prefix_fst_words, |  | ||||||
|                 &common_prefix_fst_words, |  | ||||||
|                 &del_prefix_fst_words, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if (self.should_abort)() { |  | ||||||
|             return Err(Error::InternalError(InternalError::AbortedIndexation)); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         databases_seen += 1; |  | ||||||
|         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { |  | ||||||
|             databases_seen, |  | ||||||
|             total_databases: TOTAL_POSTING_DATABASE_COUNT, |  | ||||||
|         }); |  | ||||||
|  |  | ||||||
|         if let Some(word_position_docids) = word_position_docids { |         if let Some(word_position_docids) = word_position_docids { | ||||||
|             // Run the words prefix position docids update operation. |             // Run the words prefix position docids update operation. | ||||||
|             let mut builder = WordPrefixIntegerDocids::new( |             let mut builder = WordPrefixIntegerDocids::new( | ||||||
| @@ -687,8 +671,8 @@ where | |||||||
| fn execute_word_prefix_docids( | fn execute_word_prefix_docids( | ||||||
|     txn: &mut heed::RwTxn, |     txn: &mut heed::RwTxn, | ||||||
|     reader: grenad::Reader<Cursor<ClonableMmap>>, |     reader: grenad::Reader<Cursor<ClonableMmap>>, | ||||||
|     word_docids_db: Database<Str, RoaringBitmapCodec>, |     word_docids_db: Database<Str, CboRoaringBitmapCodec>, | ||||||
|     word_prefix_docids_db: Database<Str, RoaringBitmapCodec>, |     word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>, | ||||||
|     indexer_config: &IndexerConfig, |     indexer_config: &IndexerConfig, | ||||||
|     new_prefix_fst_words: &[String], |     new_prefix_fst_words: &[String], | ||||||
|     common_prefix_fst_words: &[&[String]], |     common_prefix_fst_words: &[&[String]], | ||||||
| @@ -709,14 +693,15 @@ fn execute_word_prefix_docids( | |||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests { | mod tests { | ||||||
|     use big_s::S; |     use big_s::S; | ||||||
|  |     use fst::IntoStreamer; | ||||||
|  |     use heed::RwTxn; | ||||||
|     use maplit::hashset; |     use maplit::hashset; | ||||||
|  |  | ||||||
|     use super::*; |     use super::*; | ||||||
|     use crate::documents::documents_batch_reader_from_objects; |     use crate::documents::documents_batch_reader_from_objects; | ||||||
|     use crate::index::tests::TempIndex; |     use crate::index::tests::TempIndex; | ||||||
|     use crate::search::TermsMatchingStrategy; |     use crate::search::TermsMatchingStrategy; | ||||||
|     use crate::update::DeleteDocuments; |     use crate::{db_snap, Filter, Search, BEU16}; | ||||||
|     use crate::{db_snap, BEU16}; |  | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn simple_document_replacement() { |     fn simple_document_replacement() { | ||||||
| @@ -807,11 +792,10 @@ mod tests { | |||||||
|         assert_eq!(count, 1); |         assert_eq!(count, 1); | ||||||
|  |  | ||||||
|         // Check that we get only one document from the database. |         // Check that we get only one document from the database. | ||||||
|         // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 |         let docs = index.documents(&rtxn, Some(0)).unwrap(); | ||||||
|         let docs = index.documents(&rtxn, Some(1)).unwrap(); |  | ||||||
|         assert_eq!(docs.len(), 1); |         assert_eq!(docs.len(), 1); | ||||||
|         let (id, doc) = docs[0]; |         let (id, doc) = docs[0]; | ||||||
|         assert_eq!(id, 1); |         assert_eq!(id, 0); | ||||||
|  |  | ||||||
|         // Check that this document is equal to the last one sent. |         // Check that this document is equal to the last one sent. | ||||||
|         let mut doc_iter = doc.iter(); |         let mut doc_iter = doc.iter(); | ||||||
| @@ -872,7 +856,7 @@ mod tests { | |||||||
|         assert_eq!(count, 3); |         assert_eq!(count, 3); | ||||||
|  |  | ||||||
|         // the document 0 has been deleted and reinserted with the id 3 |         // the document 0 has been deleted and reinserted with the id 3 | ||||||
|         let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); |         let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap(); | ||||||
|         let kevin_position = |         let kevin_position = | ||||||
|             docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); |             docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); | ||||||
|         assert_eq!(kevin_position, 2); |         assert_eq!(kevin_position, 2); | ||||||
| @@ -1018,7 +1002,6 @@ mod tests { | |||||||
|         assert_eq!(count, 6); |         assert_eq!(count, 6); | ||||||
|  |  | ||||||
|         db_snap!(index, word_docids, "updated"); |         db_snap!(index, word_docids, "updated"); | ||||||
|         db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); |  | ||||||
|  |  | ||||||
|         drop(rtxn); |         drop(rtxn); | ||||||
|     } |     } | ||||||
| @@ -1121,17 +1104,15 @@ mod tests { | |||||||
|                 { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } |                 { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } | ||||||
|             ])) |             ])) | ||||||
|             .unwrap(); |             .unwrap(); | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |  | ||||||
|         assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); |  | ||||||
|  |  | ||||||
|         // Delete not all of the documents but some of them. |         // Delete not all of the documents but some of them. | ||||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |         index.delete_document("30"); | ||||||
|         builder.delete_external_id("30"); |  | ||||||
|         builder.execute().unwrap(); |  | ||||||
|  |  | ||||||
|         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); |         let txn = index.read_txn().unwrap(); | ||||||
|         assert!(external_documents_ids.get("30").is_none()); |         assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId")); | ||||||
|         wtxn.commit().unwrap(); |  | ||||||
|  |         let external_documents_ids = index.external_documents_ids(); | ||||||
|  |         assert!(external_documents_ids.get(&txn, "30").unwrap().is_none()); | ||||||
|  |  | ||||||
|         index |         index | ||||||
|             .add_documents(documents!([ |             .add_documents(documents!([ | ||||||
| @@ -1140,8 +1121,8 @@ mod tests { | |||||||
|             .unwrap(); |             .unwrap(); | ||||||
|  |  | ||||||
|         let wtxn = index.write_txn().unwrap(); |         let wtxn = index.write_txn().unwrap(); | ||||||
|         let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); |         let external_documents_ids = index.external_documents_ids(); | ||||||
|         assert!(external_documents_ids.get("30").is_some()); |         assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some()); | ||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|         index |         index | ||||||
| @@ -1435,8 +1416,10 @@ mod tests { | |||||||
|         index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap(); |         index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap(); | ||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); |         let all_documents_count = index.all_documents(&rtxn).unwrap().count(); | ||||||
|         assert!(external_documents_ids.get("1").is_some()); |         assert_eq!(all_documents_count, 1); | ||||||
|  |         let external_documents_ids = index.external_documents_ids(); | ||||||
|  |         assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some()); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
| @@ -1490,12 +1473,6 @@ mod tests { | |||||||
|         3   2    second       second |         3   2    second       second | ||||||
|         3   3    third        third |         3   3    third        third | ||||||
|         "###); |         "###); | ||||||
|         db_snap!(index, string_faceted_documents_ids, @r###" |  | ||||||
|         0   [] |  | ||||||
|         1   [] |  | ||||||
|         2   [] |  | ||||||
|         3   [0, 1, 2, 3, ] |  | ||||||
|         "###); |  | ||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
| @@ -1519,12 +1496,6 @@ mod tests { | |||||||
|  |  | ||||||
|         db_snap!(index, facet_id_string_docids, @""); |         db_snap!(index, facet_id_string_docids, @""); | ||||||
|         db_snap!(index, field_id_docid_facet_strings, @""); |         db_snap!(index, field_id_docid_facet_strings, @""); | ||||||
|         db_snap!(index, string_faceted_documents_ids, @r###" |  | ||||||
|         0   [] |  | ||||||
|         1   [] |  | ||||||
|         2   [] |  | ||||||
|         3   [0, 1, 2, 3, ] |  | ||||||
|         "###); |  | ||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
| @@ -1551,12 +1522,6 @@ mod tests { | |||||||
|         3   2    second       second |         3   2    second       second | ||||||
|         3   3    third        third |         3   3    third        third | ||||||
|         "###); |         "###); | ||||||
|         db_snap!(index, string_faceted_documents_ids, @r###" |  | ||||||
|         0   [] |  | ||||||
|         1   [] |  | ||||||
|         2   [] |  | ||||||
|         3   [0, 1, 2, 3, ] |  | ||||||
|         "###); |  | ||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
| @@ -1719,7 +1684,7 @@ mod tests { | |||||||
|  |  | ||||||
|         let wtxn = index.read_txn().unwrap(); |         let wtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|         let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); |         let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap(); | ||||||
|         let ids = map.values().collect::<HashSet<_>>(); |         let ids = map.values().collect::<HashSet<_>>(); | ||||||
|  |  | ||||||
|         assert_eq!(ids.len(), map.len()); |         assert_eq!(ids.len(), map.len()); | ||||||
| @@ -2531,17 +2496,8 @@ mod tests { | |||||||
|         db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); |         db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); | ||||||
|         db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); |         db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); | ||||||
|  |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |  | ||||||
|  |  | ||||||
|         // Delete not all of the documents but some of them. |         // Delete not all of the documents but some of them. | ||||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); |         index.delete_documents(vec!["0".into(), "3".into()]); | ||||||
|         builder.strategy(DeletionStrategy::AlwaysHard); |  | ||||||
|         builder.delete_external_id("0"); |  | ||||||
|         builder.delete_external_id("3"); |  | ||||||
|         let result = builder.execute().unwrap(); |  | ||||||
|         println!("{result:?}"); |  | ||||||
|  |  | ||||||
|         wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|         db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); |         db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); | ||||||
|         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); |         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); | ||||||
| @@ -2596,8 +2552,7 @@ mod tests { | |||||||
|             ), |             ), | ||||||
|         ] |         ] | ||||||
|         */ |         */ | ||||||
|         let mut index = TempIndex::new(); |         let index = TempIndex::new(); | ||||||
|         index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; |  | ||||||
|  |  | ||||||
|         // START OF BATCH |         // START OF BATCH | ||||||
|  |  | ||||||
| @@ -2637,8 +2592,7 @@ mod tests { | |||||||
|         {"id":1,"doggo":"bernese"} |         {"id":1,"doggo":"bernese"} | ||||||
|         "###); |         "###); | ||||||
|         db_snap!(index, external_documents_ids, @r###" |         db_snap!(index, external_documents_ids, @r###" | ||||||
|         soft: |         docids: | ||||||
|         hard: |  | ||||||
|         1                        0 |         1                        0 | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
| @@ -2683,13 +2637,10 @@ mod tests { | |||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
|         db_snap!(index, external_documents_ids, @r###" |         db_snap!(index, external_documents_ids, @r###" | ||||||
|         soft: |         docids: | ||||||
|         hard: |  | ||||||
|         0                        1 |         0                        1 | ||||||
|         "###); |         "###); | ||||||
|  |  | ||||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); |  | ||||||
|  |  | ||||||
|         // BATCH 3 |         // BATCH 3 | ||||||
|  |  | ||||||
|         println!("--- ENTERING BATCH 3"); |         println!("--- ENTERING BATCH 3"); | ||||||
| @@ -2731,4 +2682,537 @@ mod tests { | |||||||
|         let res = index.search(&rtxn).execute().unwrap(); |         let res = index.search(&rtxn).execute().unwrap(); | ||||||
|         index.documents(&rtxn, res.documents_ids).unwrap(); |         index.documents(&rtxn, res.documents_ids).unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     fn delete_documents<'t>( | ||||||
|  |         wtxn: &mut RwTxn<'t, '_>, | ||||||
|  |         index: &'t TempIndex, | ||||||
|  |         external_ids: &[&str], | ||||||
|  |     ) -> Vec<u32> { | ||||||
|  |         let external_document_ids = index.external_documents_ids(); | ||||||
|  |         let ids_to_delete: Vec<u32> = external_ids | ||||||
|  |             .iter() | ||||||
|  |             .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap()) | ||||||
|  |             .collect(); | ||||||
|  |  | ||||||
|  |         // Delete some documents. | ||||||
|  |         index.delete_documents_using_wtxn( | ||||||
|  |             wtxn, | ||||||
|  |             external_ids.iter().map(ToString::to_string).collect(), | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         ids_to_delete | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn delete_documents_with_numbers_as_primary_key() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, | ||||||
|  |                     { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, | ||||||
|  |                     { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // delete those documents, ids are synchronous therefore 0, 1, and 2. | ||||||
|  |         index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // All these snapshots should be empty since the database was cleared | ||||||
|  |         db_snap!(index, documents_ids); | ||||||
|  |         db_snap!(index, word_docids); | ||||||
|  |         db_snap!(index, word_pair_proximity_docids); | ||||||
|  |         db_snap!(index, facet_id_exists_docids); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         assert!(index.field_distribution(&rtxn).unwrap().is_empty()); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn delete_documents_with_strange_primary_key() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "mysuperid": 0, "name": "kevin" }, | ||||||
|  |                     { "mysuperid": 1, "name": "kevina" }, | ||||||
|  |                     { "mysuperid": 2, "name": "benoit" } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         // Delete not all of the documents but some of them. | ||||||
|  |         index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         db_snap!(index, documents_ids); | ||||||
|  |         db_snap!(index, word_docids); | ||||||
|  |         db_snap!(index, word_pair_proximity_docids); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn filtered_placeholder_search_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("docid")); | ||||||
|  |                 settings.set_filterable_fields(hashset! { S("label"), S("label2") }); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "docid": "1_4",  "label": ["sign"] }, | ||||||
|  |                     { "docid": "1_5",  "label": ["letter"] }, | ||||||
|  |                     { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] }, | ||||||
|  |                     { "docid": "1_36", "label": ["drawing","painting","pattern"] }, | ||||||
|  |                     { "docid": "1_37", "label": ["art","drawing","outdoor"] }, | ||||||
|  |                     { "docid": "1_38", "label": ["aquarium","art","drawing"] }, | ||||||
|  |                     { "docid": "1_39", "label": ["abstract"] }, | ||||||
|  |                     { "docid": "1_40", "label": ["cartoon"] }, | ||||||
|  |                     { "docid": "1_41", "label": ["art","drawing"] }, | ||||||
|  |                     { "docid": "1_42", "label": ["art","pattern"] }, | ||||||
|  |                     { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_44", "label": ["drawing"] }, | ||||||
|  |                     { "docid": "1_45", "label": ["art"] }, | ||||||
|  |                     { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, | ||||||
|  |                     { "docid": "1_47", "label": ["abstract","pattern"] }, | ||||||
|  |                     { "docid": "1_52", "label": ["abstract","cartoon"] }, | ||||||
|  |                     { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_58", "label": ["abstract","art","cartoon"] }, | ||||||
|  |                     { "docid": "1_68", "label": ["design"] }, | ||||||
|  |                     { "docid": "1_69", "label": ["geometry"] }, | ||||||
|  |                     { "docid": "1_70", "label2": ["geometry", 1.2] }, | ||||||
|  |                     { "docid": "1_71", "label2": ["design", 2.2] }, | ||||||
|  |                     { "docid": "1_72", "label2": ["geometry", 1.2] } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]); | ||||||
|  |  | ||||||
|  |         // Placeholder search with filter | ||||||
|  |         let filter = Filter::from_str("label = sign").unwrap().unwrap(); | ||||||
|  |         let results = index.search(&wtxn).filter(filter).execute().unwrap(); | ||||||
|  |         assert!(results.documents_ids.is_empty()); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         db_snap!(index, word_docids); | ||||||
|  |         db_snap!(index, facet_id_f64_docids); | ||||||
|  |         db_snap!(index, word_pair_proximity_docids); | ||||||
|  |         db_snap!(index, facet_id_exists_docids); | ||||||
|  |         db_snap!(index, facet_id_string_docids); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn placeholder_search_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("docid")); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "docid": "1_4",  "label": ["sign"] }, | ||||||
|  |                     { "docid": "1_5",  "label": ["letter"] }, | ||||||
|  |                     { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] }, | ||||||
|  |                     { "docid": "1_36", "label": ["drawing","painting","pattern"] }, | ||||||
|  |                     { "docid": "1_37", "label": ["art","drawing","outdoor"] }, | ||||||
|  |                     { "docid": "1_38", "label": ["aquarium","art","drawing"] }, | ||||||
|  |                     { "docid": "1_39", "label": ["abstract"] }, | ||||||
|  |                     { "docid": "1_40", "label": ["cartoon"] }, | ||||||
|  |                     { "docid": "1_41", "label": ["art","drawing"] }, | ||||||
|  |                     { "docid": "1_42", "label": ["art","pattern"] }, | ||||||
|  |                     { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_44", "label": ["drawing"] }, | ||||||
|  |                     { "docid": "1_45", "label": ["art"] }, | ||||||
|  |                     { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, | ||||||
|  |                     { "docid": "1_47", "label": ["abstract","pattern"] }, | ||||||
|  |                     { "docid": "1_52", "label": ["abstract","cartoon"] }, | ||||||
|  |                     { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_58", "label": ["abstract","art","cartoon"] }, | ||||||
|  |                     { "docid": "1_68", "label": ["design"] }, | ||||||
|  |                     { "docid": "1_69", "label": ["geometry"] }, | ||||||
|  |                     { "docid": "1_70", "label2": ["geometry", 1.2] }, | ||||||
|  |                     { "docid": "1_71", "label2": ["design", 2.2] }, | ||||||
|  |                     { "docid": "1_72", "label2": ["geometry", 1.2] } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); | ||||||
|  |  | ||||||
|  |         // Placeholder search | ||||||
|  |         let results = index.search(&wtxn).execute().unwrap(); | ||||||
|  |         assert!(!results.documents_ids.is_empty()); | ||||||
|  |         for id in results.documents_ids.iter() { | ||||||
|  |             assert!( | ||||||
|  |                 !deleted_internal_ids.contains(id), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn search_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("docid")); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "docid": "1_4",  "label": ["sign"] }, | ||||||
|  |                     { "docid": "1_5",  "label": ["letter"] }, | ||||||
|  |                     { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] }, | ||||||
|  |                     { "docid": "1_36", "label": ["drawing","painting","pattern"] }, | ||||||
|  |                     { "docid": "1_37", "label": ["art","drawing","outdoor"] }, | ||||||
|  |                     { "docid": "1_38", "label": ["aquarium","art","drawing"] }, | ||||||
|  |                     { "docid": "1_39", "label": ["abstract"] }, | ||||||
|  |                     { "docid": "1_40", "label": ["cartoon"] }, | ||||||
|  |                     { "docid": "1_41", "label": ["art","drawing"] }, | ||||||
|  |                     { "docid": "1_42", "label": ["art","pattern"] }, | ||||||
|  |                     { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_44", "label": ["drawing"] }, | ||||||
|  |                     { "docid": "1_45", "label": ["art"] }, | ||||||
|  |                     { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, | ||||||
|  |                     { "docid": "1_47", "label": ["abstract","pattern"] }, | ||||||
|  |                     { "docid": "1_52", "label": ["abstract","cartoon"] }, | ||||||
|  |                     { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_58", "label": ["abstract","art","cartoon"] }, | ||||||
|  |                     { "docid": "1_68", "label": ["design"] }, | ||||||
|  |                     { "docid": "1_69", "label": ["geometry"] }, | ||||||
|  |                     { "docid": "1_70", "label2": ["geometry", 1.2] }, | ||||||
|  |                     { "docid": "1_71", "label2": ["design", 2.2] }, | ||||||
|  |                     { "docid": "1_72", "label2": ["geometry", 1.2] } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); | ||||||
|  |  | ||||||
|  |         // search for abstract | ||||||
|  |         let results = index.search(&wtxn).query("abstract").execute().unwrap(); | ||||||
|  |         assert!(!results.documents_ids.is_empty()); | ||||||
|  |         for id in results.documents_ids.iter() { | ||||||
|  |             assert!( | ||||||
|  |                 !deleted_internal_ids.contains(id), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("id")); | ||||||
|  |                 settings.set_filterable_fields(hashset!(S("_geo"))); | ||||||
|  |                 settings.set_sortable_fields(hashset!(S("_geo"))); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         index.add_documents_using_wtxn(&mut wtxn, documents!([ | ||||||
|  |             { "id": "1",  "city": "Lille",             "_geo": { "lat": 50.6299, "lng": 3.0569 } }, | ||||||
|  |             { "id": "2",  "city": "Mons-en-Barœul",    "_geo": { "lat": 50.6415, "lng": 3.1106 } }, | ||||||
|  |             { "id": "3",  "city": "Hellemmes",         "_geo": { "lat": 50.6312, "lng": 3.1106 } }, | ||||||
|  |             { "id": "4",  "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, | ||||||
|  |             { "id": "5",  "city": "Hem",               "_geo": { "lat": 50.6552, "lng": 3.1897 } }, | ||||||
|  |             { "id": "6",  "city": "Roubaix",           "_geo": { "lat": 50.6924, "lng": 3.1763 } }, | ||||||
|  |             { "id": "7",  "city": "Tourcoing",         "_geo": { "lat": 50.7263, "lng": 3.1541 } }, | ||||||
|  |             { "id": "8",  "city": "Mouscron",          "_geo": { "lat": 50.7453, "lng": 3.2206 } }, | ||||||
|  |             { "id": "9",  "city": "Tournai",           "_geo": { "lat": 50.6053, "lng": 3.3758 } }, | ||||||
|  |             { "id": "10", "city": "Ghent",             "_geo": { "lat": 51.0537, "lng": 3.6957 } }, | ||||||
|  |             { "id": "11", "city": "Brussels",          "_geo": { "lat": 50.8466, "lng": 4.3370 } }, | ||||||
|  |             { "id": "12", "city": "Charleroi",         "_geo": { "lat": 50.4095, "lng": 4.4347 } }, | ||||||
|  |             { "id": "13", "city": "Mons",              "_geo": { "lat": 50.4502, "lng": 3.9623 } }, | ||||||
|  |             { "id": "14", "city": "Valenciennes",      "_geo": { "lat": 50.3518, "lng": 3.5326 } }, | ||||||
|  |             { "id": "15", "city": "Arras",             "_geo": { "lat": 50.2844, "lng": 2.7637 } }, | ||||||
|  |             { "id": "16", "city": "Cambrai",           "_geo": { "lat": 50.1793, "lng": 3.2189 } }, | ||||||
|  |             { "id": "17", "city": "Bapaume",           "_geo": { "lat": 50.1112, "lng": 2.8547 } }, | ||||||
|  |             { "id": "18", "city": "Amiens",            "_geo": { "lat": 49.9314, "lng": 2.2710 } }, | ||||||
|  |             { "id": "19", "city": "Compiègne",         "_geo": { "lat": 49.4449, "lng": 2.7913 } }, | ||||||
|  |             { "id": "20", "city": "Paris",             "_geo": { "lat": 48.9021, "lng": 2.3708 } } | ||||||
|  |         ])).unwrap(); | ||||||
|  |  | ||||||
|  |         let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; | ||||||
|  |         let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); | ||||||
|  |  | ||||||
|  |         // Placeholder search with geo filter | ||||||
|  |         let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); | ||||||
|  |         let results = index.search(&wtxn).filter(filter).execute().unwrap(); | ||||||
|  |         assert!(!results.documents_ids.is_empty()); | ||||||
|  |         for id in results.documents_ids.iter() { | ||||||
|  |             assert!( | ||||||
|  |                 !deleted_internal_ids.contains(id), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         db_snap!(index, facet_id_f64_docids); | ||||||
|  |         db_snap!(index, facet_id_string_docids); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Checks that documents removed with `delete_documents` no longer show up when | ||||||
|  |     /// listing all documents, listing internal document ids, or resolving the deleted | ||||||
|  |     /// external ids through `external_documents_ids`. | ||||||
|  |     #[test] | ||||||
|  |     fn get_documents_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         // Settings, insertion and deletion all go through this single write transaction, | ||||||
|  |         // which is only committed after the first two checks below. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("docid")); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // Fixture: 23 documents; the last three carry a `label2` field mixing strings | ||||||
|  |         // and numbers instead of `label`. | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                     { "docid": "1_4",  "label": ["sign"] }, | ||||||
|  |                     { "docid": "1_5",  "label": ["letter"] }, | ||||||
|  |                     { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] }, | ||||||
|  |                     { "docid": "1_36", "label": ["drawing","painting","pattern"] }, | ||||||
|  |                     { "docid": "1_37", "label": ["art","drawing","outdoor"] }, | ||||||
|  |                     { "docid": "1_38", "label": ["aquarium","art","drawing"] }, | ||||||
|  |                     { "docid": "1_39", "label": ["abstract"] }, | ||||||
|  |                     { "docid": "1_40", "label": ["cartoon"] }, | ||||||
|  |                     { "docid": "1_41", "label": ["art","drawing"] }, | ||||||
|  |                     { "docid": "1_42", "label": ["art","pattern"] }, | ||||||
|  |                     { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_44", "label": ["drawing"] }, | ||||||
|  |                     { "docid": "1_45", "label": ["art"] }, | ||||||
|  |                     { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, | ||||||
|  |                     { "docid": "1_47", "label": ["abstract","pattern"] }, | ||||||
|  |                     { "docid": "1_52", "label": ["abstract","cartoon"] }, | ||||||
|  |                     { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, | ||||||
|  |                     { "docid": "1_58", "label": ["abstract","art","cartoon"] }, | ||||||
|  |                     { "docid": "1_68", "label": ["design"] }, | ||||||
|  |                     { "docid": "1_69", "label": ["geometry"] }, | ||||||
|  |                     { "docid": "1_70", "label2": ["geometry", 1.2] }, | ||||||
|  |                     { "docid": "1_71", "label2": ["design", 2.2] }, | ||||||
|  |                     { "docid": "1_72", "label2": ["geometry", 1.2] } | ||||||
|  |                 ]), | ||||||
|  |             ) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // NOTE(review): `delete_documents` is a test helper defined outside this view; | ||||||
|  |         // presumably it returns the internal ids of the documents it removed. | ||||||
|  |         let deleted_external_ids = ["1_7", "1_52"]; | ||||||
|  |         let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); | ||||||
|  |  | ||||||
|  |         // list all documents | ||||||
|  |         let results = index.all_documents(&wtxn).unwrap(); | ||||||
|  |         for result in results { | ||||||
|  |             let (id, _) = result.unwrap(); | ||||||
|  |             assert!( | ||||||
|  |                 !deleted_internal_ids.contains(&id), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // list internal document ids | ||||||
|  |         let results = index.documents_ids(&wtxn).unwrap(); | ||||||
|  |         for id in results { | ||||||
|  |             assert!( | ||||||
|  |                 !deleted_internal_ids.contains(&id), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |         // Commit, then run the last check through a fresh read transaction. | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         // get internal docids from deleted external document ids | ||||||
|  |         let results = index.external_documents_ids(); | ||||||
|  |         for id in deleted_external_ids { | ||||||
|  |             assert!( | ||||||
|  |                 results.get(&rtxn, id).unwrap().is_none(), | ||||||
|  |                 "The document {} was supposed to be deleted", | ||||||
|  |                 id | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |         drop(rtxn); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Checks that `number_of_documents` and `field_distribution` no longer account for | ||||||
|  |     /// documents that were removed with `delete_documents`. | ||||||
|  |     #[test] | ||||||
|  |     fn stats_should_not_return_deleted_documents() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .update_settings_using_wtxn(&mut wtxn, |settings| { | ||||||
|  |                 settings.set_primary_key(S("docid")); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // Fixture: 20 documents, all with `label`; `title` is set on "1_7" and "1_38", | ||||||
|  |         // `number` on "1_43" and "1_44". | ||||||
|  |         index.add_documents_using_wtxn(&mut wtxn, documents!([ | ||||||
|  |             { "docid": "1_4",  "label": ["sign"]}, | ||||||
|  |             { "docid": "1_5",  "label": ["letter"]}, | ||||||
|  |             { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, | ||||||
|  |             { "docid": "1_36", "label": ["drawing","painting","pattern"]}, | ||||||
|  |             { "docid": "1_37", "label": ["art","drawing","outdoor"]}, | ||||||
|  |             { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, | ||||||
|  |             { "docid": "1_39", "label": ["abstract"]}, | ||||||
|  |             { "docid": "1_40", "label": ["cartoon"]}, | ||||||
|  |             { "docid": "1_41", "label": ["art","drawing"]}, | ||||||
|  |             { "docid": "1_42", "label": ["art","pattern"]}, | ||||||
|  |             { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, | ||||||
|  |             { "docid": "1_44", "label": ["drawing"], "number": 44i32}, | ||||||
|  |             { "docid": "1_45", "label": ["art"]}, | ||||||
|  |             { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, | ||||||
|  |             { "docid": "1_47", "label": ["abstract","pattern"]}, | ||||||
|  |             { "docid": "1_52", "label": ["abstract","cartoon"]}, | ||||||
|  |             { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, | ||||||
|  |             { "docid": "1_58", "label": ["abstract","art","cartoon"]}, | ||||||
|  |             { "docid": "1_68", "label": ["design"]}, | ||||||
|  |             { "docid": "1_69", "label": ["geometry"]} | ||||||
|  |         ])).unwrap(); | ||||||
|  |  | ||||||
|  |         // Remove two documents; "1_7" is one of the two documents carrying a `title`. | ||||||
|  |         delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); | ||||||
|  |  | ||||||
|  |         // count internal documents | ||||||
|  |         // 20 inserted - 2 deleted = 18 | ||||||
|  |         let results = index.number_of_documents(&wtxn).unwrap(); | ||||||
|  |         assert_eq!(18, results); | ||||||
|  |  | ||||||
|  |         // count field distribution | ||||||
|  |         // `label`: 18 remaining documents; `title`: only "1_38" is left; | ||||||
|  |         // `number`: "1_43" and "1_44" were not deleted. | ||||||
|  |         let results = index.field_distribution(&wtxn).unwrap(); | ||||||
|  |         assert_eq!(Some(&18), results.get("label")); | ||||||
|  |         assert_eq!(Some(&1), results.get("title")); | ||||||
|  |         assert_eq!(Some(&2), results.get("number")); | ||||||
|  |  | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Checks that the document ids stored for a (script, language) pair no longer | ||||||
|  |     /// contain a document once it has been removed with `delete_documents`. | ||||||
|  |     #[test] | ||||||
|  |     fn stored_detected_script_and_language_should_not_return_deleted_documents() { | ||||||
|  |         use charabia::{Language, Script}; | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         // Fixture: six titles written in different scripts. | ||||||
|  |         index | ||||||
|  |             .add_documents_using_wtxn( | ||||||
|  |                 &mut wtxn, | ||||||
|  |                 documents!([ | ||||||
|  |                 { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, | ||||||
|  |                 { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, | ||||||
|  |                 { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, | ||||||
|  |                 { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, | ||||||
|  |                 { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, | ||||||
|  |                 { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, | ||||||
|  |             ])) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // Before deletion: the (Cj, Cmn) entry is expected to hold docids 1 and 5. | ||||||
|  |         // NOTE(review): the expected values are internal docids; presumably they follow | ||||||
|  |         // insertion order and so line up with external ids "1" and "5" — confirm. | ||||||
|  |         let key_cmn = (Script::Cj, Language::Cmn); | ||||||
|  |         // A missing entry is treated as an empty bitmap via `unwrap_or_default`. | ||||||
|  |         let cj_cmn_docs = | ||||||
|  |             index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); | ||||||
|  |         let mut expected_cj_cmn_docids = RoaringBitmap::new(); | ||||||
|  |         expected_cj_cmn_docids.push(1); | ||||||
|  |         expected_cj_cmn_docids.push(5); | ||||||
|  |         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); | ||||||
|  |  | ||||||
|  |         delete_documents(&mut wtxn, &index, &["1"]); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // After deletion and commit: only docid 5 is expected to remain for (Cj, Cmn). | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let cj_cmn_docs = | ||||||
|  |             index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); | ||||||
|  |         let mut expected_cj_cmn_docids = RoaringBitmap::new(); | ||||||
|  |         expected_cj_cmn_docids.push(5); | ||||||
|  |         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Checks that deleting a document whose only occurrence of a word lives in an exact | ||||||
|  |     /// attribute empties `exact_word_docids`, while `word_docids`, the words FST and a | ||||||
|  |     /// search for that word still reflect the remaining document. | ||||||
|  |     #[test] | ||||||
|  |     fn delete_words_exact_attributes() { | ||||||
|  |         let index = TempIndex::new(); | ||||||
|  |  | ||||||
|  |         // `text` and `exact` are both searchable; `exact` is also an exact attribute. | ||||||
|  |         index | ||||||
|  |             .update_settings(|settings| { | ||||||
|  |                 settings.set_primary_key(S("id")); | ||||||
|  |                 settings.set_searchable_fields(vec![S("text"), S("exact")]); | ||||||
|  |                 settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // The same word is indexed once through `text` (doc 0) and once through `exact` (doc 1). | ||||||
|  |         index | ||||||
|  |             .add_documents(documents!([ | ||||||
|  |                 { "id": 0, "text": "hello" }, | ||||||
|  |                 { "id": 1, "exact": "hello"} | ||||||
|  |             ])) | ||||||
|  |             .unwrap(); | ||||||
|  |         // Snapshots named `1`: database state before the deletion. | ||||||
|  |         db_snap!(index, word_docids, 1, @r###" | ||||||
|  |         hello            [0, ] | ||||||
|  |         "###); | ||||||
|  |         db_snap!(index, exact_word_docids, 1, @r###" | ||||||
|  |         hello            [1, ] | ||||||
|  |         "###); | ||||||
|  |         db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); | ||||||
|  |  | ||||||
|  |         // Delete the document that only matched through the exact attribute. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // Snapshots named `2`: `exact_word_docids` is now empty, while `word_docids` and | ||||||
|  |         // the words FST snapshot are unchanged because doc 0 still contains "hello". | ||||||
|  |         db_snap!(index, word_docids, 2, @r###" | ||||||
|  |         hello            [0, ] | ||||||
|  |         "###); | ||||||
|  |         db_snap!(index, exact_word_docids, 2, @""); | ||||||
|  |         db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); | ||||||
|  |  | ||||||
|  |         insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); | ||||||
|  |         let txn = index.read_txn().unwrap(); | ||||||
|  |         let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); | ||||||
|  |         insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); | ||||||
|  |  | ||||||
|  |         // Searching the word must only return the surviving document. | ||||||
|  |         let mut s = Search::new(&txn, &index); | ||||||
|  |         s.query("hello"); | ||||||
|  |         let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|  |         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  | [] | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  |  | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  |  | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  |  | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  | [2, ] | ||||||
| @@ -0,0 +1,5 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  | benoit           [2, ] | ||||||
|  |  | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  |  | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/delete_documents.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] | 1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] | ||||||
| 2   [21, ] | 2   [21, ] | ||||||
| @@ -0,0 +1,5 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  | 2   0  2.2    1  [21, ] | ||||||
|  |  | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/delete_documents.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ] | 1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ] | ||||||
| 1   0  aquarium     1  [5, ] | 1   0  aquarium     1  [5, ] | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/delete_documents.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] | 1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] | ||||||
| 2                [21, ] | 2                [21, ] | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/delete_documents.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 1  1                36               [3, ] | 1  1                36               [3, ] | ||||||
| 1  1                37               [4, ] | 1  1                37               [4, ] | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/delete_documents.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 3   0  48.9021 1  [19, ] | 3   0  48.9021 1  [19, ] | ||||||
| 3   0  49.9314 1  [17, ] | 3   0  49.9314 1  [17, ] | ||||||
| @@ -0,0 +1,4 @@ | |||||||
|  | --- | ||||||
|  | source: milli/src/update/index_documents/mod.rs | ||||||
|  | --- | ||||||
|  |  | ||||||
| @@ -1,60 +1,56 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/index_documents/mod.rs | source: milli/src/update/index_documents/mod.rs | ||||||
| --- | --- | ||||||
| 0                [1, 7, ] | 0                [1, ] | ||||||
| 1                [2, ] | 1                [2, ] | ||||||
| 10               [1, 7, ] | 10               [1, ] | ||||||
| 12               [0, 8, ] | 12               [0, ] | ||||||
| 1344             [3, ] | 1344             [3, ] | ||||||
| 1813             [8, ] | 1813             [0, ] | ||||||
| 2                [0, 8, ] | 2                [0, ] | ||||||
| 23               [5, ] | 23               [5, ] | ||||||
| 25               [2, ] | 25               [2, ] | ||||||
| 3                [0, 8, ] | 3                [0, ] | ||||||
| 35               [5, ] | 35               [5, ] | ||||||
| 4                [4, 6, ] | 4                [4, ] | ||||||
| 42               [0, 5, 8, ] | 42               [0, 5, ] | ||||||
| 456              [1, 7, ] | 456              [1, ] | ||||||
| 5                [0, 8, ] | 5                [0, ] | ||||||
| 99               [2, ] | 99               [2, ] | ||||||
| adams            [5, ] | adams            [5, ] | ||||||
| adventure        [1, 7, ] | adventure        [1, ] | ||||||
| alice            [2, ] | alice            [2, ] | ||||||
| and              [0, 4, 6, 8, ] | and              [0, 4, ] | ||||||
| antoine          [1, 7, ] | antoine          [1, ] | ||||||
| austen           [8, ] | austen           [0, ] | ||||||
| austin           [0, ] | blood            [4, ] | ||||||
| blood            [4, 6, ] |  | ||||||
| carroll          [2, ] | carroll          [2, ] | ||||||
| de               [1, 7, ] | de               [1, ] | ||||||
| douglas          [5, ] | douglas          [5, ] | ||||||
| exupery          [1, 7, ] | exupery          [1, ] | ||||||
| fantasy          [2, 3, 4, 6, ] | fantasy          [2, 3, 4, ] | ||||||
| galaxy           [5, ] | galaxy           [5, ] | ||||||
| guide            [5, ] | guide            [5, ] | ||||||
| half             [4, 6, ] | half             [4, ] | ||||||
| harry            [4, 6, ] | harry            [4, ] | ||||||
| hitchhiker       [5, ] | hitchhiker       [5, ] | ||||||
| hobbit           [3, ] | hobbit           [3, ] | ||||||
| in               [2, ] | in               [2, ] | ||||||
| j                [3, 4, 6, 8, ] | j                [0, 3, 4, ] | ||||||
| jane             [0, ] | k                [4, ] | ||||||
| k                [4, 6, ] |  | ||||||
| le               [1, ] |  | ||||||
| lewis            [2, ] | lewis            [2, ] | ||||||
| little           [7, ] | little           [1, ] | ||||||
| petit            [1, ] | potter           [4, ] | ||||||
| potter           [4, 6, ] | prejudice        [0, ] | ||||||
| prejudice        [0, 8, ] | pride            [0, ] | ||||||
| pride            [0, 8, ] | prince           [1, ] | ||||||
| prince           [1, 4, 7, ] | princess         [4, ] | ||||||
| princess         [6, ] |  | ||||||
| r                [3, ] | r                [3, ] | ||||||
| romance          [0, 8, ] | romance          [0, ] | ||||||
| rowling          [4, 6, ] | rowling          [4, ] | ||||||
| s                [5, ] | s                [5, ] | ||||||
| saint            [1, 7, ] | saint            [1, ] | ||||||
| the              [3, 4, 5, 6, 7, ] | the              [1, 3, 4, 5, ] | ||||||
| to               [5, ] | to               [5, ] | ||||||
| tolkien          [3, ] | tolkien          [3, ] | ||||||
| wonderland       [2, ] | wonderland       [2, ] | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::hash_map::Entry; | use std::collections::btree_map::Entry as BEntry; | ||||||
|  | use std::collections::hash_map::Entry as HEntry; | ||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{Read, Seek}; | use std::io::{Read, Seek}; | ||||||
| @@ -7,18 +8,21 @@ use std::io::{Read, Seek}; | |||||||
| use fxhash::FxHashMap; | use fxhash::FxHashMap; | ||||||
| use heed::RoTxn; | use heed::RoTxn; | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use obkv::{KvReader, KvWriter}; | use obkv::{KvReader, KvReaderU16, KvWriter}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| use smartstring::SmartString; | use smartstring::SmartString; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn, |     create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, | ||||||
|  |     obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, | ||||||
| }; | }; | ||||||
| use super::{IndexDocumentsMethod, IndexerConfig}; | use super::{IndexDocumentsMethod, IndexerConfig}; | ||||||
| use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | ||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| use crate::index::{db_name, main_key}; | use crate::index::{db_name, main_key}; | ||||||
|  | use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; | ||||||
|  | use crate::update::index_documents::GrenadParameters; | ||||||
| use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; | use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; | ||||||
| use crate::{ | use crate::{ | ||||||
|     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, |     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, | ||||||
| @@ -28,9 +32,6 @@ pub struct TransformOutput { | |||||||
|     pub primary_key: String, |     pub primary_key: String, | ||||||
|     pub fields_ids_map: FieldsIdsMap, |     pub fields_ids_map: FieldsIdsMap, | ||||||
|     pub field_distribution: FieldDistribution, |     pub field_distribution: FieldDistribution, | ||||||
|     pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>, |  | ||||||
|     pub new_documents_ids: RoaringBitmap, |  | ||||||
|     pub replaced_documents_ids: RoaringBitmap, |  | ||||||
|     pub documents_count: usize, |     pub documents_count: usize, | ||||||
|     pub original_documents: File, |     pub original_documents: File, | ||||||
|     pub flattened_documents: File, |     pub flattened_documents: File, | ||||||
| @@ -106,8 +107,8 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         // We must choose the appropriate merge function for when two or more documents |         // We must choose the appropriate merge function for when two or more documents | ||||||
|         // with the same user id must be merged or fully replaced in the same batch. |         // with the same user id must be merged or fully replaced in the same batch. | ||||||
|         let merge_function = match index_documents_method { |         let merge_function = match index_documents_method { | ||||||
|             IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, |             IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, | ||||||
|             IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations, |             IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         // We initialize the sorter with the user indexing settings. |         // We initialize the sorter with the user indexing settings. | ||||||
| @@ -130,17 +131,13 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             indexer_settings.max_memory.map(|mem| mem / 2), |             indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|         ); |         ); | ||||||
|         let documents_ids = index.documents_ids(wtxn)?; |         let documents_ids = index.documents_ids(wtxn)?; | ||||||
|         let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; |  | ||||||
|  |  | ||||||
|         Ok(Transform { |         Ok(Transform { | ||||||
|             index, |             index, | ||||||
|             fields_ids_map: index.fields_ids_map(wtxn)?, |             fields_ids_map: index.fields_ids_map(wtxn)?, | ||||||
|             indexer_settings, |             indexer_settings, | ||||||
|             autogenerate_docids, |             autogenerate_docids, | ||||||
|             available_documents_ids: AvailableDocumentsIds::from_documents_ids( |             available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), | ||||||
|                 &documents_ids, |  | ||||||
|                 &soft_deleted_documents_ids, |  | ||||||
|             ), |  | ||||||
|             original_sorter, |             original_sorter, | ||||||
|             flattened_sorter, |             flattened_sorter, | ||||||
|             index_documents_method, |             index_documents_method, | ||||||
| @@ -151,6 +148,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[logging_timer::time] | ||||||
|     pub fn read_documents<R, FP, FA>( |     pub fn read_documents<R, FP, FA>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         reader: EnrichedDocumentsBatchReader<R>, |         reader: EnrichedDocumentsBatchReader<R>, | ||||||
| @@ -163,8 +161,10 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         FP: Fn(UpdateIndexingStep) + Sync, |         FP: Fn(UpdateIndexingStep) + Sync, | ||||||
|         FA: Fn() -> bool + Sync, |         FA: Fn() -> bool + Sync, | ||||||
|     { |     { | ||||||
|  |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); |         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); | ||||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(); | ||||||
|         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; |         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; | ||||||
|  |  | ||||||
|         let primary_key = cursor.primary_key().to_string(); |         let primary_key = cursor.primary_key().to_string(); | ||||||
| @@ -172,7 +172,8 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; |             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|  |  | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut obkv_buffer = Vec::new(); | ||||||
|         let mut document_sorter_buffer = Vec::new(); |         let mut document_sorter_value_buffer = Vec::new(); | ||||||
|  |         let mut document_sorter_key_buffer = Vec::new(); | ||||||
|         let mut documents_count = 0; |         let mut documents_count = 0; | ||||||
|         let mut docid_buffer: Vec<u8> = Vec::new(); |         let mut docid_buffer: Vec<u8> = Vec::new(); | ||||||
|         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); |         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); | ||||||
| @@ -213,29 +214,30 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); |             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); | ||||||
|  |  | ||||||
|             // Build the new obkv document. |             // Build the new obkv document. | ||||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); |             let mut writer = KvWriter::new(&mut obkv_buffer); | ||||||
|             for (k, v) in field_buffer_cache.iter() { |             for (k, v) in field_buffer_cache.iter() { | ||||||
|                 writer.insert(*k, v)?; |                 writer.insert(*k, v)?; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let mut original_docid = None; |             let mut original_docid = None; | ||||||
|  |  | ||||||
|             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { |             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { | ||||||
|                 Entry::Occupied(entry) => *entry.get() as u32, |                 HEntry::Occupied(entry) => *entry.get() as u32, | ||||||
|                 Entry::Vacant(entry) => { |                 HEntry::Vacant(entry) => { | ||||||
|                     // If the document was already in the db we mark it as a replaced document. |                     let docid = match external_documents_ids.get(wtxn, entry.key())? { | ||||||
|                     // It'll be deleted later. |                         Some(docid) => { | ||||||
|                     if let Some(docid) = external_documents_ids.get(entry.key()) { |                             // If it was already in the list of replaced documents it means it was deleted | ||||||
|                         // If it was already in the list of replaced documents it means it was deleted |                             // by the remove_document method. We should start as if it never existed. | ||||||
|                         // by the remove_document method. We should start as if it never existed. |                             if self.replaced_documents_ids.insert(docid) { | ||||||
|                         if self.replaced_documents_ids.insert(docid) { |                                 original_docid = Some(docid); | ||||||
|                             original_docid = Some(docid); |                             } | ||||||
|  |  | ||||||
|  |                             docid | ||||||
|                         } |                         } | ||||||
|                     } |                         None => self | ||||||
|                     let docid = self |                             .available_documents_ids | ||||||
|                         .available_documents_ids |                             .next() | ||||||
|                         .next() |                             .ok_or(UserError::DocumentLimitReached)?, | ||||||
|                         .ok_or(UserError::DocumentLimitReached)?; |                     }; | ||||||
|                     entry.insert(docid as u64); |                     entry.insert(docid as u64); | ||||||
|                     docid |                     docid | ||||||
|                 } |                 } | ||||||
| @@ -263,47 +265,68 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                     skip_insertion = true; |                     skip_insertion = true; | ||||||
|                 } else { |                 } else { | ||||||
|                     // we associate the base document with the new key, everything will get merged later. |                     // we associate the base document with the new key, everything will get merged later. | ||||||
|                     document_sorter_buffer.clear(); |                     let deladd_operation = match self.index_documents_method { | ||||||
|                     document_sorter_buffer.push(Operation::Addition as u8); |                         IndexDocumentsMethod::UpdateDocuments => { | ||||||
|                     document_sorter_buffer.extend_from_slice(base_obkv); |                             DelAddOperation::DeletionAndAddition | ||||||
|                     self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; |  | ||||||
|                     match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { |  | ||||||
|                         Some(flattened_obkv) => { |  | ||||||
|                             // we recreate our buffer with the flattened documents |  | ||||||
|                             document_sorter_buffer.clear(); |  | ||||||
|                             document_sorter_buffer.push(Operation::Addition as u8); |  | ||||||
|                             document_sorter_buffer.extend_from_slice(&flattened_obkv); |  | ||||||
|                             self.flattened_sorter |  | ||||||
|                                 .insert(docid.to_be_bytes(), &document_sorter_buffer)? |  | ||||||
|                         } |                         } | ||||||
|                         None => self |                         IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion, | ||||||
|                             .flattened_sorter |                     }; | ||||||
|                             .insert(docid.to_be_bytes(), &document_sorter_buffer)?, |                     document_sorter_key_buffer.clear(); | ||||||
|  |                     document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||||
|  |                     document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||||
|  |                     document_sorter_value_buffer.clear(); | ||||||
|  |                     document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
|  |                     into_del_add_obkv( | ||||||
|  |                         KvReaderU16::new(base_obkv), | ||||||
|  |                         deladd_operation, | ||||||
|  |                         &mut document_sorter_value_buffer, | ||||||
|  |                     )?; | ||||||
|  |                     self.original_sorter | ||||||
|  |                         .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |                     let base_obkv = KvReader::new(base_obkv); | ||||||
|  |                     if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? { | ||||||
|  |                         // we recreate our buffer with the flattened documents | ||||||
|  |                         document_sorter_value_buffer.clear(); | ||||||
|  |                         document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
|  |                         into_del_add_obkv( | ||||||
|  |                             KvReaderU16::new(&flattened_obkv), | ||||||
|  |                             deladd_operation, | ||||||
|  |                             &mut document_sorter_value_buffer, | ||||||
|  |                         )?; | ||||||
|                     } |                     } | ||||||
|  |                     self.flattened_sorter | ||||||
|  |                         .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             if !skip_insertion { |             if !skip_insertion { | ||||||
|                 self.new_documents_ids.insert(docid); |                 self.new_documents_ids.insert(docid); | ||||||
|  |  | ||||||
|                 document_sorter_buffer.clear(); |                 document_sorter_key_buffer.clear(); | ||||||
|                 document_sorter_buffer.push(Operation::Addition as u8); |                 document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||||
|                 document_sorter_buffer.extend_from_slice(&obkv_buffer); |                 document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||||
|  |                 document_sorter_value_buffer.clear(); | ||||||
|  |                 document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
|  |                 into_del_add_obkv( | ||||||
|  |                     KvReaderU16::new(&obkv_buffer), | ||||||
|  |                     DelAddOperation::Addition, | ||||||
|  |                     &mut document_sorter_value_buffer, | ||||||
|  |                 )?; | ||||||
|                 // We use the extracted/generated user id as the key for this document. |                 // We use the extracted/generated user id as the key for this document. | ||||||
|                 self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; |                 self.original_sorter | ||||||
|  |                     .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |  | ||||||
|                 match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { |                 let flattened_obkv = KvReader::new(&obkv_buffer); | ||||||
|                     Some(flattened_obkv) => { |                 if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { | ||||||
|                         document_sorter_buffer.clear(); |                     document_sorter_value_buffer.clear(); | ||||||
|                         document_sorter_buffer.push(Operation::Addition as u8); |                     document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
|                         document_sorter_buffer.extend_from_slice(&flattened_obkv); |                     into_del_add_obkv( | ||||||
|                         self.flattened_sorter |                         KvReaderU16::new(&obkv), | ||||||
|                             .insert(docid.to_be_bytes(), &document_sorter_buffer)? |                         DelAddOperation::Addition, | ||||||
|                     } |                         &mut document_sorter_value_buffer, | ||||||
|                     None => self |                     )? | ||||||
|                         .flattened_sorter |  | ||||||
|                         .insert(docid.to_be_bytes(), &document_sorter_buffer)?, |  | ||||||
|                 } |                 } | ||||||
|  |                 self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||||
|             } |             } | ||||||
|             documents_count += 1; |             documents_count += 1; | ||||||
|  |  | ||||||
| @@ -338,6 +361,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|     /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, |     /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, | ||||||
|     ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. |     ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. | ||||||
|     /// - If the document to remove was not present in either the db or the transform we do nothing. |     /// - If the document to remove was not present in either the db or the transform we do nothing. | ||||||
|  |     #[logging_timer::time] | ||||||
|     pub fn remove_documents<FA>( |     pub fn remove_documents<FA>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         mut to_remove: Vec<String>, |         mut to_remove: Vec<String>, | ||||||
| @@ -347,54 +371,176 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|     where |     where | ||||||
|         FA: Fn() -> bool + Sync, |         FA: Fn() -> bool + Sync, | ||||||
|     { |     { | ||||||
|  |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|         // there may be duplicates in the documents to remove. |         // there may be duplicates in the documents to remove. | ||||||
|         to_remove.sort_unstable(); |         to_remove.sort_unstable(); | ||||||
|         to_remove.dedup(); |         to_remove.dedup(); | ||||||
|  |  | ||||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(); | ||||||
|  |  | ||||||
|         let mut documents_deleted = 0; |         let mut documents_deleted = 0; | ||||||
|  |         let mut document_sorter_value_buffer = Vec::new(); | ||||||
|  |         let mut document_sorter_key_buffer = Vec::new(); | ||||||
|         for to_remove in to_remove { |         for to_remove in to_remove { | ||||||
|             if should_abort() { |             if should_abort() { | ||||||
|                 return Err(Error::InternalError(InternalError::AbortedIndexation)); |                 return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             match self.new_external_documents_ids_builder.entry((*to_remove).into()) { |             // Check if the document has been added in the current indexing process. | ||||||
|                 // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. |             let deleted_from_current = | ||||||
|                 Entry::Occupied(entry) => { |                 match self.new_external_documents_ids_builder.entry((*to_remove).into()) { | ||||||
|                     let doc_id = *entry.get() as u32; |                     // if the document was added in a previous iteration of the transform we make it as deleted in the sorters. | ||||||
|                     self.original_sorter |                     HEntry::Occupied(entry) => { | ||||||
|                         .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; |                         let docid = *entry.get() as u32; | ||||||
|                     self.flattened_sorter |                         // Key is the concatenation of the internal docid and the external one. | ||||||
|                         .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; |                         document_sorter_key_buffer.clear(); | ||||||
|  |                         document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||||
|  |                         document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); | ||||||
|  |                         document_sorter_value_buffer.clear(); | ||||||
|  |                         document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||||
|  |                         obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap(); | ||||||
|  |                         self.original_sorter | ||||||
|  |                             .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |                         self.flattened_sorter | ||||||
|  |                             .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||||
|  |  | ||||||
|                     // we must NOT update the list of replaced_documents_ids |                         // we must NOT update the list of replaced_documents_ids | ||||||
|                     // Either: |                         // Either: | ||||||
|                     // 1. It's already in it and there is nothing to do |                         // 1. It's already in it and there is nothing to do | ||||||
|                     // 2. It wasn't in it because the document was created by a previous batch and since |                         // 2. It wasn't in it because the document was created by a previous batch and since | ||||||
|                     //    we're removing it there is nothing to do. |                         //    we're removing it there is nothing to do. | ||||||
|                     self.new_documents_ids.remove(doc_id); |                         self.new_documents_ids.remove(docid); | ||||||
|                     entry.remove_entry(); |                         entry.remove_entry(); | ||||||
|                 } |                         true | ||||||
|                 Entry::Vacant(entry) => { |  | ||||||
|                     // If the document was already in the db we mark it as a `to_delete` document. |  | ||||||
|                     // It'll be deleted later. We don't need to push anything to the sorters. |  | ||||||
|                     if let Some(docid) = external_documents_ids.get(entry.key()) { |  | ||||||
|                         self.replaced_documents_ids.insert(docid); |  | ||||||
|                     } else { |  | ||||||
|                         // if the document is nowehere to be found, there is nothing to do and we must NOT |  | ||||||
|                         // increment the count of documents_deleted |  | ||||||
|                         continue; |  | ||||||
|                     } |                     } | ||||||
|  |                     HEntry::Vacant(_) => false, | ||||||
|  |                 }; | ||||||
|  |  | ||||||
|  |             // If the document was already in the db we mark it as a `to_delete` document. | ||||||
|  |             // Then we push the document in sorters in deletion mode. | ||||||
|  |             let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { | ||||||
|  |                 Some(docid) => { | ||||||
|  |                     self.remove_document_from_db( | ||||||
|  |                         docid, | ||||||
|  |                         to_remove, | ||||||
|  |                         wtxn, | ||||||
|  |                         &mut document_sorter_key_buffer, | ||||||
|  |                         &mut document_sorter_value_buffer, | ||||||
|  |                     )?; | ||||||
|  |                     true | ||||||
|                 } |                 } | ||||||
|  |                 None => false, | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|  |             // increase counter only if the document existed somewhere before. | ||||||
|  |             if deleted_from_current || deleted_from_db { | ||||||
|  |                 documents_deleted += 1; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(documents_deleted) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Removes documents from db using their internal document ids. | ||||||
|  |     /// | ||||||
|  |     /// # Warning | ||||||
|  |     /// | ||||||
|  |     /// This function is dangerous and will only work correctly if: | ||||||
|  |     /// | ||||||
|  |     /// - All the passed ids currently exist in the database | ||||||
|  |     /// - No batching using the standards `remove_documents` and `add_documents` took place | ||||||
|  |     /// | ||||||
|  |     /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. | ||||||
|  |     #[logging_timer::time] | ||||||
|  |     pub fn remove_documents_from_db_no_batch<FA>( | ||||||
|  |         &mut self, | ||||||
|  |         to_remove: &RoaringBitmap, | ||||||
|  |         wtxn: &mut heed::RwTxn, | ||||||
|  |         should_abort: FA, | ||||||
|  |     ) -> Result<usize> | ||||||
|  |     where | ||||||
|  |         FA: Fn() -> bool + Sync, | ||||||
|  |     { | ||||||
|  |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|  |         let mut documents_deleted = 0; | ||||||
|  |         let mut document_sorter_value_buffer = Vec::new(); | ||||||
|  |         let mut document_sorter_key_buffer = Vec::new(); | ||||||
|  |         let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; | ||||||
|  |  | ||||||
|  |         for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) { | ||||||
|  |             let external_docid = external_docid?; | ||||||
|  |             if should_abort() { | ||||||
|  |                 return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||||
|  |             } | ||||||
|  |             self.remove_document_from_db( | ||||||
|  |                 internal_docid, | ||||||
|  |                 external_docid, | ||||||
|  |                 wtxn, | ||||||
|  |                 &mut document_sorter_key_buffer, | ||||||
|  |                 &mut document_sorter_value_buffer, | ||||||
|  |             )?; | ||||||
|  |  | ||||||
|             documents_deleted += 1; |             documents_deleted += 1; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(documents_deleted) |         Ok(documents_deleted) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     fn remove_document_from_db( | ||||||
|  |         &mut self, | ||||||
|  |         internal_docid: u32, | ||||||
|  |         external_docid: String, | ||||||
|  |         txn: &heed::RoTxn, | ||||||
|  |         document_sorter_key_buffer: &mut Vec<u8>, | ||||||
|  |         document_sorter_value_buffer: &mut Vec<u8>, | ||||||
|  |     ) -> Result<()> { | ||||||
|  |         self.replaced_documents_ids.insert(internal_docid); | ||||||
|  |  | ||||||
|  |         // fetch the obkv document | ||||||
|  |         let original_key = BEU32::new(internal_docid); | ||||||
|  |         let base_obkv = self | ||||||
|  |             .index | ||||||
|  |             .documents | ||||||
|  |             .remap_data_type::<heed::types::ByteSlice>() | ||||||
|  |             .get(txn, &original_key)? | ||||||
|  |             .ok_or(InternalError::DatabaseMissingEntry { | ||||||
|  |                 db_name: db_name::DOCUMENTS, | ||||||
|  |                 key: None, | ||||||
|  |             })?; | ||||||
|  |  | ||||||
|  |         // Key is the concatenation of the internal docid and the external one. | ||||||
|  |         document_sorter_key_buffer.clear(); | ||||||
|  |         document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes()); | ||||||
|  |         document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); | ||||||
|  |         // push it as to delete in the original_sorter | ||||||
|  |         document_sorter_value_buffer.clear(); | ||||||
|  |         document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||||
|  |         into_del_add_obkv( | ||||||
|  |             KvReaderU16::new(base_obkv), | ||||||
|  |             DelAddOperation::Deletion, | ||||||
|  |             document_sorter_value_buffer, | ||||||
|  |         )?; | ||||||
|  |         self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |  | ||||||
|  |         // flatten it and push it as to delete in the flattened_sorter | ||||||
|  |         let flattened_obkv = KvReader::new(base_obkv); | ||||||
|  |         if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { | ||||||
|  |             // we recreate our buffer with the flattened documents | ||||||
|  |             document_sorter_value_buffer.clear(); | ||||||
|  |             document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||||
|  |             into_del_add_obkv( | ||||||
|  |                 KvReaderU16::new(&obkv), | ||||||
|  |                 DelAddOperation::Deletion, | ||||||
|  |                 document_sorter_value_buffer, | ||||||
|  |             )?; | ||||||
|  |         } | ||||||
|  |         self.flattened_sorter | ||||||
|  |             .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // Flatten a document from the fields ids map contained in self and insert the new |     // Flatten a document from the fields ids map contained in self and insert the new | ||||||
|     // created fields. Returns `None` if the document doesn't need to be flattened. |     // created fields. Returns `None` if the document doesn't need to be flattened. | ||||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { |     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { | ||||||
| @@ -514,42 +660,10 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn remove_deleted_documents_from_field_distribution( |  | ||||||
|         &self, |  | ||||||
|         rtxn: &RoTxn, |  | ||||||
|         field_distribution: &mut FieldDistribution, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         for deleted_docid in self.replaced_documents_ids.iter() { |  | ||||||
|             let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( |  | ||||||
|                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, |  | ||||||
|             )?; |  | ||||||
|  |  | ||||||
|             for (key, _) in obkv.iter() { |  | ||||||
|                 let name = |  | ||||||
|                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { |  | ||||||
|                         field_id: key, |  | ||||||
|                         process: "Computing field distribution in transform.", |  | ||||||
|                     })?; |  | ||||||
|                 // We checked that the document was in the db earlier. If we can't find it it means |  | ||||||
|                 // there is an inconsistency between the field distribution and the field id map. |  | ||||||
|                 let field = |  | ||||||
|                     field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { |  | ||||||
|                         field_id: key, |  | ||||||
|                         process: "Accessing field distribution in transform.", |  | ||||||
|                     })?; |  | ||||||
|                 *field -= 1; |  | ||||||
|                 if *field == 0 { |  | ||||||
|                     // since we were able to get the field right before it's safe to unwrap here |  | ||||||
|                     field_distribution.remove(name).unwrap(); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Generate the `TransformOutput` based on the given sorter that can be generated from any |     /// Generate the `TransformOutput` based on the given sorter that can be generated from any | ||||||
|     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document |     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document | ||||||
|     /// id for the user side and the value must be an obkv where keys are valid fields ids. |     /// id for the user side and the value must be an obkv where keys are valid fields ids. | ||||||
|  |     #[logging_timer::time] | ||||||
|     pub(crate) fn output_from_sorter<F>( |     pub(crate) fn output_from_sorter<F>( | ||||||
|         self, |         self, | ||||||
|         wtxn: &mut heed::RwTxn, |         wtxn: &mut heed::RwTxn, | ||||||
| @@ -581,17 +695,13 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         // 2. Add all the new documents to the field distribution |         // 2. Add all the new documents to the field distribution | ||||||
|         let mut field_distribution = self.index.field_distribution(wtxn)?; |         let mut field_distribution = self.index.field_distribution(wtxn)?; | ||||||
|  |  | ||||||
|         self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; |  | ||||||
|  |  | ||||||
|         // Here we are going to do the document count + field distribution + `write_into_stream_writer` |         // Here we are going to do the document count + field distribution + `write_into_stream_writer` | ||||||
|         let mut iter = self.original_sorter.into_stream_merger_iter()?; |         let mut iter = self.original_sorter.into_stream_merger_iter()?; | ||||||
|         // used only for the callback |         // used only for the callback | ||||||
|         let mut documents_count = 0; |         let mut documents_count = 0; | ||||||
|  |  | ||||||
|         while let Some((key, val)) = iter.next()? { |         while let Some((key, val)) = iter.next()? { | ||||||
|             if val[0] == Operation::Deletion as u8 { |             // skip first byte corresponding to the operation type (Deletion or Addition). | ||||||
|                 continue; |  | ||||||
|             } |  | ||||||
|             let val = &val[1..]; |             let val = &val[1..]; | ||||||
|  |  | ||||||
|             // send a callback to show at which step we are |             // send a callback to show at which step we are | ||||||
| @@ -601,16 +711,51 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                 total_documents: self.documents_count, |                 total_documents: self.documents_count, | ||||||
|             }); |             }); | ||||||
|  |  | ||||||
|             // We increment all the field of the current document in the field distribution. |             for (key, value) in KvReader::new(val) { | ||||||
|             let obkv = KvReader::new(val); |                 let reader = KvReaderDelAdd::new(value); | ||||||
|  |                 match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||||
|             for (key, _) in obkv.iter() { |                     (None, None) => {} | ||||||
|                 let name = |                     (None, Some(_)) => { | ||||||
|                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { |                         // New field | ||||||
|                         field_id: key, |                         let name = self.fields_ids_map.name(key).ok_or( | ||||||
|                         process: "Computing field distribution in transform.", |                             FieldIdMapMissingEntry::FieldId { | ||||||
|                     })?; |                                 field_id: key, | ||||||
|                 *field_distribution.entry(name.to_string()).or_insert(0) += 1; |                                 process: "Computing field distribution in transform.", | ||||||
|  |                             }, | ||||||
|  |                         )?; | ||||||
|  |                         *field_distribution.entry(name.to_string()).or_insert(0) += 1; | ||||||
|  |                     } | ||||||
|  |                     (Some(_), None) => { | ||||||
|  |                         // Field removed | ||||||
|  |                         let name = self.fields_ids_map.name(key).ok_or( | ||||||
|  |                             FieldIdMapMissingEntry::FieldId { | ||||||
|  |                                 field_id: key, | ||||||
|  |                                 process: "Computing field distribution in transform.", | ||||||
|  |                             }, | ||||||
|  |                         )?; | ||||||
|  |                         match field_distribution.entry(name.to_string()) { | ||||||
|  |                             BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */ | ||||||
|  |                             } | ||||||
|  |                             BEntry::Occupied(mut entry) => { | ||||||
|  |                                 // attempt to remove one | ||||||
|  |                                 match entry.get_mut().checked_sub(1) { | ||||||
|  |                                     Some(0) => { | ||||||
|  |                                         entry.remove(); | ||||||
|  |                                     } | ||||||
|  |                                     Some(new_val) => { | ||||||
|  |                                         *entry.get_mut() = new_val; | ||||||
|  |                                     } | ||||||
|  |                                     None => { | ||||||
|  |                                         unreachable!("Attempting to remove a field that wasn't in the field distribution") | ||||||
|  |                                     } | ||||||
|  |                                 } | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|  |                     (Some(_), Some(_)) => { | ||||||
|  |                         // Value change, no field distribution change | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|             writer.insert(key, val)?; |             writer.insert(key, val)?; | ||||||
|         } |         } | ||||||
| @@ -631,9 +776,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         // We get rids of the `Operation` byte and skip the deleted documents as well. |         // We get rids of the `Operation` byte and skip the deleted documents as well. | ||||||
|         let mut iter = self.flattened_sorter.into_stream_merger_iter()?; |         let mut iter = self.flattened_sorter.into_stream_merger_iter()?; | ||||||
|         while let Some((key, val)) = iter.next()? { |         while let Some((key, val)) = iter.next()? { | ||||||
|             if val[0] == Operation::Deletion as u8 { |             // skip first byte corresponding to the operation type (Deletion or Addition). | ||||||
|                 continue; |  | ||||||
|             } |  | ||||||
|             let val = &val[1..]; |             let val = &val[1..]; | ||||||
|             writer.insert(key, val)?; |             writer.insert(key, val)?; | ||||||
|         } |         } | ||||||
| @@ -649,15 +792,11 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { |         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { | ||||||
|             fst_new_external_documents_ids_builder.insert(key, value) |             fst_new_external_documents_ids_builder.insert(key, value) | ||||||
|         })?; |         })?; | ||||||
|         let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); |  | ||||||
|  |  | ||||||
|         Ok(TransformOutput { |         Ok(TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map: self.fields_ids_map, |             fields_ids_map: self.fields_ids_map, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), |  | ||||||
|             new_documents_ids: self.new_documents_ids, |  | ||||||
|             replaced_documents_ids: self.replaced_documents_ids, |  | ||||||
|             documents_count: self.documents_count, |             documents_count: self.documents_count, | ||||||
|             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, |             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, | ||||||
|             flattened_documents: flattened_documents |             flattened_documents: flattened_documents | ||||||
| @@ -687,37 +826,41 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             .to_string(); |             .to_string(); | ||||||
|         let field_distribution = self.index.field_distribution(wtxn)?; |         let field_distribution = self.index.field_distribution(wtxn)?; | ||||||
|  |  | ||||||
|         // Delete the soft deleted document ids from the maps inside the external_document_ids structure |  | ||||||
|         let new_external_documents_ids = { |  | ||||||
|             let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; |  | ||||||
|             external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; |  | ||||||
|             // This call should be free and can't fail since the previous method merged both fsts. |  | ||||||
|             external_documents_ids.into_static().to_fst()?.into_owned() |  | ||||||
|         }; |  | ||||||
|  |  | ||||||
|         let documents_ids = self.index.documents_ids(wtxn)?; |         let documents_ids = self.index.documents_ids(wtxn)?; | ||||||
|         let documents_count = documents_ids.len() as usize; |         let documents_count = documents_ids.len() as usize; | ||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We initialize the sorter with the user indexing settings. | ||||||
|         let mut original_writer = create_writer( |         let mut original_sorter = create_sorter( | ||||||
|  |             grenad::SortAlgorithm::Stable, | ||||||
|  |             keep_first, | ||||||
|             self.indexer_settings.chunk_compression_type, |             self.indexer_settings.chunk_compression_type, | ||||||
|             self.indexer_settings.chunk_compression_level, |             self.indexer_settings.chunk_compression_level, | ||||||
|             tempfile::tempfile()?, |             self.indexer_settings.max_nb_chunks, | ||||||
|  |             self.indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         // We create a final writer to write the new documents in order from the sorter. |         // We initialize the sorter with the user indexing settings. | ||||||
|         let mut flattened_writer = create_writer( |         let mut flattened_sorter = create_sorter( | ||||||
|  |             grenad::SortAlgorithm::Stable, | ||||||
|  |             keep_first, | ||||||
|             self.indexer_settings.chunk_compression_type, |             self.indexer_settings.chunk_compression_type, | ||||||
|             self.indexer_settings.chunk_compression_level, |             self.indexer_settings.chunk_compression_level, | ||||||
|             tempfile::tempfile()?, |             self.indexer_settings.max_nb_chunks, | ||||||
|  |             self.indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut obkv_buffer = Vec::new(); | ||||||
|         for result in self.index.all_documents(wtxn)? { |         let mut document_sorter_key_buffer = Vec::new(); | ||||||
|             let (docid, obkv) = result?; |         let mut document_sorter_value_buffer = Vec::new(); | ||||||
|  |         for result in self.index.external_documents_ids().iter(wtxn)? { | ||||||
|  |             let (external_id, docid) = result?; | ||||||
|  |             let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( | ||||||
|  |                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||||
|  |             )?; | ||||||
|  |             let docid = docid.get(); | ||||||
|  |  | ||||||
|             obkv_buffer.clear(); |             obkv_buffer.clear(); | ||||||
|             let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); |             let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||||
|  |  | ||||||
|             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. |             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. | ||||||
|             for (id, name) in new_fields_ids_map.iter() { |             for (id, name) in new_fields_ids_map.iter() { | ||||||
| @@ -727,7 +870,17 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             let buffer = obkv_writer.into_inner()?; |             let buffer = obkv_writer.into_inner()?; | ||||||
|             original_writer.insert(docid.to_be_bytes(), &buffer)?; |  | ||||||
|  |             document_sorter_key_buffer.clear(); | ||||||
|  |             document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||||
|  |             document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||||
|  |             document_sorter_value_buffer.clear(); | ||||||
|  |             into_del_add_obkv( | ||||||
|  |                 KvReaderU16::new(buffer), | ||||||
|  |                 DelAddOperation::Addition, | ||||||
|  |                 &mut document_sorter_value_buffer, | ||||||
|  |             )?; | ||||||
|  |             original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |  | ||||||
|             // Once we have the document, we're going to flatten it |             // Once we have the document, we're going to flatten it | ||||||
|             // and insert it into the flattened sorter. |             // and insert it into the flattened sorter. | ||||||
| @@ -762,29 +915,34 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; |                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||||
|                 writer.insert(fid, &value)?; |                 writer.insert(fid, &value)?; | ||||||
|             } |             } | ||||||
|             flattened_writer.insert(docid.to_be_bytes(), &buffer)?; |             document_sorter_value_buffer.clear(); | ||||||
|  |             into_del_add_obkv( | ||||||
|  |                 KvReaderU16::new(&buffer), | ||||||
|  |                 DelAddOperation::Addition, | ||||||
|  |                 &mut document_sorter_value_buffer, | ||||||
|  |             )?; | ||||||
|  |             flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // Once we have written all the documents, we extract |         let grenad_params = GrenadParameters { | ||||||
|         // the file and reset the seek to be able to read it again. |             chunk_compression_type: self.indexer_settings.chunk_compression_type, | ||||||
|         let mut original_documents = original_writer.into_inner()?; |             chunk_compression_level: self.indexer_settings.chunk_compression_level, | ||||||
|         original_documents.rewind()?; |             max_memory: self.indexer_settings.max_memory, | ||||||
|  |             max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen. | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         let mut flattened_documents = flattened_writer.into_inner()?; |         // Once we have written all the documents, we merge everything into a Reader. | ||||||
|         flattened_documents.rewind()?; |         let original_documents = sorter_into_reader(original_sorter, grenad_params)?; | ||||||
|  |  | ||||||
|  |         let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; | ||||||
|  |  | ||||||
|         let output = TransformOutput { |         let output = TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map: new_fields_ids_map, |             fields_ids_map: new_fields_ids_map, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             new_external_documents_ids, |  | ||||||
|             new_documents_ids: documents_ids, |  | ||||||
|             replaced_documents_ids: RoaringBitmap::default(), |  | ||||||
|             documents_count, |             documents_count, | ||||||
|             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, |             original_documents: original_documents.into_inner().into_inner(), | ||||||
|             flattened_documents: flattened_documents |             flattened_documents: flattened_documents.into_inner().into_inner(), | ||||||
|                 .into_inner() |  | ||||||
|                 .map_err(|err| err.into_error())?, |  | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let new_facets = output.compute_real_facets(wtxn, self.index)?; |         let new_facets = output.compute_real_facets(wtxn, self.index)?; | ||||||
| @@ -828,38 +986,111 @@ mod test { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn merge_obkvs() { |     fn merge_obkvs() { | ||||||
|         let mut doc_0 = Vec::new(); |         let mut additive_doc_0 = Vec::new(); | ||||||
|         let mut kv_writer = KvWriter::new(&mut doc_0); |         let mut deletive_doc_0 = Vec::new(); | ||||||
|  |         let mut del_add_doc_0 = Vec::new(); | ||||||
|  |         let mut kv_writer = KvWriter::memory(); | ||||||
|         kv_writer.insert(0_u8, [0]).unwrap(); |         kv_writer.insert(0_u8, [0]).unwrap(); | ||||||
|         kv_writer.finish().unwrap(); |         let buffer = kv_writer.into_inner().unwrap(); | ||||||
|         doc_0.insert(0, Operation::Addition as u8); |         into_del_add_obkv( | ||||||
|  |             KvReaderU16::new(&buffer), | ||||||
|         let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap(); |             DelAddOperation::Addition, | ||||||
|         assert_eq!(*ret, doc_0); |             &mut additive_doc_0, | ||||||
|  |  | ||||||
|         let ret = merge_obkvs_and_operations( |  | ||||||
|             &[], |  | ||||||
|             &[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())], |  | ||||||
|         ) |         ) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|         assert_eq!(*ret, doc_0); |         additive_doc_0.insert(0, Operation::Addition as u8); | ||||||
|  |         into_del_add_obkv( | ||||||
|         let ret = merge_obkvs_and_operations( |             KvReaderU16::new(&buffer), | ||||||
|             &[], |             DelAddOperation::Deletion, | ||||||
|             &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())], |             &mut deletive_doc_0, | ||||||
|         ) |         ) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|         assert_eq!(*ret, [Operation::Deletion as u8]); |         deletive_doc_0.insert(0, Operation::Deletion as u8); | ||||||
|  |         into_del_add_obkv( | ||||||
|  |             KvReaderU16::new(&buffer), | ||||||
|  |             DelAddOperation::DeletionAndAddition, | ||||||
|  |             &mut del_add_doc_0, | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         del_add_doc_0.insert(0, Operation::Addition as u8); | ||||||
|  |  | ||||||
|         let ret = merge_obkvs_and_operations( |         let mut additive_doc_1 = Vec::new(); | ||||||
|  |         let mut kv_writer = KvWriter::memory(); | ||||||
|  |         kv_writer.insert(1_u8, [1]).unwrap(); | ||||||
|  |         let buffer = kv_writer.into_inner().unwrap(); | ||||||
|  |         into_del_add_obkv( | ||||||
|  |             KvReaderU16::new(&buffer), | ||||||
|  |             DelAddOperation::Addition, | ||||||
|  |             &mut additive_doc_1, | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         additive_doc_1.insert(0, Operation::Addition as u8); | ||||||
|  |  | ||||||
|  |         let mut additive_doc_0_1 = Vec::new(); | ||||||
|  |         let mut kv_writer = KvWriter::memory(); | ||||||
|  |         kv_writer.insert(0_u8, [0]).unwrap(); | ||||||
|  |         kv_writer.insert(1_u8, [1]).unwrap(); | ||||||
|  |         let buffer = kv_writer.into_inner().unwrap(); | ||||||
|  |         into_del_add_obkv( | ||||||
|  |             KvReaderU16::new(&buffer), | ||||||
|  |             DelAddOperation::Addition, | ||||||
|  |             &mut additive_doc_0_1, | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         additive_doc_0_1.insert(0, Operation::Addition as u8); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) | ||||||
|  |             .unwrap(); | ||||||
|  |         assert_eq!(*ret, additive_doc_0); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_merge_additions_and_deletions( | ||||||
|  |             &[], | ||||||
|  |             &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         assert_eq!(*ret, del_add_doc_0); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_merge_additions_and_deletions( | ||||||
|  |             &[], | ||||||
|  |             &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         assert_eq!(*ret, deletive_doc_0); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_merge_additions_and_deletions( | ||||||
|             &[], |             &[], | ||||||
|             &[ |             &[ | ||||||
|                 Cow::from([Operation::Addition as u8, 1].as_slice()), |                 Cow::from(additive_doc_1.as_slice()), | ||||||
|                 Cow::from([Operation::Deletion as u8].as_slice()), |                 Cow::from(deletive_doc_0.as_slice()), | ||||||
|                 Cow::from(doc_0.as_slice()), |                 Cow::from(additive_doc_0.as_slice()), | ||||||
|             ], |             ], | ||||||
|         ) |         ) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|         assert_eq!(*ret, doc_0); |         assert_eq!(*ret, del_add_doc_0); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_merge_additions_and_deletions( | ||||||
|  |             &[], | ||||||
|  |             &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         assert_eq!(*ret, additive_doc_0_1); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_keep_last_addition_merge_deletions( | ||||||
|  |             &[], | ||||||
|  |             &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         assert_eq!(*ret, additive_doc_0); | ||||||
|  |  | ||||||
|  |         let ret = obkvs_keep_last_addition_merge_deletions( | ||||||
|  |             &[], | ||||||
|  |             &[ | ||||||
|  |                 Cow::from(deletive_doc_0.as_slice()), | ||||||
|  |                 Cow::from(additive_doc_1.as_slice()), | ||||||
|  |                 Cow::from(additive_doc_0.as_slice()), | ||||||
|  |             ], | ||||||
|  |         ) | ||||||
|  |         .unwrap(); | ||||||
|  |         assert_eq!(*ret, del_add_doc_0); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,5 +1,4 @@ | |||||||
| use std::borrow::Cow; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::collections::HashMap; |  | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
| @@ -9,32 +8,40 @@ use charabia::{Language, Script}; | |||||||
| use grenad::MergerBuilder; | use grenad::MergerBuilder; | ||||||
| use heed::types::ByteSlice; | use heed::types::ByteSlice; | ||||||
| use heed::RwTxn; | use heed::RwTxn; | ||||||
|  | use log::error; | ||||||
|  | use obkv::{KvReader, KvWriter}; | ||||||
|  | use ordered_float::OrderedFloat; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use super::helpers::{ | use super::helpers::{ | ||||||
|     self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, |     self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, | ||||||
|  |     valid_lmdb_key, CursorClonableMmap, | ||||||
| }; | }; | ||||||
| use super::{ClonableMmap, MergeFn}; | use super::{ClonableMmap, MergeFn}; | ||||||
| use crate::distance::NDotProductPoint; | use crate::distance::NDotProductPoint; | ||||||
| use crate::error::UserError; | use crate::error::UserError; | ||||||
|  | use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; | ||||||
| use crate::facet::FacetType; | use crate::facet::FacetType; | ||||||
|  | use crate::index::db_name::DOCUMENTS; | ||||||
| use crate::index::Hnsw; | use crate::index::Hnsw; | ||||||
|  | use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; | ||||||
| use crate::update::facet::FacetsUpdate; | use crate::update::facet::FacetsUpdate; | ||||||
| use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; | use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; | ||||||
| use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; | use crate::{ | ||||||
|  |     lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32, | ||||||
|  | }; | ||||||
|  |  | ||||||
| pub(crate) enum TypedChunk { | pub(crate) enum TypedChunk { | ||||||
|     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), |     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), | ||||||
|     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>), |     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>), | ||||||
|     Documents(grenad::Reader<CursorClonableMmap>), |     Documents(grenad::Reader<CursorClonableMmap>), | ||||||
|     FieldIdWordcountDocids(grenad::Reader<BufReader<File>>), |     FieldIdWordCountDocids(grenad::Reader<BufReader<File>>), | ||||||
|     NewDocumentsIds(RoaringBitmap), |  | ||||||
|     WordDocids { |     WordDocids { | ||||||
|         word_docids_reader: grenad::Reader<BufReader<File>>, |         word_docids_reader: grenad::Reader<BufReader<File>>, | ||||||
|         exact_word_docids_reader: grenad::Reader<BufReader<File>>, |         exact_word_docids_reader: grenad::Reader<BufReader<File>>, | ||||||
|  |         word_fid_docids_reader: grenad::Reader<BufReader<File>>, | ||||||
|     }, |     }, | ||||||
|     WordPositionDocids(grenad::Reader<BufReader<File>>), |     WordPositionDocids(grenad::Reader<BufReader<File>>), | ||||||
|     WordFidDocids(grenad::Reader<BufReader<File>>), |  | ||||||
|     WordPairProximityDocids(grenad::Reader<BufReader<File>>), |     WordPairProximityDocids(grenad::Reader<BufReader<File>>), | ||||||
|     FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>), |     FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>), | ||||||
|     FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>), |     FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>), | ||||||
| @@ -43,7 +50,7 @@ pub(crate) enum TypedChunk { | |||||||
|     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>), |     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>), | ||||||
|     GeoPoints(grenad::Reader<BufReader<File>>), |     GeoPoints(grenad::Reader<BufReader<File>>), | ||||||
|     VectorPoints(grenad::Reader<BufReader<File>>), |     VectorPoints(grenad::Reader<BufReader<File>>), | ||||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), |     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), | ||||||
| } | } | ||||||
|  |  | ||||||
| impl TypedChunk { | impl TypedChunk { | ||||||
| @@ -58,23 +65,22 @@ impl TypedChunk { | |||||||
|             TypedChunk::Documents(grenad) => { |             TypedChunk::Documents(grenad) => { | ||||||
|                 format!("Documents {{ number_of_entries: {} }}", grenad.len()) |                 format!("Documents {{ number_of_entries: {} }}", grenad.len()) | ||||||
|             } |             } | ||||||
|             TypedChunk::FieldIdWordcountDocids(grenad) => { |             TypedChunk::FieldIdWordCountDocids(grenad) => { | ||||||
|                 format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) |                 format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) | ||||||
|             } |             } | ||||||
|             TypedChunk::NewDocumentsIds(grenad) => { |             TypedChunk::WordDocids { | ||||||
|                 format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) |                 word_docids_reader, | ||||||
|             } |                 exact_word_docids_reader, | ||||||
|             TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!( |                 word_fid_docids_reader, | ||||||
|                 "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}", |             } => format!( | ||||||
|  |                 "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", | ||||||
|                 word_docids_reader.len(), |                 word_docids_reader.len(), | ||||||
|                 exact_word_docids_reader.len() |                 exact_word_docids_reader.len(), | ||||||
|  |                 word_fid_docids_reader.len() | ||||||
|             ), |             ), | ||||||
|             TypedChunk::WordPositionDocids(grenad) => { |             TypedChunk::WordPositionDocids(grenad) => { | ||||||
|                 format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) |                 format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) | ||||||
|             } |             } | ||||||
|             TypedChunk::WordFidDocids(grenad) => { |  | ||||||
|                 format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) |  | ||||||
|             } |  | ||||||
|             TypedChunk::WordPairProximityDocids(grenad) => { |             TypedChunk::WordPairProximityDocids(grenad) => { | ||||||
|                 format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) |                 format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) | ||||||
|             } |             } | ||||||
| @@ -99,8 +105,8 @@ impl TypedChunk { | |||||||
|             TypedChunk::VectorPoints(grenad) => { |             TypedChunk::VectorPoints(grenad) => { | ||||||
|                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) |                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) | ||||||
|             } |             } | ||||||
|             TypedChunk::ScriptLanguageDocids(grenad) => { |             TypedChunk::ScriptLanguageDocids(sl_map) => { | ||||||
|                 format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len()) |                 format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -119,34 +125,75 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|     let mut is_merged_database = false; |     let mut is_merged_database = false; | ||||||
|     match typed_chunk { |     match typed_chunk { | ||||||
|         TypedChunk::Documents(obkv_documents_iter) => { |         TypedChunk::Documents(obkv_documents_iter) => { | ||||||
|  |             let mut operations: Vec<DocumentOperation> = Default::default(); | ||||||
|  |  | ||||||
|  |             let mut docids = index.documents_ids(wtxn)?; | ||||||
|             let mut cursor = obkv_documents_iter.into_cursor()?; |             let mut cursor = obkv_documents_iter.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, reader)) = cursor.move_on_next()? { | ||||||
|                 index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?; |                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); | ||||||
|  |                 let reader: KvReader<FieldId> = KvReader::new(reader); | ||||||
|  |  | ||||||
|  |                 let (document_id_bytes, external_id_bytes) = try_split_array_at(key) | ||||||
|  |                     .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; | ||||||
|  |                 let docid = DocumentId::from_be_bytes(document_id_bytes); | ||||||
|  |                 let external_id = std::str::from_utf8(external_id_bytes)?; | ||||||
|  |  | ||||||
|  |                 for (field_id, value) in reader.iter() { | ||||||
|  |                     let del_add_reader = KvReaderDelAdd::new(value); | ||||||
|  |  | ||||||
|  |                     if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||||
|  |                         writer.insert(field_id, addition)?; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 let db = index.documents.remap_data_type::<ByteSlice>(); | ||||||
|  |  | ||||||
|  |                 if !writer.is_empty() { | ||||||
|  |                     db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; | ||||||
|  |                     operations.push(DocumentOperation { | ||||||
|  |                         external_id: external_id.to_string(), | ||||||
|  |                         internal_id: docid, | ||||||
|  |                         kind: DocumentOperationKind::Create, | ||||||
|  |                     }); | ||||||
|  |                     docids.insert(docid); | ||||||
|  |                 } else { | ||||||
|  |                     db.delete(wtxn, &BEU32::new(docid))?; | ||||||
|  |                     operations.push(DocumentOperation { | ||||||
|  |                         external_id: external_id.to_string(), | ||||||
|  |                         internal_id: docid, | ||||||
|  |                         kind: DocumentOperationKind::Delete, | ||||||
|  |                     }); | ||||||
|  |                     docids.remove(docid); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|  |             let external_documents_docids = index.external_documents_ids(); | ||||||
|  |             external_documents_docids.apply(wtxn, operations)?; | ||||||
|  |             index.put_documents_ids(wtxn, &docids)?; | ||||||
|         } |         } | ||||||
|         TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { |         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { | ||||||
|             append_entries_into_database( |             append_entries_into_database( | ||||||
|                 fid_word_count_docids_iter, |                 fid_word_count_docids_iter, | ||||||
|                 &index.field_id_word_count_docids, |                 &index.field_id_word_count_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
|         TypedChunk::NewDocumentsIds(documents_ids) => { |         TypedChunk::WordDocids { | ||||||
|             return Ok((documents_ids, is_merged_database)) |             word_docids_reader, | ||||||
|         } |             exact_word_docids_reader, | ||||||
|         TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { |             word_fid_docids_reader, | ||||||
|  |         } => { | ||||||
|             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; |             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; | ||||||
|             append_entries_into_database( |             append_entries_into_database( | ||||||
|                 word_docids_iter.clone(), |                 word_docids_iter.clone(), | ||||||
|                 &index.word_docids, |                 &index.word_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|  |  | ||||||
|             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; |             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; | ||||||
| @@ -155,8 +202,18 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.exact_word_docids, |                 &index.exact_word_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|  |             )?; | ||||||
|  |  | ||||||
|  |             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; | ||||||
|  |             append_entries_into_database( | ||||||
|  |                 word_fid_docids_iter, | ||||||
|  |                 &index.word_fid_docids, | ||||||
|  |                 wtxn, | ||||||
|  |                 index_is_empty, | ||||||
|  |                 deladd_serialize_add_side, | ||||||
|  |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|  |  | ||||||
|             // create fst from word docids |             // create fst from word docids | ||||||
| @@ -177,19 +234,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.word_position_docids, |                 &index.word_position_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |  | ||||||
|             is_merged_database = true; |  | ||||||
|         } |  | ||||||
|         TypedChunk::WordFidDocids(word_fid_docids_iter) => { |  | ||||||
|             append_entries_into_database( |  | ||||||
|                 word_fid_docids_iter, |  | ||||||
|                 &index.word_fid_docids, |  | ||||||
|                 wtxn, |  | ||||||
|                 index_is_empty, |  | ||||||
|                 |value, _buffer| Ok(value), |  | ||||||
|                 merge_cbo_roaring_bitmaps, |  | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
| @@ -209,8 +255,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.facet_id_exists_docids, |                 &index.facet_id_exists_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
| @@ -220,8 +266,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.facet_id_is_null_docids, |                 &index.facet_id_is_null_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
| @@ -231,8 +277,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.facet_id_is_empty_docids, |                 &index.facet_id_is_empty_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
| @@ -242,8 +288,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 &index.word_pair_proximity_docids, |                 &index.word_pair_proximity_docids, | ||||||
|                 wtxn, |                 wtxn, | ||||||
|                 index_is_empty, |                 index_is_empty, | ||||||
|                 |value, _buffer| Ok(value), |                 deladd_serialize_add_side, | ||||||
|                 merge_cbo_roaring_bitmaps, |                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||||
|             )?; |             )?; | ||||||
|             is_merged_database = true; |             is_merged_database = true; | ||||||
|         } |         } | ||||||
| @@ -252,8 +298,18 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>(); |                 index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>(); | ||||||
|             let mut cursor = fid_docid_facet_number.into_cursor()?; |             let mut cursor = fid_docid_facet_number.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |                 let reader = KvReaderDelAdd::new(value); | ||||||
|                 if valid_lmdb_key(key) { |                 if valid_lmdb_key(key) { | ||||||
|                     index_fid_docid_facet_numbers.put(wtxn, key, value)?; |                     match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||||
|  |                         (None, None) => {} | ||||||
|  |                         (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?, | ||||||
|  |                         (Some(_), None) => { | ||||||
|  |                             index_fid_docid_facet_numbers.delete(wtxn, key)?; | ||||||
|  |                         } | ||||||
|  |                         (Some(_), Some(new)) => { | ||||||
|  |                             index_fid_docid_facet_numbers.put(wtxn, key, new)? | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -262,8 +318,18 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>(); |                 index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>(); | ||||||
|             let mut cursor = fid_docid_facet_string.into_cursor()?; |             let mut cursor = fid_docid_facet_string.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|  |                 let reader = KvReaderDelAdd::new(value); | ||||||
|                 if valid_lmdb_key(key) { |                 if valid_lmdb_key(key) { | ||||||
|                     index_fid_docid_facet_strings.put(wtxn, key, value)?; |                     match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||||
|  |                         (None, None) => {} | ||||||
|  |                         (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?, | ||||||
|  |                         (Some(_), None) => { | ||||||
|  |                             index_fid_docid_facet_strings.delete(wtxn, key)?; | ||||||
|  |                         } | ||||||
|  |                         (Some(_), Some(new)) => { | ||||||
|  |                             index_fid_docid_facet_strings.put(wtxn, key, new)? | ||||||
|  |                         } | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
| @@ -276,57 +342,86 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                 // convert the key back to a u32 (4 bytes) |                 // convert the key back to a u32 (4 bytes) | ||||||
|                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); |                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); | ||||||
|  |  | ||||||
|                 // convert the latitude and longitude back to a f64 (8 bytes) |                 let deladd_obkv = KvReaderDelAdd::new(value); | ||||||
|                 let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); |                 if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { | ||||||
|                 let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); |                     let geopoint = extract_geo_point(value, docid); | ||||||
|                 let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; |                     rtree.remove(&geopoint); | ||||||
|                 let xyz_point = lat_lng_to_xyz(&point); |                     geo_faceted_docids.remove(docid); | ||||||
|  |                 } | ||||||
|                 rtree.insert(GeoPoint::new(xyz_point, (docid, point))); |                 if let Some(value) = deladd_obkv.get(DelAdd::Addition) { | ||||||
|                 geo_faceted_docids.insert(docid); |                     let geopoint = extract_geo_point(value, docid); | ||||||
|  |                     rtree.insert(geopoint); | ||||||
|  |                     geo_faceted_docids.insert(docid); | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|             index.put_geo_rtree(wtxn, &rtree)?; |             index.put_geo_rtree(wtxn, &rtree)?; | ||||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; |             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||||
|         } |         } | ||||||
|         TypedChunk::VectorPoints(vector_points) => { |         TypedChunk::VectorPoints(vector_points) => { | ||||||
|             let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? { |             let mut vectors_set = HashSet::new(); | ||||||
|                 Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(), |             // We extract and store the previous vectors | ||||||
|                 None => Default::default(), |             if let Some(hnsw) = index.vector_hnsw(wtxn)? { | ||||||
|             }; |                 for (pid, point) in hnsw.iter() { | ||||||
|  |                     let pid_key = BEU32::new(pid.into_inner()); | ||||||
|             // Convert the PointIds into DocumentIds |                     let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get(); | ||||||
|             let mut docids = Vec::new(); |                     let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); | ||||||
|             for pid in pids { |                     vectors_set.insert((docid, vector)); | ||||||
|                 let docid = |                 } | ||||||
|                     index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap(); |  | ||||||
|                 docids.push(docid.get()); |  | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             let mut expected_dimensions = points.get(0).map(|p| p.len()); |  | ||||||
|             let mut cursor = vector_points.into_cursor()?; |             let mut cursor = vector_points.into_cursor()?; | ||||||
|             while let Some((key, value)) = cursor.move_on_next()? { |             while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|                 // convert the key back to a u32 (4 bytes) |                 // convert the key back to a u32 (4 bytes) | ||||||
|                 let (left, _index) = try_split_array_at(key).unwrap(); |                 let (left, _index) = try_split_array_at(key).unwrap(); | ||||||
|                 let docid = DocumentId::from_be_bytes(left); |                 let docid = DocumentId::from_be_bytes(left); | ||||||
|                 // convert the vector back to a Vec<f32> |  | ||||||
|                 let vector: Vec<f32> = pod_collect_to_vec(value); |  | ||||||
|  |  | ||||||
|                 // TODO Inform the user about the document that has a wrong `_vectors` |                 let vector_deladd_obkv = KvReaderDelAdd::new(value); | ||||||
|                 let found = vector.len(); |                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { | ||||||
|                 let expected = *expected_dimensions.get_or_insert(found); |                     // convert the vector back to a Vec<f32> | ||||||
|                 if expected != found { |                     let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); | ||||||
|                     return Err(UserError::InvalidVectorDimensions { expected, found })?; |                     let key = (docid, vector); | ||||||
|  |                     if !vectors_set.remove(&key) { | ||||||
|  |                         error!("Unable to delete the vector: {:?}", key.1); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { | ||||||
|  |                     // convert the vector back to a Vec<f32> | ||||||
|  |                     let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); | ||||||
|  |                     vectors_set.insert((docid, vector)); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 points.push(NDotProductPoint::new(vector)); |  | ||||||
|                 docids.push(docid); |  | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             assert_eq!(docids.len(), points.len()); |             // Extract the most common vector dimension | ||||||
|  |             let expected_dimension_size = { | ||||||
|  |                 let mut dims = HashMap::new(); | ||||||
|  |                 vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1); | ||||||
|  |                 dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             // Ensure that the vector lengths are correct and | ||||||
|  |             // prepare the vectors before inserting them in the HNSW. | ||||||
|  |             let mut points = Vec::new(); | ||||||
|  |             let mut docids = Vec::new(); | ||||||
|  |             for (docid, vector) in vectors_set { | ||||||
|  |                 if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { | ||||||
|  |                     return Err(UserError::InvalidVectorDimensions { | ||||||
|  |                         expected: expected_dimension_size.unwrap_or(vector.len()), | ||||||
|  |                         found: vector.len(), | ||||||
|  |                     } | ||||||
|  |                     .into()); | ||||||
|  |                 } else { | ||||||
|  |                     let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); | ||||||
|  |                     points.push(NDotProductPoint::new(vector)); | ||||||
|  |                     docids.push(docid); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|             let hnsw_length = points.len(); |             let hnsw_length = points.len(); | ||||||
|             let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); |             let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); | ||||||
|  |  | ||||||
|  |             assert_eq!(docids.len(), pids.len()); | ||||||
|  |  | ||||||
|  |             // Store the vectors in the point-docid relation database | ||||||
|             index.vector_id_docid.clear(wtxn)?; |             index.vector_id_docid.clear(wtxn)?; | ||||||
|             for (docid, pid) in docids.into_iter().zip(pids) { |             for (docid, pid) in docids.into_iter().zip(pids) { | ||||||
|                 index.vector_id_docid.put( |                 index.vector_id_docid.put( | ||||||
| @@ -339,22 +434,25 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|             log::debug!("There are {} entries in the HNSW so far", hnsw_length); |             log::debug!("There are {} entries in the HNSW so far", hnsw_length); | ||||||
|             index.put_vector_hnsw(wtxn, &new_hnsw)?; |             index.put_vector_hnsw(wtxn, &new_hnsw)?; | ||||||
|         } |         } | ||||||
|         TypedChunk::ScriptLanguageDocids(hash_pair) => { |         TypedChunk::ScriptLanguageDocids(sl_map) => { | ||||||
|             let mut buffer = Vec::new(); |             for (key, (deletion, addition)) in sl_map { | ||||||
|             for (key, value) in hash_pair { |                 let mut db_key_exists = false; | ||||||
|                 buffer.clear(); |  | ||||||
|                 let final_value = match index.script_language_docids.get(wtxn, &key)? { |                 let final_value = match index.script_language_docids.get(wtxn, &key)? { | ||||||
|                     Some(db_values) => { |                     Some(db_values) => { | ||||||
|                         let mut db_value_buffer = Vec::new(); |                         db_key_exists = true; | ||||||
|                         serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; |                         (db_values - deletion) | addition | ||||||
|                         let mut new_value_buffer = Vec::new(); |  | ||||||
|                         serialize_roaring_bitmap(&value, &mut new_value_buffer)?; |  | ||||||
|                         merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; |  | ||||||
|                         RoaringBitmap::deserialize_from(&buffer[..])? |  | ||||||
|                     } |                     } | ||||||
|                     None => value, |                     None => addition, | ||||||
|                 }; |                 }; | ||||||
|                 index.script_language_docids.put(wtxn, &key, &final_value)?; |  | ||||||
|  |                 if final_value.is_empty() { | ||||||
|  |                     // If the database entry exists, delete it. | ||||||
|  |                     if db_key_exists { | ||||||
|  |                         index.script_language_docids.delete(wtxn, &key)?; | ||||||
|  |                     } | ||||||
|  |                 } else { | ||||||
|  |                     index.script_language_docids.put(wtxn, &key, &final_value)?; | ||||||
|  |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -362,6 +460,15 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|     Ok((RoaringBitmap::new(), is_merged_database)) |     Ok((RoaringBitmap::new(), is_merged_database)) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Converts the latitude and longitude back to an xyz GeoPoint. | ||||||
|  | fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { | ||||||
|  |     let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); | ||||||
|  |     let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); | ||||||
|  |     let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; | ||||||
|  |     let xyz_point = lat_lng_to_xyz(&point); | ||||||
|  |     GeoPoint::new(xyz_point, (docid, point)) | ||||||
|  | } | ||||||
|  |  | ||||||
| fn merge_word_docids_reader_into_fst( | fn merge_word_docids_reader_into_fst( | ||||||
|     word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, |     word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||||
|     exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, |     exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||||
| @@ -379,24 +486,6 @@ fn merge_word_docids_reader_into_fst( | |||||||
|     Ok(builder.into_set()) |     Ok(builder.into_set()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { |  | ||||||
|     let new_value = RoaringBitmap::deserialize_from(new_value)?; |  | ||||||
|     let db_value = RoaringBitmap::deserialize_from(db_value)?; |  | ||||||
|     let value = new_value | db_value; |  | ||||||
|     Ok(serialize_roaring_bitmap(&value, buffer)?) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn merge_cbo_roaring_bitmaps( |  | ||||||
|     new_value: &[u8], |  | ||||||
|     db_value: &[u8], |  | ||||||
|     buffer: &mut Vec<u8>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     Ok(CboRoaringBitmapCodec::merge_into( |  | ||||||
|         &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], |  | ||||||
|         buffer, |  | ||||||
|     )?) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Write provided entries in database using serialize_value function. | /// Write provided entries in database using serialize_value function. | ||||||
| /// merge_values function is used if an entry already exist in the database. | /// merge_values function is used if an entry already exist in the database. | ||||||
| fn write_entries_into_database<R, K, V, FS, FM>( | fn write_entries_into_database<R, K, V, FS, FM>( | ||||||
| @@ -410,7 +499,7 @@ fn write_entries_into_database<R, K, V, FS, FM>( | |||||||
| where | where | ||||||
|     R: io::Read + io::Seek, |     R: io::Read + io::Seek, | ||||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, |     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, |     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||||
| { | { | ||||||
|     puffin::profile_function!(format!("number of entries: {}", data.len())); |     puffin::profile_function!(format!("number of entries: {}", data.len())); | ||||||
|  |  | ||||||
| @@ -422,17 +511,19 @@ where | |||||||
|         if valid_lmdb_key(key) { |         if valid_lmdb_key(key) { | ||||||
|             buffer.clear(); |             buffer.clear(); | ||||||
|             let value = if index_is_empty { |             let value = if index_is_empty { | ||||||
|                 serialize_value(value, &mut buffer)? |                 Some(serialize_value(value, &mut buffer)?) | ||||||
|             } else { |             } else { | ||||||
|                 match database.get(wtxn, key)? { |                 match database.get(wtxn, key)? { | ||||||
|                     Some(prev_value) => { |                     Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, | ||||||
|                         merge_values(value, prev_value, &mut buffer)?; |                     None => Some(serialize_value(value, &mut buffer)?), | ||||||
|                         &buffer[..] |  | ||||||
|                     } |  | ||||||
|                     None => serialize_value(value, &mut buffer)?, |  | ||||||
|                 } |                 } | ||||||
|             }; |             }; | ||||||
|             database.put(wtxn, key, value)?; |             match value { | ||||||
|  |                 Some(value) => database.put(wtxn, key, value)?, | ||||||
|  |                 None => { | ||||||
|  |                     database.delete(wtxn, key)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -454,7 +545,8 @@ fn append_entries_into_database<R, K, V, FS, FM>( | |||||||
| where | where | ||||||
|     R: io::Read + io::Seek, |     R: io::Read + io::Seek, | ||||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, |     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, |     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||||
|  |     K: for<'a> heed::BytesDecode<'a>, | ||||||
| { | { | ||||||
|     puffin::profile_function!(format!("number of entries: {}", data.len())); |     puffin::profile_function!(format!("number of entries: {}", data.len())); | ||||||
|  |  | ||||||
| @@ -475,6 +567,12 @@ where | |||||||
|     let mut cursor = data.into_cursor()?; |     let mut cursor = data.into_cursor()?; | ||||||
|     while let Some((key, value)) = cursor.move_on_next()? { |     while let Some((key, value)) = cursor.move_on_next()? { | ||||||
|         if valid_lmdb_key(key) { |         if valid_lmdb_key(key) { | ||||||
|  |             debug_assert!( | ||||||
|  |                 K::bytes_decode(key).is_some(), | ||||||
|  |                 "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", | ||||||
|  |                 key.len(), | ||||||
|  |                 &key | ||||||
|  |             ); | ||||||
|             buffer.clear(); |             buffer.clear(); | ||||||
|             let value = serialize_value(value, &mut buffer)?; |             let value = serialize_value(value, &mut buffer)?; | ||||||
|             unsafe { database.append(key, value)? }; |             unsafe { database.append(key, value)? }; | ||||||
|   | |||||||
| @@ -1,6 +1,5 @@ | |||||||
| pub use self::available_documents_ids::AvailableDocumentsIds; | pub use self::available_documents_ids::AvailableDocumentsIds; | ||||||
| pub use self::clear_documents::ClearDocuments; | pub use self::clear_documents::ClearDocuments; | ||||||
| pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; |  | ||||||
| pub use self::facet::bulk::FacetsUpdateBulk; | pub use self::facet::bulk::FacetsUpdateBulk; | ||||||
| pub use self::facet::incremental::FacetsUpdateIncrementalInner; | pub use self::facet::incremental::FacetsUpdateIncrementalInner; | ||||||
| pub use self::index_documents::{ | pub use self::index_documents::{ | ||||||
| @@ -9,10 +8,6 @@ pub use self::index_documents::{ | |||||||
|     MergeFn, |     MergeFn, | ||||||
| }; | }; | ||||||
| pub use self::indexer_config::IndexerConfig; | pub use self::indexer_config::IndexerConfig; | ||||||
| pub use self::prefix_word_pairs::{ |  | ||||||
|     PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|     MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
| }; |  | ||||||
| pub use self::settings::{Setting, Settings}; | pub use self::settings::{Setting, Settings}; | ||||||
| pub use self::update_step::UpdateIndexingStep; | pub use self::update_step::UpdateIndexingStep; | ||||||
| pub use self::word_prefix_docids::WordPrefixDocids; | pub use self::word_prefix_docids::WordPrefixDocids; | ||||||
| @@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; | |||||||
|  |  | ||||||
| mod available_documents_ids; | mod available_documents_ids; | ||||||
| mod clear_documents; | mod clear_documents; | ||||||
| mod delete_documents; | pub(crate) mod del_add; | ||||||
| pub(crate) mod facet; | pub(crate) mod facet; | ||||||
| mod index_documents; | mod index_documents; | ||||||
| mod indexer_config; | mod indexer_config; | ||||||
| mod prefix_word_pairs; |  | ||||||
| mod settings; | mod settings; | ||||||
| mod update_step; | mod update_step; | ||||||
| mod word_prefix_docids; | mod word_prefix_docids; | ||||||
|   | |||||||
| @@ -1,579 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::HashSet; |  | ||||||
| use std::io::{BufReader, BufWriter}; |  | ||||||
|  |  | ||||||
| use grenad::CompressionType; |  | ||||||
| use heed::types::ByteSlice; |  | ||||||
|  |  | ||||||
| use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; |  | ||||||
| use crate::{Index, Result}; |  | ||||||
|  |  | ||||||
| mod prefix_word; |  | ||||||
| mod word_prefix; |  | ||||||
|  |  | ||||||
| pub use prefix_word::index_prefix_word_database; |  | ||||||
| pub use word_prefix::index_word_prefix_database; |  | ||||||
|  |  | ||||||
| pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; |  | ||||||
| pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; |  | ||||||
|  |  | ||||||
| pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { |  | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |  | ||||||
|     index: &'i Index, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     max_prefix_length: usize, |  | ||||||
|     chunk_compression_type: CompressionType, |  | ||||||
|     chunk_compression_level: Option<u32>, |  | ||||||
| } |  | ||||||
| impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { |  | ||||||
|     pub fn new( |  | ||||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, |  | ||||||
|         index: &'i Index, |  | ||||||
|         chunk_compression_type: CompressionType, |  | ||||||
|         chunk_compression_level: Option<u32>, |  | ||||||
|     ) -> Self { |  | ||||||
|         Self { |  | ||||||
|             wtxn, |  | ||||||
|             index, |  | ||||||
|             max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|             max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, |  | ||||||
|             chunk_compression_type, |  | ||||||
|             chunk_compression_level, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[logging_timer::time("WordPrefixPairProximityDocids::{}")] |  | ||||||
|     pub fn execute<'a>( |  | ||||||
|         self, |  | ||||||
|         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, |  | ||||||
|         new_prefix_fst_words: &'a [String], |  | ||||||
|         common_prefix_fst_words: &[&'a [String]], |  | ||||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         puffin::profile_function!(); |  | ||||||
|  |  | ||||||
|         index_word_prefix_database( |  | ||||||
|             self.wtxn, |  | ||||||
|             self.index.word_pair_proximity_docids, |  | ||||||
|             self.index.word_prefix_pair_proximity_docids, |  | ||||||
|             self.max_proximity, |  | ||||||
|             self.max_prefix_length, |  | ||||||
|             new_word_pair_proximity_docids.clone(), |  | ||||||
|             new_prefix_fst_words, |  | ||||||
|             common_prefix_fst_words, |  | ||||||
|             del_prefix_fst_words, |  | ||||||
|             self.chunk_compression_type, |  | ||||||
|             self.chunk_compression_level, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         index_prefix_word_database( |  | ||||||
|             self.wtxn, |  | ||||||
|             self.index.word_pair_proximity_docids, |  | ||||||
|             self.index.prefix_word_pair_proximity_docids, |  | ||||||
|             self.max_proximity, |  | ||||||
|             self.max_prefix_length, |  | ||||||
|             new_word_pair_proximity_docids, |  | ||||||
|             new_prefix_fst_words, |  | ||||||
|             common_prefix_fst_words, |  | ||||||
|             del_prefix_fst_words, |  | ||||||
|             self.chunk_compression_type, |  | ||||||
|             self.chunk_compression_level, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // This is adapted from `sorter_into_lmdb_database` |  | ||||||
| pub fn insert_into_database( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     database: heed::PolyDatabase, |  | ||||||
|     new_key: &[u8], |  | ||||||
|     new_value: &[u8], |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; |  | ||||||
|     match iter.next().transpose()? { |  | ||||||
|         Some((key, old_val)) if new_key == key => { |  | ||||||
|             let val = |  | ||||||
|                 merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) |  | ||||||
|                     .map_err(|_| { |  | ||||||
|                         // TODO just wrap this error? |  | ||||||
|                         crate::error::InternalError::IndexingMergingKeys { |  | ||||||
|                             process: "get-put-merge", |  | ||||||
|                         } |  | ||||||
|                     })?; |  | ||||||
|             // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour |  | ||||||
|             unsafe { iter.put_current(new_key, &val)? }; |  | ||||||
|         } |  | ||||||
|         _ => { |  | ||||||
|             drop(iter); |  | ||||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, |  | ||||||
| // but it uses `append` if the database is empty, and it assumes that the values in the |  | ||||||
| // writer don't conflict with values in the database. |  | ||||||
| pub fn write_into_lmdb_database_without_merging( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     database: heed::PolyDatabase, |  | ||||||
|     writer: grenad::Writer<BufWriter<std::fs::File>>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; |  | ||||||
|     let reader = grenad::Reader::new(BufReader::new(file))?; |  | ||||||
|     if database.is_empty(wtxn)? { |  | ||||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; |  | ||||||
|         let mut cursor = reader.into_cursor()?; |  | ||||||
|         while let Some((k, v)) = cursor.move_on_next()? { |  | ||||||
|             // safety: the key comes from the grenad reader, not the database |  | ||||||
|             unsafe { out_iter.append(k, v)? }; |  | ||||||
|         } |  | ||||||
|     } else { |  | ||||||
|         let mut cursor = reader.into_cursor()?; |  | ||||||
|         while let Some((k, v)) = cursor.move_on_next()? { |  | ||||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
#[cfg(test)]
mod tests {
    use std::io::Cursor;
    use std::iter::FromIterator;

    use roaring::RoaringBitmap;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod};

    /// Converts a JSON value that is known to be an object into a `crate::Object`.
    ///
    /// Panics if `value` is not a JSON object.
    fn json_object(value: serde_json::Value) -> crate::Object {
        value.as_object().unwrap().clone()
    }

    /// Serializes `documents` into an in-memory documents batch and returns a reader over it.
    fn batch_reader_from_documents(
        documents: Vec<crate::Object>,
    ) -> DocumentsBatchReader<Cursor<Vec<u8>>> {
        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        for object in documents {
            builder.append_json_object(&object).unwrap();
        }
        DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
    }

    /// Deletes the documents with the given internal `docids` using the given deletion
    /// `strategy`, in its own committed write transaction.
    fn delete_documents(index: &TempIndex, strategy: DeletionStrategy, docids: RoaringBitmap) {
        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, index).unwrap();
        delete.strategy(strategy);
        delete.delete_documents(&docids);
        delete.execute().unwrap();
        wtxn.commit().unwrap();
    }

    /// Generates 50 documents per prefix, each with a single-word `text` made of the prefix
    /// followed by a distinct hexadecimal suffix. Ids are consecutive, starting at `start_id`.
    fn documents_with_enough_different_words_for_prefixes(
        prefixes: &[&str],
        start_id: usize,
    ) -> Vec<crate::Object> {
        let mut documents = Vec::new();
        let mut id = start_id;
        for prefix in prefixes {
            for i in 0..50 {
                documents.push(json_object(serde_json::json!({
                    "id": id,
                    "text": format!("{prefix}{i:x}"),
                })));
                id += 1;
            }
        }
        documents
    }

    #[test]
    fn add_new_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": "9000",
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": "9001",
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
        documents.push(json_object(serde_json::json!({
            "id": "9002",
            "text": "At an extraordinary house"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids, "update");
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }

    #[test]
    fn batch_bug_3043() {
        // https://github.com/meilisearch/meilisearch/issues/3043
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "text": "x y"
        })));
        documents.push(json_object(serde_json::json!({
            "text": "x a y"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }

    #[test]
    fn hard_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        delete_documents(&index, DeletionStrategy::AlwaysHard, RoaringBitmap::from_iter([50]));

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        delete_documents(&index, DeletionStrategy::AlwaysHard, RoaringBitmap::from_iter(0..50));

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // now we add a fresh set of documents that introduces a new prefix after the deletions
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    #[test]
    fn soft_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        delete_documents(&index, DeletionStrategy::AlwaysSoft, RoaringBitmap::from_iter([50]));

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        delete_documents(&index, DeletionStrategy::AlwaysSoft, RoaringBitmap::from_iter(0..50));

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // now we add a fresh set of documents that introduces a new prefix after the deletions
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    #[test]
    fn replace_soft_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // same ids (0..50) as the first batch, so these documents replace the existing ones
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]");
    }

    #[test]
    fn replace_hard_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // same ids (0..50) as the first batch, so these documents replace the existing ones
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]");
    }
}
| @@ -1,182 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::{BTreeMap, HashSet}; |  | ||||||
|  |  | ||||||
| use grenad::CompressionType; |  | ||||||
| use heed::types::ByteSlice; |  | ||||||
| use heed::BytesDecode; |  | ||||||
| use log::debug; |  | ||||||
|  |  | ||||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; |  | ||||||
| use crate::update::prefix_word_pairs::{ |  | ||||||
|     insert_into_database, write_into_lmdb_database_without_merging, |  | ||||||
| }; |  | ||||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; |  | ||||||
|  |  | ||||||
| #[allow(clippy::too_many_arguments)] |  | ||||||
| #[logging_timer::time] |  | ||||||
| pub fn index_prefix_word_database( |  | ||||||
|     wtxn: &mut heed::RwTxn, |  | ||||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, |  | ||||||
|     max_proximity: u8, |  | ||||||
|     max_prefix_length: usize, |  | ||||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, |  | ||||||
|     new_prefix_fst_words: &[String], |  | ||||||
|     common_prefix_fst_words: &[&[String]], |  | ||||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, |  | ||||||
|     chunk_compression_type: CompressionType, |  | ||||||
|     chunk_compression_level: Option<u32>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     puffin::profile_function!(); |  | ||||||
|  |  | ||||||
|     let max_proximity = max_proximity - 1; |  | ||||||
|     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); |  | ||||||
|  |  | ||||||
|     let common_prefixes: Vec<_> = common_prefix_fst_words |  | ||||||
|         .iter() |  | ||||||
|         .flat_map(|s| s.iter()) |  | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .filter(|s| s.len() <= max_prefix_length) |  | ||||||
|         .collect(); |  | ||||||
|  |  | ||||||
|     for proximity in 1..max_proximity { |  | ||||||
|         for prefix in common_prefixes.iter() { |  | ||||||
|             let mut prefix_key = vec![proximity]; |  | ||||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); |  | ||||||
|             let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; |  | ||||||
|             // This is the core of the algorithm |  | ||||||
|             execute_on_word_pairs_and_prefixes( |  | ||||||
|                 proximity, |  | ||||||
|                 prefix.as_bytes(), |  | ||||||
|                 // the next two arguments tell how to iterate over the new word pairs |  | ||||||
|                 &mut cursor, |  | ||||||
|                 |cursor| { |  | ||||||
|                     if let Some((key, value)) = cursor.next()? { |  | ||||||
|                         let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) |  | ||||||
|                             .ok_or(heed::Error::Decoding)?; |  | ||||||
|                         Ok(Some((word2, value))) |  | ||||||
|                     } else { |  | ||||||
|                         Ok(None) |  | ||||||
|                     } |  | ||||||
|                 }, |  | ||||||
|                 // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) |  | ||||||
|                 |key, value| { |  | ||||||
|                     insert_into_database( |  | ||||||
|                         wtxn, |  | ||||||
|                         *prefix_word_pair_proximity_docids.as_polymorph(), |  | ||||||
|                         key, |  | ||||||
|                         value, |  | ||||||
|                     ) |  | ||||||
|                 }, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB |  | ||||||
|     let new_prefixes: Vec<_> = new_prefix_fst_words |  | ||||||
|         .iter() |  | ||||||
|         .map(|s| s.as_str()) |  | ||||||
|         .filter(|s| s.len() <= max_prefix_length) |  | ||||||
|         .collect(); |  | ||||||
|  |  | ||||||
|     // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) |  | ||||||
|     // element in an intermediary grenad |  | ||||||
|     let mut writer = |  | ||||||
|         create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); |  | ||||||
|  |  | ||||||
|     for proximity in 1..max_proximity { |  | ||||||
|         for prefix in new_prefixes.iter() { |  | ||||||
|             let mut prefix_key = vec![proximity]; |  | ||||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); |  | ||||||
|             let mut db_iter = word_pair_proximity_docids |  | ||||||
|                 .as_polymorph() |  | ||||||
|                 .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? |  | ||||||
|                 .remap_key_type::<UncheckedU8StrStrCodec>(); |  | ||||||
|             execute_on_word_pairs_and_prefixes( |  | ||||||
|                 proximity, |  | ||||||
|                 prefix.as_bytes(), |  | ||||||
|                 &mut db_iter, |  | ||||||
|                 |db_iter| { |  | ||||||
|                     db_iter |  | ||||||
|                         .next() |  | ||||||
|                         .transpose() |  | ||||||
|                         .map(|x| x.map(|((_, _, word2), value)| (word2, value))) |  | ||||||
|                         .map_err(|e| e.into()) |  | ||||||
|                 }, |  | ||||||
|                 |key, value| writer.insert(key, value).map_err(|e| e.into()), |  | ||||||
|             )?; |  | ||||||
|             drop(db_iter); |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // and then we write the grenad into the DB |  | ||||||
|     // Since the grenad contains only new prefixes, we know in advance that none |  | ||||||
|     // of its elements already exist in the DB, thus there is no need to specify |  | ||||||
|     // how to merge conflicting elements |  | ||||||
|     write_into_lmdb_database_without_merging( |  | ||||||
|         wtxn, |  | ||||||
|         *prefix_word_pair_proximity_docids.as_polymorph(), |  | ||||||
|         writer, |  | ||||||
|     )?; |  | ||||||
|  |  | ||||||
|     // All of the word prefix pairs in the database that have a w2 |  | ||||||
|     // that is contained in the `suppr_pw` set must be removed as well. |  | ||||||
|     if !del_prefix_fst_words.is_empty() { |  | ||||||
|         let mut iter = |  | ||||||
|             prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; |  | ||||||
|         while let Some(((_, prefix, _), _)) = iter.next().transpose()? { |  | ||||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { |  | ||||||
|                 // Delete this entry as the w2 prefix is no more in the words prefix fst. |  | ||||||
|                 unsafe { iter.del_current()? }; |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. |  | ||||||
| /// |  | ||||||
| /// Its arguments are: |  | ||||||
| /// - an iterator over the words following the given `prefix` with the given `proximity` |  | ||||||
| /// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements |  | ||||||
| fn execute_on_word_pairs_and_prefixes<I>( |  | ||||||
|     proximity: u8, |  | ||||||
|     prefix: &[u8], |  | ||||||
|     iter: &mut I, |  | ||||||
|     mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>, |  | ||||||
|     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, |  | ||||||
| ) -> Result<()> { |  | ||||||
|     let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default(); |  | ||||||
|  |  | ||||||
|     // Memory usage check: |  | ||||||
|     // The content of the loop will be called for each `word2` that follows a word beginning |  | ||||||
|     // with `prefix` with the given proximity. |  | ||||||
|     // In practice, I don't think the batch can ever get too big. |  | ||||||
|     while let Some((word2, docids)) = next_word2_and_docids(iter)? { |  | ||||||
|         let entry = batch.entry(word2.to_owned()).or_default(); |  | ||||||
|         entry.push(Cow::Owned(docids.to_owned())); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     let mut key_buffer = Vec::with_capacity(512); |  | ||||||
|     key_buffer.push(proximity); |  | ||||||
|     key_buffer.extend_from_slice(prefix); |  | ||||||
|     key_buffer.push(0); |  | ||||||
|  |  | ||||||
|     let mut value_buffer = Vec::with_capacity(65_536); |  | ||||||
|  |  | ||||||
|     for (word2, docids) in batch { |  | ||||||
|         key_buffer.truncate(prefix.len() + 2); |  | ||||||
|         value_buffer.clear(); |  | ||||||
|  |  | ||||||
|         key_buffer.extend_from_slice(&word2); |  | ||||||
|         let data = if docids.len() > 1 { |  | ||||||
|             CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; |  | ||||||
|             value_buffer.as_slice() |  | ||||||
|         } else { |  | ||||||
|             &docids[0] |  | ||||||
|         }; |  | ||||||
|         insert(key_buffer.as_slice(), data)?; |  | ||||||
|     } |  | ||||||
|     Ok(()) |  | ||||||
| } |  | ||||||
| @@ -1,20 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a    5                [101, ] |  | ||||||
| 1  a    amazing          [100, ] |  | ||||||
| 1  a    an               [100, ] |  | ||||||
| 1  a    and              [100, ] |  | ||||||
| 1  a    beautiful        [100, ] |  | ||||||
| 1  b    house            [100, ] |  | ||||||
| 1  b    rings            [101, ] |  | ||||||
| 1  be   house            [100, ] |  | ||||||
| 1  be   rings            [101, ] |  | ||||||
| 2  a    am               [101, ] |  | ||||||
| 2  a    amazing          [100, ] |  | ||||||
| 2  a    and              [100, ] |  | ||||||
| 2  a    beautiful        [100, ] |  | ||||||
| 2  a    house            [100, ] |  | ||||||
| 2  b    at               [101, ] |  | ||||||
| 2  be   at               [101, ] |  | ||||||
|  |  | ||||||
| @@ -1,23 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  5                a    [101, ] |  | ||||||
| 1  amazing          a    [100, ] |  | ||||||
| 1  an               a    [100, ] |  | ||||||
| 1  and              b    [100, ] |  | ||||||
| 1  and              be   [100, ] |  | ||||||
| 1  at               a    [100, ] |  | ||||||
| 1  rings            a    [101, ] |  | ||||||
| 1  the              b    [101, ] |  | ||||||
| 1  the              be   [101, ] |  | ||||||
| 2  amazing          b    [100, ] |  | ||||||
| 2  amazing          be   [100, ] |  | ||||||
| 2  an               a    [100, ] |  | ||||||
| 2  at               a    [100, 101, ] |  | ||||||
| 2  bell             a    [101, ] |  | ||||||
| 3  an               b    [100, ] |  | ||||||
| 3  an               be   [100, ] |  | ||||||
| 3  at               a    [100, ] |  | ||||||
| 3  rings            a    [101, ] |  | ||||||
| 3  the              a    [101, ] |  | ||||||
|  |  | ||||||
| @@ -1,29 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a    5                [101, ] |  | ||||||
| 1  a    amazing          [100, ] |  | ||||||
| 1  a    an               [100, 202, ] |  | ||||||
| 1  a    and              [100, ] |  | ||||||
| 1  a    beautiful        [100, ] |  | ||||||
| 1  a    extraordinary    [202, ] |  | ||||||
| 1  am   and              [100, ] |  | ||||||
| 1  an   amazing          [100, ] |  | ||||||
| 1  an   beautiful        [100, ] |  | ||||||
| 1  an   extraordinary    [202, ] |  | ||||||
| 1  b    house            [100, ] |  | ||||||
| 1  b    rings            [101, ] |  | ||||||
| 1  be   house            [100, ] |  | ||||||
| 1  be   rings            [101, ] |  | ||||||
| 2  a    am               [101, ] |  | ||||||
| 2  a    amazing          [100, ] |  | ||||||
| 2  a    and              [100, ] |  | ||||||
| 2  a    beautiful        [100, ] |  | ||||||
| 2  a    extraordinary    [202, ] |  | ||||||
| 2  a    house            [100, 202, ] |  | ||||||
| 2  am   beautiful        [100, ] |  | ||||||
| 2  an   and              [100, ] |  | ||||||
| 2  an   house            [100, 202, ] |  | ||||||
| 2  b    at               [101, ] |  | ||||||
| 2  be   at               [101, ] |  | ||||||
|  |  | ||||||
| @@ -1,33 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  5                am               [101, ] |  | ||||||
| 1  amazing          and              [100, ] |  | ||||||
| 1  an               amazing          [100, ] |  | ||||||
| 1  an               extraordinary    [202, ] |  | ||||||
| 1  and              beautiful        [100, ] |  | ||||||
| 1  at               5                [101, ] |  | ||||||
| 1  at               an               [100, 202, ] |  | ||||||
| 1  beautiful        house            [100, ] |  | ||||||
| 1  bell             rings            [101, ] |  | ||||||
| 1  extraordinary    house            [202, ] |  | ||||||
| 1  rings            at               [101, ] |  | ||||||
| 1  the              bell             [101, ] |  | ||||||
| 2  amazing          beautiful        [100, ] |  | ||||||
| 2  an               and              [100, ] |  | ||||||
| 2  an               house            [202, ] |  | ||||||
| 2  and              house            [100, ] |  | ||||||
| 2  at               am               [101, ] |  | ||||||
| 2  at               amazing          [100, ] |  | ||||||
| 2  at               extraordinary    [202, ] |  | ||||||
| 2  bell             at               [101, ] |  | ||||||
| 2  rings            5                [101, ] |  | ||||||
| 2  the              rings            [101, ] |  | ||||||
| 3  amazing          house            [100, ] |  | ||||||
| 3  an               beautiful        [100, ] |  | ||||||
| 3  at               and              [100, ] |  | ||||||
| 3  at               house            [202, ] |  | ||||||
| 3  bell             5                [101, ] |  | ||||||
| 3  rings            am               [101, ] |  | ||||||
| 3  the              at               [101, ] |  | ||||||
|  |  | ||||||
| @@ -1,31 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  5                a    [101, ] |  | ||||||
| 1  5                am   [101, ] |  | ||||||
| 1  amazing          a    [100, ] |  | ||||||
| 1  amazing          an   [100, ] |  | ||||||
| 1  an               a    [100, ] |  | ||||||
| 1  an               am   [100, ] |  | ||||||
| 1  and              b    [100, ] |  | ||||||
| 1  and              be   [100, ] |  | ||||||
| 1  at               a    [100, 202, ] |  | ||||||
| 1  at               an   [100, 202, ] |  | ||||||
| 1  rings            a    [101, ] |  | ||||||
| 1  the              b    [101, ] |  | ||||||
| 1  the              be   [101, ] |  | ||||||
| 2  amazing          b    [100, ] |  | ||||||
| 2  amazing          be   [100, ] |  | ||||||
| 2  an               a    [100, ] |  | ||||||
| 2  an               an   [100, ] |  | ||||||
| 2  at               a    [100, 101, ] |  | ||||||
| 2  at               am   [100, 101, ] |  | ||||||
| 2  bell             a    [101, ] |  | ||||||
| 3  an               b    [100, ] |  | ||||||
| 3  an               be   [100, ] |  | ||||||
| 3  at               a    [100, ] |  | ||||||
| 3  at               an   [100, ] |  | ||||||
| 3  rings            a    [101, ] |  | ||||||
| 3  rings            am   [101, ] |  | ||||||
| 3  the              a    [101, ] |  | ||||||
|  |  | ||||||
| @@ -1,4 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
|  |  | ||||||
| @@ -1,8 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a                y                [51, ] |  | ||||||
| 1  x                a                [51, ] |  | ||||||
| 1  x                y                [50, ] |  | ||||||
| 2  x                y                [51, ] |  | ||||||
|  |  | ||||||
| @@ -1,7 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a                y    [51, ] |  | ||||||
| 1  x                y    [50, ] |  | ||||||
| 2  x                y    [51, ] |  | ||||||
|  |  | ||||||
| @@ -1,4 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] |  | ||||||
| @@ -1,6 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a    5                [51, ] |  | ||||||
| 2  a    am               [51, ] |  | ||||||
|  |  | ||||||
| @@ -1,60 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 5                [51, ] |  | ||||||
| a0               [0, ] |  | ||||||
| a1               [1, ] |  | ||||||
| a10              [16, ] |  | ||||||
| a11              [17, ] |  | ||||||
| a12              [18, ] |  | ||||||
| a13              [19, ] |  | ||||||
| a14              [20, ] |  | ||||||
| a15              [21, ] |  | ||||||
| a16              [22, ] |  | ||||||
| a17              [23, ] |  | ||||||
| a18              [24, ] |  | ||||||
| a19              [25, ] |  | ||||||
| a1a              [26, ] |  | ||||||
| a1b              [27, ] |  | ||||||
| a1c              [28, ] |  | ||||||
| a1d              [29, ] |  | ||||||
| a1e              [30, ] |  | ||||||
| a1f              [31, ] |  | ||||||
| a2               [2, ] |  | ||||||
| a20              [32, ] |  | ||||||
| a21              [33, ] |  | ||||||
| a22              [34, ] |  | ||||||
| a23              [35, ] |  | ||||||
| a24              [36, ] |  | ||||||
| a25              [37, ] |  | ||||||
| a26              [38, ] |  | ||||||
| a27              [39, ] |  | ||||||
| a28              [40, ] |  | ||||||
| a29              [41, ] |  | ||||||
| a2a              [42, ] |  | ||||||
| a2b              [43, ] |  | ||||||
| a2c              [44, ] |  | ||||||
| a2d              [45, ] |  | ||||||
| a2e              [46, ] |  | ||||||
| a2f              [47, ] |  | ||||||
| a3               [3, ] |  | ||||||
| a30              [48, ] |  | ||||||
| a31              [49, ] |  | ||||||
| a4               [4, ] |  | ||||||
| a5               [5, ] |  | ||||||
| a6               [6, ] |  | ||||||
| a7               [7, ] |  | ||||||
| a8               [8, ] |  | ||||||
| a9               [9, ] |  | ||||||
| aa               [10, ] |  | ||||||
| ab               [11, ] |  | ||||||
| ac               [12, ] |  | ||||||
| ad               [13, ] |  | ||||||
| ae               [14, ] |  | ||||||
| af               [15, ] |  | ||||||
| am               [51, ] |  | ||||||
| at               [51, ] |  | ||||||
| bell             [51, ] |  | ||||||
| rings            [51, ] |  | ||||||
| the              [51, ] |  | ||||||
|  |  | ||||||
| @@ -1,10 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  5                a    [51, ] |  | ||||||
| 1  rings            a    [51, ] |  | ||||||
| 2  at               a    [51, ] |  | ||||||
| 2  bell             a    [51, ] |  | ||||||
| 3  rings            a    [51, ] |  | ||||||
| 3  the              a    [51, ] |  | ||||||
|  |  | ||||||
| @@ -1,4 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] |  | ||||||
| @@ -1,14 +0,0 @@ | |||||||
| --- |  | ||||||
| source: milli/src/update/prefix_word_pairs/mod.rs |  | ||||||
| --- |  | ||||||
| 1  a    5                [51, ] |  | ||||||
| 1  a    amazing          [50, ] |  | ||||||
| 1  a    an               [50, ] |  | ||||||
| 1  a    and              [50, ] |  | ||||||
| 1  a    beautiful        [50, ] |  | ||||||
| 2  a    am               [51, ] |  | ||||||
| 2  a    amazing          [50, ] |  | ||||||
| 2  a    and              [50, ] |  | ||||||
| 2  a    beautiful        [50, ] |  | ||||||
| 2  a    house            [50, ] |  | ||||||
|  |  | ||||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user