diff --git a/.github/ISSUE_TEMPLATE/sprint_issue.md b/.github/ISSUE_TEMPLATE/sprint_issue.md index 261e11798..30b5e16ff 100644 --- a/.github/ISSUE_TEMPLATE/sprint_issue.md +++ b/.github/ISSUE_TEMPLATE/sprint_issue.md @@ -22,6 +22,20 @@ Related product discussion: +### Are you modifying a database? +- [ ] If not, add the `no db change` label to your PR, and you're good to merge. +- [ ] If yes, add the `db change` label to your PR. You'll receive a message explaining what to do. + +### Reminders when modifying the API + +- [ ] Update the openAPI file with utoipa: + - [ ] If a new module has been introduced, create a new structure deriving [the OpenAPI proc-macro](https://docs.rs/utoipa/latest/utoipa/derive.OpenApi.html) and nest it in the main [openAPI structure](https://github.com/meilisearch/meilisearch/blob/f2185438eed60fa32d25b15480c5ee064f6fba4a/crates/meilisearch/src/routes/mod.rs#L64-L78). + - [ ] If a new route has been introduced, add the [path decorator](https://docs.rs/utoipa/latest/utoipa/attr.path.html) to it and add the route at the top of the file in its openAPI structure. + - [ ] If a structure which is deserialized or serialized in the API has been introduced or modified, it must derive the [`schema`](https://docs.rs/utoipa/latest/utoipa/macro.schema.html) or the [`IntoParams`](https://docs.rs/utoipa/latest/utoipa/derive.IntoParams.html) proc-macro. + If it's a **new** structure, you must also add it to the big list of structures [in the main `OpenApi` structure](https://github.com/meilisearch/meilisearch/blob/f2185438eed60fa32d25b15480c5ee064f6fba4a/crates/meilisearch/src/routes/mod.rs#L88). + - [ ] Once everything is done, start Meilisearch with the swagger flag: `cargo run --features swagger`, open `http://localhost:7700/scalar` in your browser, and ensure everything works as expected. + - For more info, refer to [this presentation](https://pitch.com/v/generating-the-openapi-file-jrn3nh).
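As a rough illustration of the utoipa reminders in the checklist above, here is a minimal, hedged sketch (assuming `utoipa` as a dependency). The names `DemoQuery`, `DemoResponse`, `get_demo`, and `DemoApi` are hypothetical and do not correspond to actual Meilisearch routes; in the real code base the per-module `OpenApi` structures are nested into the main `OpenApi` structure in `crates/meilisearch/src/routes/mod.rs`, as linked in the checklist.

```rust
#![allow(dead_code)]
// A minimal sketch of the utoipa annotations described in the checklist above.
// `DemoQuery`, `DemoResponse`, `get_demo`, and `DemoApi` are hypothetical names,
// not actual Meilisearch routes.
use utoipa::{IntoParams, OpenApi, ToSchema};

/// Query parameters deserialized by the route derive `IntoParams`.
#[derive(IntoParams)]
#[into_params(parameter_in = Query)]
struct DemoQuery {
    /// Optional free-text filter.
    q: Option<String>,
}

/// Structures serialized in responses derive `ToSchema`.
#[derive(ToSchema)]
struct DemoResponse {
    message: String,
}

/// The path decorator describes the route for the openAPI file.
#[utoipa::path(
    get,
    path = "/demo",
    params(DemoQuery),
    responses((status = 200, description = "The demo answer", body = DemoResponse))
)]
fn get_demo() {
    // Handler body elided; in Meilisearch this would be an actix-web handler.
}

/// Per-module OpenAPI structure, which would then be nested into the main
/// `OpenApi` structure (and `DemoResponse` added to its list of schemas),
/// as the checklist describes.
#[derive(OpenApi)]
#[openapi(paths(get_demo), components(schemas(DemoResponse)))]
struct DemoApi;

fn main() {
    // Print the specification fragment generated from the annotations above.
    println!("{}", DemoApi::openapi().to_pretty_json().unwrap());
}
```

Running this prints the generated specification fragment, roughly the kind of content that ends up in the document browsable at `http://localhost:7700/scalar` when Meilisearch is started with `cargo run --features swagger`.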
+ ### Reminders when modifying the Setting API diff --git a/.github/workflows/bench-manual.yml b/.github/workflows/bench-manual.yml index 3f2b67d52..afa408bea 100644 --- a/.github/workflows/bench-manual.yml +++ b/.github/workflows/bench-manual.yml @@ -1,28 +1,27 @@ name: Bench (manual) on: - workflow_dispatch: - inputs: - workload: - description: 'The path to the workloads to execute (workloads/...)' - required: true - default: 'workloads/movies.json' + workflow_dispatch: + inputs: + workload: + description: "The path to the workloads to execute (workloads/...)" + required: true + default: "workloads/movies.json" env: - WORKLOAD_NAME: ${{ github.event.inputs.workload }} + WORKLOAD_NAME: ${{ github.event.inputs.workload }} jobs: - benchmarks: - name: Run and upload benchmarks - runs-on: benchmarks - timeout-minutes: 180 # 3h - steps: - - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 - with: - profile: minimal - - - name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }} - run: | - cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME} + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.85 + with: + profile: minimal + - name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME} diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 632c86d4e..b533b47c5 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -1,82 +1,82 @@ name: Bench (PR) on: - issue_comment: - types: [created] + issue_comment: + types: [created] permissions: - issues: write + issues: write env: - GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} + GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} jobs: - run-benchmarks-on-comment: - if: startsWith(github.event.comment.body, '/bench') - name: Run and upload benchmarks - runs-on: benchmarks - timeout-minutes: 180 # 3h - steps: - - name: Check permissions - id: permission - env: - PR_AUTHOR: ${{github.event.issue.user.login }} - COMMENT_AUTHOR: ${{github.event.comment.user.login }} - REPOSITORY: ${{github.repository}} - PR_ID: ${{github.event.issue.number}} - run: | - PR_REPOSITORY=$(gh api /repos/"$REPOSITORY"/pulls/"$PR_ID" --jq .head.repo.full_name) - if $(gh api /repos/"$REPOSITORY"/collaborators/"$PR_AUTHOR"/permission --jq .user.permissions.push) - then - echo "::notice title=Authentication success::PR author authenticated" - else - echo "::error title=Authentication error::PR author doesn't have push permission on this repository" - exit 1 - fi - if $(gh api /repos/"$REPOSITORY"/collaborators/"$COMMENT_AUTHOR"/permission --jq .user.permissions.push) - then - echo "::notice title=Authentication success::Comment author authenticated" - else - echo "::error title=Authentication error::Comment author doesn't have push permission on this repository" - exit 1 - fi - if [ "$PR_REPOSITORY" = "$REPOSITORY" ] - then - echo "::notice 
title=Authentication success::PR started from main repository" - else - echo "::error title=Authentication error::PR started from a fork" - exit 1 - fi + run-benchmarks-on-comment: + if: startsWith(github.event.comment.body, '/bench') + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - name: Check permissions + id: permission + env: + PR_AUTHOR: ${{github.event.issue.user.login }} + COMMENT_AUTHOR: ${{github.event.comment.user.login }} + REPOSITORY: ${{github.repository}} + PR_ID: ${{github.event.issue.number}} + run: | + PR_REPOSITORY=$(gh api /repos/"$REPOSITORY"/pulls/"$PR_ID" --jq .head.repo.full_name) + if $(gh api /repos/"$REPOSITORY"/collaborators/"$PR_AUTHOR"/permission --jq .user.permissions.push) + then + echo "::notice title=Authentication success::PR author authenticated" + else + echo "::error title=Authentication error::PR author doesn't have push permission on this repository" + exit 1 + fi + if $(gh api /repos/"$REPOSITORY"/collaborators/"$COMMENT_AUTHOR"/permission --jq .user.permissions.push) + then + echo "::notice title=Authentication success::Comment author authenticated" + else + echo "::error title=Authentication error::Comment author doesn't have push permission on this repository" + exit 1 + fi + if [ "$PR_REPOSITORY" = "$REPOSITORY" ] + then + echo "::notice title=Authentication success::PR started from main repository" + else + echo "::error title=Authentication error::PR started from a fork" + exit 1 + fi - - name: Check for Command - id: command - uses: xt0rted/slash-command-action@v2 - with: - command: bench - reaction-type: "rocket" - repo-token: ${{ env.GH_TOKEN }} + - name: Check for Command + id: command + uses: xt0rted/slash-command-action@v2 + with: + command: bench + reaction-type: "rocket" + repo-token: ${{ env.GH_TOKEN }} - - uses: xt0rted/pull-request-comment-branch@v3 - id: comment-branch - with: - repo_token: ${{ env.GH_TOKEN }} + - uses: xt0rted/pull-request-comment-branch@v3 + id: comment-branch + with: + repo_token: ${{ env.GH_TOKEN }} - - uses: actions/checkout@v3 - if: success() - with: - fetch-depth: 0 # fetch full history to be able to get main commit sha - ref: ${{ steps.comment-branch.outputs.head_ref }} + - uses: actions/checkout@v3 + if: success() + with: + fetch-depth: 0 # fetch full history to be able to get main commit sha + ref: ${{ steps.comment-branch.outputs.head_ref }} - - uses: dtolnay/rust-toolchain@1.79 - with: - profile: minimal + - uses: dtolnay/rust-toolchain@1.85 + with: + profile: minimal - - name: Run benchmarks on PR ${{ github.event.issue.id }} - run: | - cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" \ - --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" \ - --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" \ - -- ${{ steps.command.outputs.command-arguments }} > benchlinks.txt + - name: Run benchmarks on PR ${{ github.event.issue.id }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" \ + --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" \ + --reason "[Comment](${{ github.event.comment.html_url }}) on [#${{ github.event.issue.number }}](${{ github.event.issue.html_url }})" \ + -- ${{ steps.command.outputs.command-arguments }} > benchlinks.txt - - name: Send comment in PR - run: | - gh pr comment ${{github.event.issue.number}} --body-file benchlinks.txt + - name: Send comment in PR + run: | + gh pr comment ${{github.event.issue.number}} 
--body-file benchlinks.txt diff --git a/.github/workflows/bench-push-indexing.yml b/.github/workflows/bench-push-indexing.yml index 0d9975eb7..f35f60398 100644 --- a/.github/workflows/bench-push-indexing.yml +++ b/.github/workflows/bench-push-indexing.yml @@ -1,23 +1,22 @@ name: Indexing bench (push) on: - push: - branches: - - main + push: + branches: + - main jobs: - benchmarks: - name: Run and upload benchmarks - runs-on: benchmarks - timeout-minutes: 180 # 3h - steps: - - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 - with: - profile: minimal - - # Run benchmarks - - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }} - run: | - cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 180 # 3h + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.85 + with: + profile: minimal + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }} + run: | + cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json diff --git a/.github/workflows/benchmarks-manual.yml b/.github/workflows/benchmarks-manual.yml index 14b77c83d..27e736979 100644 --- a/.github/workflows/benchmarks-manual.yml +++ b/.github/workflows/benchmarks-manual.yml @@ -4,9 +4,9 @@ on: workflow_dispatch: inputs: dataset_name: - description: 'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)' + description: "The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)" required: false - default: 'search_songs' + default: "search_songs" env: BENCH_NAME: ${{ github.event.inputs.dataset_name }} @@ -18,7 +18,7 @@ jobs: timeout-minutes: 4320 # 72h steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal @@ -67,7 +67,7 @@ jobs: out_dir: critcmp_results # Helper - - name: 'README: compare with another benchmark' + - name: "README: compare with another benchmark" run: | echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' 
diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml index 7c081932a..ad669b648 100644 --- a/.github/workflows/benchmarks-pr.yml +++ b/.github/workflows/benchmarks-pr.yml @@ -44,7 +44,7 @@ jobs: exit 1 fi - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal diff --git a/.github/workflows/benchmarks-push-indexing.yml b/.github/workflows/benchmarks-push-indexing.yml index 4495b4b9d..996162d9c 100644 --- a/.github/workflows/benchmarks-push-indexing.yml +++ b/.github/workflows/benchmarks-push-indexing.yml @@ -16,7 +16,7 @@ jobs: timeout-minutes: 4320 # 72h steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal @@ -69,7 +69,7 @@ jobs: run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug # Helper - - name: 'README: compare with another benchmark' + - name: "README: compare with another benchmark" run: | echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' diff --git a/.github/workflows/benchmarks-push-search-geo.yml b/.github/workflows/benchmarks-push-search-geo.yml index 22218cd6e..e9a81c6a3 100644 --- a/.github/workflows/benchmarks-push-search-geo.yml +++ b/.github/workflows/benchmarks-push-search-geo.yml @@ -15,7 +15,7 @@ jobs: runs-on: benchmarks steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal @@ -68,7 +68,7 @@ jobs: run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug # Helper - - name: 'README: compare with another benchmark' + - name: "README: compare with another benchmark" run: | echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' diff --git a/.github/workflows/benchmarks-push-search-songs.yml b/.github/workflows/benchmarks-push-search-songs.yml index e9744a434..e5019063e 100644 --- a/.github/workflows/benchmarks-push-search-songs.yml +++ b/.github/workflows/benchmarks-push-search-songs.yml @@ -15,7 +15,7 @@ jobs: runs-on: benchmarks steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal @@ -68,7 +68,7 @@ jobs: run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug # Helper - - name: 'README: compare with another benchmark' + - name: "README: compare with another benchmark" run: | echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' 
diff --git a/.github/workflows/benchmarks-push-search-wiki.yml b/.github/workflows/benchmarks-push-search-wiki.yml index bc9e1bcd0..1e9d97a6e 100644 --- a/.github/workflows/benchmarks-push-search-wiki.yml +++ b/.github/workflows/benchmarks-push-search-wiki.yml @@ -15,7 +15,7 @@ jobs: runs-on: benchmarks steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal @@ -68,7 +68,7 @@ jobs: run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug # Helper - - name: 'README: compare with another benchmark' + - name: "README: compare with another benchmark" run: | echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' diff --git a/.github/workflows/check-valid-milestone.yml b/.github/workflows/check-valid-milestone.yml new file mode 100644 index 000000000..0e9357050 --- /dev/null +++ b/.github/workflows/check-valid-milestone.yml @@ -0,0 +1,100 @@ +name: PR Milestone Check + +on: + pull_request: + types: [opened, reopened, edited, synchronize, milestoned, demilestoned] + branches: + - "main" + - "release-v*.*.*" + +jobs: + check-milestone: + name: Check PR Milestone + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Validate PR milestone + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + // Get PR number directly from the event payload + const prNumber = context.payload.pull_request.number; + + // Get PR details + const { data: prData } = await github.rest.pulls.get({ + owner: 'meilisearch', + repo: 'meilisearch', + pull_number: prNumber + }); + + // Get base branch name + const baseBranch = prData.base.ref; + console.log(`Base branch: ${baseBranch}`); + + // Get PR milestone + const prMilestone = prData.milestone; + if (!prMilestone) { + core.setFailed('PR must have a milestone assigned'); + return; + } + console.log(`PR milestone: ${prMilestone.title}`); + + // Validate milestone format: vx.y.z + const milestoneRegex = /^v\d+\.\d+\.\d+$/; + if (!milestoneRegex.test(prMilestone.title)) { + core.setFailed(`Milestone "${prMilestone.title}" does not follow the required format vx.y.z`); + return; + } + + // For main branch PRs, check if the milestone is the highest one + if (baseBranch === 'main') { + // Get all milestones + const { data: milestones } = await github.rest.issues.listMilestones({ + owner: 'meilisearch', + repo: 'meilisearch', + state: 'open', + sort: 'due_on', + direction: 'desc' + }); + + // Sort milestones by version number (vx.y.z) + const sortedMilestones = milestones + .filter(m => milestoneRegex.test(m.title)) + .sort((a, b) => { + const versionA = a.title.substring(1).split('.').map(Number); + const versionB = b.title.substring(1).split('.').map(Number); + + // Compare major version + if (versionA[0] !== versionB[0]) return versionB[0] - versionA[0]; + // Compare minor version + if (versionA[1] !== versionB[1]) return versionB[1] - versionA[1]; + // Compare patch version + return versionB[2] - versionA[2]; + }); + + if (sortedMilestones.length === 0) { + core.setFailed('No valid milestones found in the repository. 
Please create at least one milestone with the format vx.y.z'); + return; + } + + const highestMilestone = sortedMilestones[0]; + console.log(`Highest milestone: ${highestMilestone.title}`); + + if (prMilestone.title !== highestMilestone.title) { + core.setFailed(`PRs targeting the main branch must use the highest milestone (${highestMilestone.title}), but this PR uses ${prMilestone.title}`); + return; + } + } else { + // For release branches, the milestone should match the branch version + const branchVersion = baseBranch.substring(8); // remove 'release-' + if (prMilestone.title !== branchVersion) { + core.setFailed(`PRs targeting release branch "${baseBranch}" must use the matching milestone "${branchVersion}", but this PR uses "${prMilestone.title}"`); + return; + } + } + + console.log('PR milestone validation passed!'); diff --git a/.github/workflows/db-change-comments.yml b/.github/workflows/db-change-comments.yml new file mode 100644 index 000000000..794142044 --- /dev/null +++ b/.github/workflows/db-change-comments.yml @@ -0,0 +1,57 @@ +name: Comment when db change labels are added + +on: + pull_request: + types: [labeled] + +env: + MESSAGE: | + ### Hello, I'm a bot šŸ¤– + + You are receiving this message because you declared that this PR makes changes to the Meilisearch database. + Depending on the nature of the change, additional actions might be required on your part. The following sections detail these additional actions; please copy the relevant section into the description of your PR and make sure to perform the required actions. + + Thank you for contributing to Meilisearch :heart: + + ## This PR makes forward-compatible changes + + *Forward-compatible changes are changes to the database such that databases created in an older version of Meilisearch are still valid in the new version of Meilisearch. They usually represent additive changes, like adding a new optional attribute or setting.* + + - [ ] Detail the changes to the DB format and why they are forward compatible + - [ ] Forward-compatibility: A database created before this PR and using the features touched by this PR was able to be opened by a Meilisearch produced by the code of this PR. + + + ## This PR makes breaking changes + + *Breaking changes are changes to the database such that databases created in an older version of Meilisearch need changes to remain valid in the new version of Meilisearch. This typically happens when the way to store the data changed (change of database, new required key, etc). This can also happen due to breaking changes in the API of an experimental feature. āš ļø This kind of change is more difficult to achieve safely, so proceed with caution and test the dumpless upgrade right before merging the PR.* + + - [ ] Detail the changes to the DB format, + - [ ] which are compatible, and why + - [ ] which are not compatible, why, and how they will be fixed up in the upgrade + - [ ] /!\ Ensure all the read operations still work!
+ - If the change happened in milli, you may need to check the version of the database before doing any read operation + - If the change happened in the index-scheduler, make sure the new code can immediately read the old database + - If the change happened in the meilisearch-auth database, reach out to the team; we don't know yet how to handle these changes + - [ ] Write the code to go from the old database to the new one + - If the change happened in milli, the upgrade function should be written and called [here](https://github.com/meilisearch/meilisearch/blob/3fd86e8d76d7d468b0095d679adb09211ca3b6c0/crates/milli/src/update/upgrade/mod.rs#L24-L47) + - If the change happened in the index-scheduler, we've never done it yet, but the right place to do it should be [here](https://github.com/meilisearch/meilisearch/blob/3fd86e8d76d7d468b0095d679adb09211ca3b6c0/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs#L13) + - [ ] Write an integration test [here](https://github.com/meilisearch/meilisearch/blob/main/crates/meilisearch/tests/upgrade/mod.rs) ensuring you can read the old database, upgrade to the new database, and read the new database as expected + + +jobs: + add-comment: + runs-on: ubuntu-latest + if: github.event.label.name == 'db change' + steps: + - name: Add comment + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const message = process.env.MESSAGE; + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: message + }) diff --git a/.github/workflows/db-change-missing.yml b/.github/workflows/db-change-missing.yml new file mode 100644 index 000000000..dbd6bb82c --- /dev/null +++ b/.github/workflows/db-change-missing.yml @@ -0,0 +1,28 @@ +name: Check db change labels + +on: + pull_request: + types: [opened, synchronize, reopened, labeled, unlabeled] + +env: + GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} + +jobs: + check-labels: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Check db change labels + id: check_labels + run: | + URL=/repos/meilisearch/meilisearch/pulls/${{ github.event.pull_request.number }}/labels + echo ${{ github.event.pull_request.number }} + echo $URL + LABELS=$(gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" /repos/meilisearch/meilisearch/issues/${{ github.event.pull_request.number }}/labels -q .[].name) + if [[ ! "$LABELS" =~ "db change" && ! "$LABELS" =~ "no db change" ]]; then + echo "::error::Pull request must contain either the 'db change' or 'no db change' label." 
+ exit 1 + else + echo "The label is set" + fi diff --git a/.github/workflows/flaky-tests.yml b/.github/workflows/flaky-tests.yml index 3fa9c549c..66be5b823 100644 --- a/.github/workflows/flaky-tests.yml +++ b/.github/workflows/flaky-tests.yml @@ -9,22 +9,22 @@ jobs: flaky: runs-on: ubuntu-latest container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 steps: - - uses: actions/checkout@v3 - - name: Install needed dependencies - run: | - apt-get update && apt-get install -y curl - apt-get install build-essential -y - - uses: dtolnay/rust-toolchain@1.79 - - name: Install cargo-flaky - run: cargo install cargo-flaky - - name: Run cargo flaky in the dumps - run: cd crates/dump; cargo flaky -i 100 --release - - name: Run cargo flaky in the index-scheduler - run: cd crates/index-scheduler; cargo flaky -i 100 --release - - name: Run cargo flaky in the auth - run: cd crates/meilisearch-auth; cargo flaky -i 100 --release - - name: Run cargo flaky in meilisearch - run: cd crates/meilisearch; cargo flaky -i 100 --release + - uses: actions/checkout@v3 + - name: Install needed dependencies + run: | + apt-get update && apt-get install -y curl + apt-get install build-essential -y + - uses: dtolnay/rust-toolchain@1.85 + - name: Install cargo-flaky + run: cargo install cargo-flaky + - name: Run cargo flaky in the dumps + run: cd crates/dump; cargo flaky -i 100 --release + - name: Run cargo flaky in the index-scheduler + run: cd crates/index-scheduler; cargo flaky -i 100 --release + - name: Run cargo flaky in the auth + run: cd crates/meilisearch-auth; cargo flaky -i 100 --release + - name: Run cargo flaky in meilisearch + run: cd crates/meilisearch; cargo flaky -i 100 --release diff --git a/.github/workflows/fuzzer-indexing.yml b/.github/workflows/fuzzer-indexing.yml index ad0962802..cf7dd5bdc 100644 --- a/.github/workflows/fuzzer-indexing.yml +++ b/.github/workflows/fuzzer-indexing.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 4320 # 72h steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal diff --git a/.github/workflows/milestone-workflow.yml b/.github/workflows/milestone-workflow.yml index c15684661..eb78bf8a8 100644 --- a/.github/workflows/milestone-workflow.yml +++ b/.github/workflows/milestone-workflow.yml @@ -5,6 +5,7 @@ name: Milestone's workflow # For each Milestone created (not opened!), and if the release is NOT a patch release (only the patch changed) # - the roadmap issue is created, see https://github.com/meilisearch/engine-team/blob/main/issue-templates/roadmap-issue.md # - the changelog issue is created, see https://github.com/meilisearch/engine-team/blob/main/issue-templates/changelog-issue.md +# - update the ruleset to add the current release version to the list of allowed versions and be able to use the merge queue. 
# For each Milestone closed # - the `release_version` label is created @@ -21,10 +22,9 @@ env: GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} jobs: - -# ----------------- -# MILESTONE CREATED -# ----------------- + # ----------------- + # MILESTONE CREATED + # ----------------- get-release-version: if: github.event.action == 'created' @@ -148,9 +148,37 @@ jobs: --body-file $ISSUE_TEMPLATE \ --milestone $MILESTONE_VERSION -# ---------------- -# MILESTONE CLOSED -# ---------------- + update-ruleset: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Install jq + run: | + sudo apt-get update + sudo apt-get install -y jq + - name: Update ruleset + env: + # gh api repos/meilisearch/meilisearch/rulesets --jq '.[] | {name: .name, id: .id}' + RULESET_ID: 4253297 + BRANCH_NAME: ${{ github.event.inputs.branch_name }} + run: | + # Get current ruleset conditions + CONDITIONS=$(gh api repos/meilisearch/meilisearch/rulesets/$RULESET_ID --jq '{ conditions: .conditions }') + + # Update the conditions by appending the milestone version + UPDATED_CONDITIONS=$(echo $CONDITIONS | jq '.conditions.ref_name.include += ["refs/heads/release-'$MILESTONE_VERSION'"]') + + # Update the ruleset from stdin (-) + echo $UPDATED_CONDITIONS | + gh api repos/meilisearch/meilisearch/rulesets/$RULESET_ID \ + --method PUT \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + --input - + + # ---------------- + # MILESTONE CLOSED + # ---------------- create-release-label: if: github.event.action == 'closed' diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml index 546ec1bee..e6adfca57 100644 --- a/.github/workflows/publish-apt-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -18,28 +18,28 @@ jobs: runs-on: ubuntu-latest needs: check-version container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 steps: - - name: Install needed dependencies - run: | - apt-get update && apt-get install -y curl - apt-get install build-essential -y - - uses: dtolnay/rust-toolchain@1.79 - - name: Install cargo-deb - run: cargo install cargo-deb - - uses: actions/checkout@v3 - - name: Build deb package - run: cargo deb -p meilisearch -o target/debian/meilisearch.deb - - name: Upload debian pkg to release - uses: svenstaro/upload-release-action@2.7.0 - with: - repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} - file: target/debian/meilisearch.deb - asset_name: meilisearch.deb - tag: ${{ github.ref }} - - name: Upload debian pkg to apt repository - run: curl -F package=@target/debian/meilisearch.deb https://${{ secrets.GEMFURY_PUSH_TOKEN }}@push.fury.io/meilisearch/ + - name: Install needed dependencies + run: | + apt-get update && apt-get install -y curl + apt-get install build-essential -y + - uses: dtolnay/rust-toolchain@1.85 + - name: Install cargo-deb + run: cargo install cargo-deb + - uses: actions/checkout@v3 + - name: Build deb package + run: cargo deb -p meilisearch -o target/debian/meilisearch.deb + - name: Upload debian pkg to release + uses: svenstaro/upload-release-action@2.7.0 + with: + repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} + file: target/debian/meilisearch.deb + asset_name: meilisearch.deb + tag: ${{ github.ref }} + - name: Upload debian pkg to apt repository + run: curl -F package=@target/debian/meilisearch.deb https://${{ secrets.GEMFURY_PUSH_TOKEN }}@push.fury.io/meilisearch/ homebrew: name: Bump Homebrew formula diff --git 
a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index c53946fea..885a04d0d 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -3,7 +3,7 @@ name: Publish binaries to GitHub release on: workflow_dispatch: schedule: - - cron: '0 2 * * *' # Every day at 2:00am + - cron: "0 2 * * *" # Every day at 2:00am release: types: [published] @@ -37,26 +37,26 @@ jobs: runs-on: ubuntu-latest needs: check-version container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 steps: - - uses: actions/checkout@v3 - - name: Install needed dependencies - run: | - apt-get update && apt-get install -y curl - apt-get install build-essential -y - - uses: dtolnay/rust-toolchain@1.79 - - name: Build - run: cargo build --release --locked - # No need to upload binaries for dry run (cron) - - name: Upload binaries to release - if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 - with: - repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} - file: target/release/meilisearch - asset_name: meilisearch-linux-amd64 - tag: ${{ github.ref }} + - uses: actions/checkout@v3 + - name: Install needed dependencies + run: | + apt-get update && apt-get install -y curl + apt-get install build-essential -y + - uses: dtolnay/rust-toolchain@1.85 + - name: Build + run: cargo build --release --locked + # No need to upload binaries for dry run (cron) + - name: Upload binaries to release + if: github.event_name == 'release' + uses: svenstaro/upload-release-action@2.7.0 + with: + repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} + file: target/release/meilisearch + asset_name: meilisearch-linux-amd64 + tag: ${{ github.ref }} publish-macos-windows: name: Publish binary for ${{ matrix.os }} @@ -74,19 +74,19 @@ jobs: artifact_name: meilisearch.exe asset_name: meilisearch-windows-amd64.exe steps: - - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 - - name: Build - run: cargo build --release --locked - # No need to upload binaries for dry run (cron) - - name: Upload binaries to release - if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 - with: - repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} - file: target/release/${{ matrix.artifact_name }} - asset_name: ${{ matrix.asset_name }} - tag: ${{ github.ref }} + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@1.85 + - name: Build + run: cargo build --release --locked + # No need to upload binaries for dry run (cron) + - name: Upload binaries to release + if: github.event_name == 'release' + uses: svenstaro/upload-release-action@2.7.0 + with: + repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} + file: target/release/${{ matrix.artifact_name }} + asset_name: ${{ matrix.asset_name }} + tag: ${{ github.ref }} publish-macos-apple-silicon: name: Publish binary for macOS silicon @@ -101,7 +101,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Installing Rust toolchain - uses: dtolnay/rust-toolchain@1.79 + uses: dtolnay/rust-toolchain@1.85 with: profile: minimal target: ${{ matrix.target }} @@ -127,8 +127,8 @@ jobs: env: DEBIAN_FRONTEND: noninteractive container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 strategy: matrix: include: @@ -148,7 +148,7 @@ jobs: add-apt-repository "deb [arch=$(dpkg --print-architecture)] https://download.docker.com/linux/ubuntu 
$(lsb_release -cs) stable" apt-get update -y && apt-get install -y docker-ce - name: Installing Rust toolchain - uses: dtolnay/rust-toolchain@1.79 + uses: dtolnay/rust-toolchain@1.85 with: profile: minimal target: ${{ matrix.target }} diff --git a/.github/workflows/sdks-tests.yml b/.github/workflows/sdks-tests.yml index 61b33e2fc..fc9e5770e 100644 --- a/.github/workflows/sdks-tests.yml +++ b/.github/workflows/sdks-tests.yml @@ -52,7 +52,7 @@ jobs: - name: Setup .NET Core uses: actions/setup-dotnet@v4 with: - dotnet-version: "6.0.x" + dotnet-version: "8.0.x" - name: Install dependencies run: dotnet restore - name: Build diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index e142b15b6..8daa32e35 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -4,13 +4,9 @@ on: workflow_dispatch: schedule: # Everyday at 5:00am - - cron: '0 5 * * *' + - cron: "0 5 * * *" pull_request: - push: - # trying and staging branches are for Bors config - branches: - - trying - - staging + merge_group: env: CARGO_TERM_COLOR: always @@ -19,21 +15,21 @@ env: jobs: test-linux: - name: Tests on ubuntu-20.04 + name: Tests on ubuntu-22.04 runs-on: ubuntu-latest container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install needed dependencies run: | apt-get update && apt-get install -y curl apt-get install build-essential -y - name: Setup test with Rust stable - uses: dtolnay/rust-toolchain@1.79 + uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.5 + uses: Swatinem/rust-cache@v2.7.7 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -55,8 +51,8 @@ jobs: steps: - uses: actions/checkout@v3 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.5 - - uses: dtolnay/rust-toolchain@1.79 + uses: Swatinem/rust-cache@v2.7.7 + - uses: dtolnay/rust-toolchain@1.85 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -72,8 +68,8 @@ jobs: name: Tests almost all features runs-on: ubuntu-latest container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' steps: - uses: actions/checkout@v3 @@ -81,19 +77,51 @@ jobs: run: | apt-get update apt-get install --assume-yes build-essential curl - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 - name: Run cargo build with almost all features run: | - cargo build --workspace --locked --release --features "$(cargo xtask list-features --exclude-feature cuda)" + cargo build --workspace --locked --release --features "$(cargo xtask list-features --exclude-feature cuda,test-ollama)" - name: Run cargo test with almost all features run: | - cargo test --workspace --locked --release --features "$(cargo xtask list-features --exclude-feature cuda)" + cargo test --workspace --locked --release --features "$(cargo xtask list-features --exclude-feature cuda,test-ollama)" + + ollama-ubuntu: + name: Test with Ollama + runs-on: ubuntu-latest + env: + MEILI_TEST_OLLAMA_SERVER: "http://localhost:11434" + steps: + - uses: actions/checkout@v1 + - name: Install Ollama + run: | + curl -fsSL https://ollama.com/install.sh | sudo -E sh + - name: Start serving + run: | + # Run it 
in the background, there is no way to daemonise at the moment + ollama serve & + + # A short pause is required before the HTTP port is opened + sleep 5 + + # This endpoint blocks until ready + time curl -i http://localhost:11434 + + - name: Pull nomic-embed-text & all-minilm + run: | + ollama pull nomic-embed-text + ollama pull all-minilm + + - name: Run cargo test + uses: actions-rs/cargo@v1 + with: + command: test + args: --locked --release --all --features test-ollama ollama test-disabled-tokenization: name: Test disabled tokenization runs-on: ubuntu-latest container: - image: ubuntu:20.04 + image: ubuntu:22.04 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' steps: - uses: actions/checkout@v3 @@ -101,7 +129,7 @@ jobs: run: | apt-get update apt-get install --assume-yes build-essential curl - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 - name: Run cargo tree without default features and check lindera is not present run: | if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then @@ -117,17 +145,17 @@ jobs: name: Run tests in debug runs-on: ubuntu-latest container: - # Use ubuntu-20.04 to compile with glibc 2.28 - image: ubuntu:20.04 + # Use ubuntu-22.04 to compile with glibc 2.35 + image: ubuntu:22.04 steps: - uses: actions/checkout@v3 - name: Install needed dependencies run: | apt-get update && apt-get install -y curl apt-get install build-essential -y - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.5 + uses: Swatinem/rust-cache@v2.7.7 - name: Run tests in debug uses: actions-rs/cargo@v1 with: @@ -139,12 +167,12 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal components: clippy - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.5 + uses: Swatinem/rust-cache@v2.7.7 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -156,14 +184,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal toolchain: nightly-2024-07-09 override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.5 + uses: Swatinem/rust-cache@v2.7.7 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml index d9d79d595..d13a4404a 100644 --- a/.github/workflows/update-cargo-toml-version.yml +++ b/.github/workflows/update-cargo-toml-version.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: new_version: - description: 'The new version (vX.Y.Z)' + description: "The new version (vX.Y.Z)" required: true env: @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@1.79 + - uses: dtolnay/rust-toolchain@1.85 with: profile: minimal - name: Install sd diff --git a/.gitignore b/.gitignore index 0d6750008..07453a58f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ /dumps /bench /_xtask_benchmark.ms +/benchmarks # Snapshots ## ... 
large diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 092eb10e9..e129e5600 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,6 +95,11 @@ Meilisearch follows the [cargo xtask](https://github.com/matklad/cargo-xtask) wo Run `cargo xtask --help` from the root of the repository to find out what is available. +#### Update the openAPI file if the API changed + +To update the openAPI file in the code, see [sprint_issue.md](https://github.com/meilisearch/meilisearch/blob/main/.github/ISSUE_TEMPLATE/sprint_issue.md#reminders-when-modifying-the-api). +If you want to update the openAPI file on the [open-api repository](https://github.com/meilisearch/open-api), see [update-openapi-issue.md](https://github.com/meilisearch/engine-team/blob/main/issue-templates/update-openapi-issue.md). + ### Logging Meilisearch uses [`tracing`](https://lib.rs/crates/tracing) for logging purposes. Tracing logs are structured and can be displayed as JSON to the end user, so prefer passing arguments as fields rather than interpolating them in the message. @@ -145,7 +150,7 @@ Some notes on GitHub PRs: - The PR title should be accurate and descriptive of the changes. - [Convert your PR as a draft](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request) if your changes are a work in progress: no one will review it until you pass your PR as ready for review.
The draft PRs are recommended when you want to show that you are working on something and make your work visible. -- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [Bors](https://github.com/bors-ng/bors-ng) to automatically enforce this requirement without the PR author having to rebase manually. +- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [GitHub Merge Queues](https://github.blog/news-insights/product-news/github-merge-queue-is-generally-available/) to automatically enforce this requirement without the PR author having to rebase manually. ## Release Process (for internal team only) @@ -153,8 +158,7 @@ Meilisearch tools follow the [Semantic Versioning Convention](https://semver.org ### Automation to rebase and Merge the PRs -This project integrates a bot that helps us manage pull requests merging.
-_[Read more about this](https://github.com/meilisearch/integration-guides/blob/main/resources/bors.md)._ +This project uses GitHub Merge Queues that helps us manage pull requests merging. ### How to Publish a new Release diff --git a/Cargo.lock b/Cargo.lock index 5686f07e9..8efb960dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "actix-codec" @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.8.0" +version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae682f693a9cd7b058f2b0b5d9a6d7728a8555779bedbbc35dd88528611d020" +checksum = "d48f96fc3003717aeb9856ca3d02a8c7de502667ad76eeacd830b48d2e91fac4" dependencies = [ "actix-codec", "actix-rt", @@ -47,7 +47,7 @@ dependencies = [ "actix-utils", "ahash 0.8.11", "base64 0.22.1", - "bitflags 2.6.0", + "bitflags 2.9.0", "brotli", "bytes", "bytestring", @@ -119,7 +119,7 @@ dependencies = [ "actix-utils", "futures-core", "futures-util", - "mio", + "mio 0.8.11", "num_cpus", "socket2 0.4.9", "tokio", @@ -168,9 +168,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.8.0" +version = "4.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1988c02af8d2b718c05bc4aeb6a66395b7cdf32858c2c71131e5637a8c05a9ff" +checksum = "9180d76e5cc7ccbc4d60a506f2c727730b154010262df5b910eb17dbe4b8cb38" dependencies = [ "actix-codec", "actix-http", @@ -191,6 +191,7 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", + "impl-more", "itoa", "language-tags", "log", @@ -234,6 +235,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "aes" version = "0.8.4" @@ -296,9 +303,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "anes" @@ -322,9 +329,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" @@ -356,9 +363,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" dependencies = [ "backtrace", ] @@ -371,9 +378,9 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +checksum = 
"dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" dependencies = [ "derive_arbitrary", ] @@ -386,41 +393,24 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arroy" -version = "0.5.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e" +checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6" dependencies = [ "bytemuck", "byteorder", + "enum-iterator", "heed", - "log", "memmap2", "nohash", "ordered-float", + "page_size", "rand", "rayon", "roaring", "tempfile", - "thiserror", -] - -[[package]] -name = "arroy" -version = "0.5.0" -source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7" -dependencies = [ - "bytemuck", - "byteorder", - "heed", - "log", - "memmap2", - "nohash", - "ordered-float", - "rand", - "rayon", - "roaring", - "tempfile", - "thiserror", + "thiserror 2.0.9", + "tracing", ] [[package]] @@ -435,9 +425,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", @@ -466,7 +456,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.2", "object", "rustc-demangle", ] @@ -489,9 +479,14 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bbqueue" +version = "0.5.1" +source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" + [[package]] name = "benchmarks" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", "bumpalo", @@ -537,16 +532,14 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.4" +version = "0.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00dc851838a2120612785d195287475a3ac45514741da670b735818822129a0" +checksum = "f49d8fed880d473ea71efb9bf597651e77201bdd4893efe54c9e5d65ae04ce6f" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "cexpr", "clang-sys", - "itertools 0.12.1", - "lazy_static", - "lazycell", + "itertools 0.13.0", "proc-macro2", "quote", "regex", @@ -589,9 +582,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.6.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" dependencies = [ "serde", ] @@ -673,9 +666,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.9.1" +version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" dependencies = [ "memchr", "regex-automata", @@ -684,7 +677,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", "time", @@ -702,10 +695,24 @@ dependencies = [ ] [[package]] -name = "byte-unit" 
-version = "5.1.4" +name = "bumparaw-collections" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ac19bdf0b2665407c39d82dbc937e951e7e2001609f0fb32edd0af45a2d63e" +checksum = "4ce682bdc86c2e25ef5cd95881d9d6a1902214eddf74cf9ffea88fe1464377e8" +dependencies = [ + "allocator-api2", + "bitpacking", + "bumpalo", + "hashbrown 0.15.2", + "serde", + "serde_json", +] + +[[package]] +name = "byte-unit" +version = "5.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cd29c3c585209b0cbc7309bfe3ed7efd8c84c21b7af29c8bfae908f8777174" dependencies = [ "rust_decimal", "serde", @@ -742,18 +749,18 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" -version = "1.19.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +checksum = "ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.6.0" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4da9a32f3fed317401fa3c862968128267c3106685286e15d5aaa3d7389c2f60" +checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1" dependencies = [ "proc-macro2", "quote", @@ -768,9 +775,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "bytestring" @@ -813,15 +820,15 @@ dependencies = [ [[package]] name = "candle-core" -version = "0.6.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5b18de020c2729dbf7ac390325312644808b6ba9b7962f1f724e9185b1d53c7" +checksum = "855dfedff437d2681d68e1f34ae559d88b0dd84aa5a6b63f2c8e75ebdd875bbf" dependencies = [ "byteorder", "candle-kernels", "cudarc", "gemm", - "half 2.4.0", + "half 2.4.1", "memmap2", "num-traits", "num_cpus", @@ -829,45 +836,47 @@ dependencies = [ "rand_distr", "rayon", "safetensors", - "thiserror", + "thiserror 1.0.69", + "ug", + "ug-cuda", "yoke", "zip 1.1.4", ] [[package]] name = "candle-kernels" -version = "0.6.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bc0a71be8b2f0950b63fd602a5e10a74a4f94a5fd63059ae455e96163389488" +checksum = "53343628fa470b7075c28c589b98735b4220b464e37ddbb8e117040e199f4787" dependencies = [ "bindgen_cuda", ] [[package]] name = "candle-nn" -version = "0.6.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b006b30f66a0d94fc9cef0ac4de6ce510565f35ae2c6c35ce5d4aacfb0fc8eeb" +checksum = "ddd3c6b2ee0dfd64af12ae5b07e4b7c517898981cdaeffcb10b71d7dd5c8f359" dependencies = [ "candle-core", - "half 2.4.0", + "half 2.4.1", "num-traits", "rayon", "safetensors", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "candle-transformers" -version = "0.6.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f0d4eb6a0d9279d5829b06b2bf3caa117904eefd6dcf879d16e687c4a84034c" +checksum = "4270cc692c4a3df2051c2e8c3c4da3a189746af7ca3a547b99ecd335582b92e1" dependencies 
= [ "byteorder", "candle-core", "candle-nn", - "fancy-regex 0.13.0", + "fancy-regex", "num-traits", "rand", "rayon", @@ -888,23 +897,23 @@ dependencies = [ [[package]] name = "cargo_metadata" -version = "0.18.1" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d886547e41f740c616ae73108f6eb70afe6d940c7bc697cb30f13daec073037" +checksum = "8769706aad5d996120af43197bf46ef6ad0fda35216b4505f926a365a232d924" dependencies = [ "camino", "cargo-platform", "semver", "serde", "serde_json", - "thiserror", + "thiserror 2.0.9", ] [[package]] name = "cargo_toml" -version = "0.20.3" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4895c018bb228aa6b3ba1a0285543fcb4b704734c3fb1f72afaa75aa769500c1" +checksum = "5fbd1fe9db3ebf71b89060adaf7b0504c2d6a425cf061313099547e382c2e472" dependencies = [ "serde", "toml", @@ -918,13 +927,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.104" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" dependencies = [ "jobserver", "libc", - "once_cell", + "shlex", ] [[package]] @@ -969,8 +978,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650d52f87a36472ea1c803dee49d6bfd23d426efa9363e2f4c4a0e6a236d3407" dependencies = [ "aho-corasick", "csv", @@ -1038,9 +1048,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.9" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" +checksum = "9560b07a799281c7e0958b9296854d6fafd4c5f31444a7e5bb1ad6dde5ccf1bd" dependencies = [ "clap_builder", "clap_derive", @@ -1048,9 +1058,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.9" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" +checksum = "874e0dd3eb68bf99058751ac9712f622e61e6f393a94f7128fa26e3f02f5c7cd" dependencies = [ "anstream", "anstyle", @@ -1060,9 +1070,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.8" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -1072,9 +1082,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "color-spantrace" @@ -1172,9 +1182,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.4" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +checksum 
= "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" @@ -1245,24 +1255,11 @@ dependencies = [ "itertools 0.10.5", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] @@ -1319,9 +1316,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -1340,11 +1337,11 @@ dependencies = [ [[package]] name = "cudarc" -version = "0.11.7" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ee2a3fbbd981e1c7ea73cc2af136e754eb22d17436de37155227ee4dbe0cf4" +checksum = "8cd76de2aa3a7bdb9a65941ea5a3c688d941688f736a81b2fc5beb88747a7f25" dependencies = [ - "half 2.4.0", + "half 2.4.1", "libloading", ] @@ -1360,12 +1357,12 @@ dependencies = [ [[package]] name = "darling" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83b2eb4d90d12bdda5ed17de686c2acb4c57914f8f921b8da7e112b5a36f3fe1" +checksum = "6f63b86c8a8826a49b8c21f08a2d07338eec8d900540f8630dc76284be802989" dependencies = [ - "darling_core 0.20.9", - "darling_macro 0.20.9", + "darling_core 0.20.10", + "darling_macro 0.20.10", ] [[package]] @@ -1384,9 +1381,9 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622687fe0bac72a04e5599029151f5796111b90f1baaa9b544d807a5e31cd120" +checksum = "95133861a8032aaea082871032f5815eb9e98cef03fa916ab4500513994df9e5" dependencies = [ "fnv", "ident_case", @@ -1409,11 +1406,11 @@ dependencies = [ [[package]] name = "darling_macro" -version = "0.20.9" +version = "0.20.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733cabb43482b1a1b53eee8583c2b9e8684d592215ea83efd305dd31bc2f0178" +checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ - "darling_core 0.20.9", + "darling_core 0.20.10", "quote", "syn 2.0.87", ] @@ -1447,9 +1444,9 @@ dependencies = [ [[package]] name = "deflate64" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83ace6c86376be0b6cdcf3fb41882e81d94b31587573d1cfa9d01cd06bba210d" +checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" [[package]] name = "deranged" @@ -1463,9 +1460,9 @@ dependencies = [ [[package]] name = "derive_arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" +checksum = 
"30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", @@ -1483,11 +1480,11 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ - "derive_builder_macro 0.20.0", + "derive_builder_macro 0.20.2", ] [[package]] @@ -1504,11 +1501,11 @@ dependencies = [ [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ - "darling 0.20.9", + "darling 0.20.10", "proc-macro2", "quote", "syn 2.0.87", @@ -1526,11 +1523,11 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ - "derive_builder_core 0.20.0", + "derive_builder_core 0.20.2", "syn 2.0.87", ] @@ -1549,9 +1546,9 @@ dependencies = [ [[package]] name = "deserr" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfe971a2a48625fda3198032f35de60939828c4aed47d76715c21698801b985c" +checksum = "7a4da990b9de75d39b5fb69bb0105ec3f1726d43b71b3ddb359cf38470017a56" dependencies = [ "actix-http", "actix-utils", @@ -1566,9 +1563,9 @@ dependencies = [ [[package]] name = "deserr-internal" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae89f00c97a75940185084a826c0aace055774ad57a58211625606449ea0bd8" +checksum = "aadef696fce456c704f10186def1bdc0a40e646c9f4f18cf091477acadb731d8" dependencies = [ "convert_case 0.6.0", "proc-macro2", @@ -1657,12 +1654,12 @@ dependencies = [ [[package]] name = "dump" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", "big_s", "flate2", - "http 1.1.0", + "http 1.2.0", "maplit", "meili-snap", "meilisearch-types", @@ -1673,7 +1670,7 @@ dependencies = [ "serde_json", "tar", "tempfile", - "thiserror", + "thiserror 2.0.9", "time", "tracing", "uuid", @@ -1840,16 +1837,6 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" -[[package]] -name = "fancy-regex" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7493d4c459da9f84325ad297371a6b2b8a162800873a22e3b6b6512e61d18c05" -dependencies = [ - "bit-set", - "regex", -] - [[package]] name = "fancy-regex" version = "0.13.0" @@ -1863,16 +1850,16 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.12.0" +version = "1.14.0" dependencies = [ "tempfile", - "thiserror", + "thiserror 2.0.9", "tracing", "uuid", ] @@ -1891,7 +1878,7 @@ 
dependencies = [ [[package]] name = "filter-parser" -version = "1.12.0" +version = "1.14.0" dependencies = [ "insta", "nom", @@ -1901,22 +1888,31 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.2", ] [[package]] name = "flatten-serde-json" -version = "1.12.0" +version = "1.14.0" dependencies = [ "criterion", "serde_json", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "spin", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1952,9 +1948,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1967,9 +1963,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1977,15 +1973,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1994,15 +1990,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -2011,21 +2007,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2041,7 +2037,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.12.0" +version = "1.14.0" dependencies = [ "arbitrary", "bumpalo", @@ -2069,7 +2065,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce20bbb48248608ba4908b45fe36e17e40f56f8c6bb385ecf5d3c4a1e8b05a22" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "debugid", "fxhash", "serde", @@ -2135,7 +2131,7 @@ checksum = "a2e7ea062c987abcd8db95db917b4ffb4ecdfd0668471d8dc54734fdff2354e8" dependencies = [ "bytemuck", "dyn-stack", - "half 2.4.0", + "half 2.4.1", "num-complex", "num-traits", "once_cell", @@ -2156,7 +2152,7 @@ dependencies = [ "dyn-stack", "gemm-common", "gemm-f32", - "half 2.4.0", + "half 2.4.1", "num-complex", "num-traits", "paste", @@ -2213,9 +2209,9 @@ checksum = "36d244a08113319b5ebcabad2b8b7925732d15eec46d7e7ac3c11734f3b7a6ad" [[package]] name = "getrandom" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "js-sys", @@ -2224,18 +2220,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "getset" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" -dependencies = [ - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "gimli" version = "0.27.3" @@ -2248,7 +2232,7 @@ version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b903b73e45dc0c6c596f2d37eccece7c1c8bb6e4407b001096387c63d0d93724" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "libc", "libgit2-sys", "log", @@ -2263,8 +2247,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grenad" -version = "0.4.7" -source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e2ac9baf835ee2a7f0622a5617792ced6f65af25994078c343d429431ef2bbc" dependencies = [ "bytemuck", "byteorder", @@ -2303,7 +2288,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.1.0", + "http 1.2.0", "indexmap", "slab", "tokio", @@ -2319,9 +2304,9 @@ checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "half" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "bytemuck", "cfg-if", @@ -2361,9 +2346,9 @@ dependencies = [ [[package]] name = "hashbrown" 
-version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -2395,11 +2380,11 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "heed" -version = "0.20.3" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bc30da4a93ff8cb98e535d595d6de42731d4719d707bc1c86f579158751a24e" +checksum = "6a56c94661ddfb51aa9cdfbf102cfcc340aa69267f95ebccc4af08d7c530d393" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "byteorder", "heed-traits", "heed-types", @@ -2419,9 +2404,9 @@ checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff" [[package]] name = "heed-types" -version = "0.20.1" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d3f528b053a6d700b2734eabcd0fd49cb8230647aa72958467527b0b7917114" +checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" dependencies = [ "bincode", "byteorder", @@ -2436,6 +2421,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hex" version = "0.4.3" @@ -2448,13 +2439,13 @@ version = "0.3.2" source = "git+https://github.com/dureuill/hf-hub.git?branch=rust_tls#88d4f11cb9fa079f2912bacb96f5080b16825ce8" dependencies = [ "dirs", - "http 1.1.0", + "http 1.2.0", "indicatif", "log", "rand", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "ureq", ] @@ -2480,9 +2471,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -2496,7 +2487,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -2507,7 +2498,7 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body", "pin-project-lite", ] @@ -2534,7 +2525,7 @@ dependencies = [ "futures-channel", "futures-util", "h2 0.4.5", - "http 1.1.0", + "http 1.2.0", "http-body", "httparse", "httpdate", @@ -2552,7 +2543,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", - "http 1.1.0", + "http 1.2.0", "hyper", "hyper-util", "rustls", @@ -2565,24 +2556,141 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ 
"bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body", "hyper", "pin-project-lite", "socket2 0.5.5", "tokio", - "tower", "tower-service", "tracing", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -2591,12 +2699,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" 
dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] @@ -2607,16 +2726,17 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d" [[package]] name = "index-scheduler" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", - "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "big_s", "bincode", "bumpalo", - "crossbeam", + "bumparaw-collections", + "convert_case 0.6.0", + "crossbeam-channel", "csv", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "dump", "enum-iterator", "file-store", @@ -2628,14 +2748,13 @@ dependencies = [ "meilisearch-types", "memmap2", "page_size", - "raw-collections", "rayon", "roaring", "serde", "serde_json", "synchronoise", "tempfile", - "thiserror", + "thiserror 2.0.9", "time", "tracing", "ureq", @@ -2644,12 +2763,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown 0.14.3", + "hashbrown 0.15.2", "serde", ] @@ -2708,7 +2827,8 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" [[package]] name = "irg-kvariants" version = "0.1.1" -source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" dependencies = [ "csv", "once_cell", @@ -2717,11 +2837,11 @@ dependencies = [ [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] @@ -2762,6 +2882,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -2775,7 +2904,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1e2b0210dc78b49337af9e49d7ae41a39dceac6e5985613f1cf7763e2f76a25" dependencies = [ "cedarwood", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "fxhash", "lazy_static", "phf", @@ -2803,7 +2932,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.12.0" +version = "1.14.0" dependencies = [ "criterion", "serde_json", @@ -2855,12 +2984,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -2872,9 +2995,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.164" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] name = "libgit2-sys" @@ -2890,12 +3013,12 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.4" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e310b3a6b5907f99202fcdb4960ff45b93735d7c7d96b760fcff8db2dc0e103d" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -2916,9 +3039,9 @@ dependencies = [ [[package]] name = "libproc" -version = "0.14.8" +version = "0.14.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9ea4b75e1a81675429dafe43441df1caea70081e82246a8cccf514884a88bb" +checksum = "e78a09b56be5adbcad5aa1197371688dc6bb249a26da3bca2011ee2fb987ebfb" dependencies = [ "bindgen", "errno", @@ -2939,9 +3062,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cbc1aad631a7da0a7e9bc4b8669fa92ac9ca8eeb7b35a807376dd3034443ff" +checksum = "832c220475557e3b44a46cad1862b57f010f0c6e93d771d0e628e08689c068b1" dependencies = [ "lindera-analyzer", "lindera-core", @@ -2952,9 +3075,9 @@ dependencies = [ [[package]] name = "lindera-analyzer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74508ffbb24e36905d1718b261460e378a748029b07bcd7e06f0d18500b8194c" +checksum = "a8e26651714abf5167e6b6a80f5cdaa0cad41c5fcb84d8ba96bebafcb9029339" dependencies = [ "anyhow", "bincode", @@ -2973,7 +3096,7 @@ dependencies = [ "regex", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "unicode-blocks", "unicode-normalization", "unicode-segmentation", @@ -2982,9 +3105,9 @@ dependencies = [ [[package]] name = "lindera-assets" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a677c371ecb3bd02b751be306ea09876cd47cf426303ad5f10a3fd6f9a4ded6" +checksum = "ebb01f1ca53c1e642234c6c7fdb9ac664ad0c1ab9502f33e4200201bac7e6ce7" dependencies = [ "encoding", "flate2", @@ -2995,9 +3118,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35944000d05a177e981f037b5f0805f283b32f05a0c35713003bef136ca8cb4" +checksum = "5f7618d9aa947fdd7c38eae2b79f0fd237ecb5067608f1363610ba20d20ab5a8" dependencies = [ "bincode", "byteorder", @@ -3009,9 +3132,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b8f642bc9c9130682569975772a17336c6aab26d11fc0f823f3e663167ace6" +checksum = "efdbcb809d81428935d601a78c94bfb39500749213f7320705f427a7a1d31aec" dependencies = [ "anyhow", "lindera-core", @@ -3021,9 +3144,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.32.2" +version = "0.32.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7825d8d63592aa5727d67bd209170ac82df56c369533efbf0ddbac277bb68ec" +checksum = "eac178afa2456dac469d3b1a2d7fbaf3e1ea796a1f52321e8ac29545a53c239c" dependencies = [ "anyhow", "flate2", @@ -3032,9 +3155,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c28191456debc98af6aa5f7db77872471983e9fa2a737b1c232b6ef543aed62" +checksum = "649777465f48147ce593ab6db347e235e3af8f693a23f4437be94a1cdbdf5fdf" dependencies = [ "anyhow", "bincode", @@ -3043,15 +3166,15 @@ dependencies = [ "log", "once_cell", "serde", - "thiserror", + "thiserror 1.0.69", "yada", ] [[package]] name = "lindera-decompress" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4788a1ead2f63f3fc2888109272921dedd86a87b7d0bf05e9daab46600daac51" +checksum = "9e3faaceb85e43ac250021866c6db3cdc9997b44b3d3ea498594d04edc91fc45" dependencies = [ "anyhow", "flate2", @@ -3060,9 +3183,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdf5f91725e32b9a21b1656baa7030766c9bafc4de4b4ddeb8ffdde7224dd2f6" +checksum = "31e15b2d2d8a4ad45f2e373a084931cf3dfbde15f124044e2436bb920af3366c" dependencies = [ "anyhow", "bincode", @@ -3085,15 +3208,15 @@ dependencies = [ [[package]] name = "lindera-dictionary-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e41f00ba7ac541b0ffd8c30e7a73f2dd197546cc5780462ec4f2e4782945a780" +checksum = "59802949110545b59b663917ed3fd55dc3b3a8cde6bd20137d7fe24372cfb9aa" dependencies = [ "anyhow", "bincode", "byteorder", "csv", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "encoding", "encoding_rs", "encoding_rs_io", @@ -3107,9 +3230,9 @@ dependencies = [ [[package]] name = "lindera-filter" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273d27e01e1377e2647314a4a5b9bdca4b52a867b319069ebae8c10191146eca" +checksum = "1320f118c3fc9e897f4ebfc16864e5ef8c0b06ba769c0a50e53f193f9d682bf8" dependencies = [ "anyhow", "csv", @@ -3132,9 +3255,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97a52ff0af5acb700093badaf7078051ab9ffd9071859724445a60193995f1f" +checksum = "5b4731bf3730f1f38266d7ee9bca7d460cd336645c9dfd4e6a1082e58ab1e993" dependencies = [ "bincode", "byteorder", @@ -3146,9 +3269,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf5031c52686128db13f774b2c5a8abfd52b4cc1f904041d8411aa19d630ce4d" +checksum = "309966c12e682f67205c3cd3c8dc55bbdcd1eb3b5c7c5cb41fb8acd18906d340" dependencies = [ "anyhow", "lindera-core", @@ -3158,9 +3281,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b36764b27b169aa11d24888141f206a6c246a5b195c1e67127485bac512fb6" +checksum = "e90e919b4cfb9962d24ee1e1d50a7c163bbf356376495ad66d1996e20b9f9e44" dependencies = [ "bincode", "byteorder", @@ -3172,9 +3295,9 @@ dependencies = [ [[package]] name 
= "lindera-ipadic-neologd-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf36e40ace904741efdd883ed5c4dba6425f65156a0fb5d3f73a386335950dc" +checksum = "7e517df0d501f9f8bf3126da20fc8cb9a5e37921e0eec1824d7a62f096463e02" dependencies = [ "anyhow", "lindera-core", @@ -3184,9 +3307,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c92a1a3564b531953f0238cbcea392f2905f7b27b449978cf9e702a80e1086d" +checksum = "e9c6da4e68bc8b452a54b96d65361ebdceb4b6f36ecf262425c0e1f77960ae82" dependencies = [ "bincode", "byteorder", @@ -3199,9 +3322,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f2c60425abc1548570c2568858f74a1f042105ecd89faa39c651b4315350fd9" +checksum = "afc95884cc8f6dfb176caf5991043a4acf94c359215bbd039ea765e00454f271" dependencies = [ "anyhow", "lindera-core", @@ -3211,9 +3334,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "903e558981bcb6f59870aa7d6b4bcb09e8f7db778886a6a70f67fd74c9fa2ca3" +checksum = "d122042e1232a55c3604692445952a134e523822e9b4b9ab32a53ff890037ad4" dependencies = [ "bincode", "lindera-core", @@ -3225,9 +3348,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d227c3ce9cbd905f865c46c65a0470fd04e89b71104d7f92baa71a212ffe1d4b" +checksum = "cbffae1fb2f2614abdcb50f99b138476dbac19862ffa57bfdc9c7b5d5b22a90c" dependencies = [ "bincode", "byteorder", @@ -3240,9 +3363,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e2c50015c242e02c451acb6748667ac6fd1d3d667cd7db48cd89e2f2d2377e" +checksum = "fe50055327712ebd1bcc74b657cf78c728a78b9586e3f99d5dd0b6a0be221c5d" dependencies = [ "anyhow", "lindera-core", @@ -3264,9 +3387,9 @@ checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "liquid" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10929f201279ba14da3297b957dcda1e0bf7a6f3bb5115688be684aa8864e9cc" +checksum = "7cdcc72b82748f47c2933c172313f5a9aea5b2c4eb3fa4c66b4ea55bb60bb4b1" dependencies = [ "doc-comment", "liquid-core", @@ -3277,12 +3400,12 @@ dependencies = [ [[package]] name = "liquid-core" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3aef4b2160791f456eb880c990a97746f693746f92302ef5f1d06111cf14b768" +checksum = "2752e978ffc53670f3f2e8b3ef09f348d6f7b5474a3be3f8a5befe5382e4effb" dependencies = [ "anymap2", - "itertools 0.12.1", + "itertools 0.13.0", "kstring", "liquid-derive", "num-traits", @@ -3295,9 +3418,9 @@ dependencies = [ [[package]] name = "liquid-derive" -version = "0.26.5" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915f6d0a2963a27cd5205c1902f32ddfe3bc035816afd268cf88c0fc0f8d287e" +checksum = "3b51f1d220e3fa869e24cfd75915efe3164bd09bb11b3165db3f37f57bf673e3" dependencies = [ "proc-macro2", "quote", @@ -3306,11 +3429,11 @@ 
dependencies = [ [[package]] name = "liquid-lib" -version = "0.26.6" +version = "0.26.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73f48fc446873f74d869582f5c4b8cbf3248c93395e410a67af5809b3731e44a" +checksum = "59b1a298d3d2287ee5b1e43840d885b8fdfc37d3f4e90d82aacfd04d021618da" dependencies = [ - "itertools 0.12.1", + "itertools 0.13.0", "liquid-core", "once_cell", "percent-encoding", @@ -3320,10 +3443,16 @@ dependencies = [ ] [[package]] -name = "lmdb-master-sys" -version = "0.2.2" +name = "litemap" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57640c190703d5ccf4a86aff4aeb749b2d287a8cb1723c76b51f39d77ab53b24" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + +[[package]] +name = "lmdb-master-sys" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "864808e0b19fb6dd3b70ba94ee671b82fce17554cf80aeb0a155c65bb08027df" dependencies = [ "cc", "doxygen-rs", @@ -3366,9 +3495,18 @@ checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" [[package]] name = "log" -version = "0.4.21" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" + +[[package]] +name = "lru" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "227748d55f2f0ab4735d87fd623798cb6b664512fe979705f829c9f81c934465" +dependencies = [ + "hashbrown 0.15.2", +] [[package]] name = "lzma-rs" @@ -3422,7 +3560,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.12.0" +version = "1.14.0" dependencies = [ "insta", "md5", @@ -3431,7 +3569,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.12.0" +version = "1.14.0" dependencies = [ "actix-cors", "actix-http", @@ -3461,7 +3599,7 @@ dependencies = [ "indexmap", "insta", "is-terminal", - "itertools 0.13.0", + "itertools 0.14.0", "jsonwebtoken", "lazy_static", "manifest-dir-macros", @@ -3503,7 +3641,7 @@ dependencies = [ "temp-env", "tempfile", "termcolor", - "thiserror", + "thiserror 2.0.9", "time", "tokio", "toml", @@ -3513,15 +3651,17 @@ dependencies = [ "tracing-trace", "url", "urlencoding", + "utoipa", + "utoipa-scalar", "uuid", "wiremock", "yaup", - "zip 2.1.3", + "zip 2.2.2", ] [[package]] name = "meilisearch-auth" -version = "1.12.0" +version = "1.14.0" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3533,19 +3673,20 @@ dependencies = [ "serde", "serde_json", "sha2", - "thiserror", + "thiserror 2.0.9", "time", "uuid", ] [[package]] name = "meilisearch-types" -version = "1.12.0" +version = "1.14.0" dependencies = [ "actix-web", "anyhow", - "bitflags 2.6.0", + "bitflags 2.9.0", "bumpalo", + "bumparaw-collections", "convert_case 0.6.0", "csv", "deserr", @@ -3558,31 +3699,34 @@ dependencies = [ "meili-snap", "memmap2", "milli", - "raw-collections", "roaring", + "rustc-hash 2.1.0", "serde", "serde-cs", "serde_json", "tar", "tempfile", - "thiserror", + "thiserror 2.0.9", "time", "tokio", + "utoipa", "uuid", ] [[package]] name = "meilitool" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", - "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)", "clap", "dump", "file-store", + "indexmap", "meilisearch-auth", "meilisearch-types", 
"serde", + "serde_json", + "tempfile", "time", "uuid", ] @@ -3605,15 +3749,17 @@ dependencies = [ [[package]] name = "milli" -version = "1.12.0" +version = "1.14.0" dependencies = [ "allocator-api2", - "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "arroy", + "bbqueue", "big_s", "bimap", "bincode", "bstr", "bumpalo", + "bumparaw-collections", "bytemuck", "byteorder", "candle-core", @@ -3621,6 +3767,7 @@ dependencies = [ "candle-transformers", "charabia", "concat-arrays", + "convert_case 0.6.0", "crossbeam-channel", "csv", "deserr", @@ -3628,19 +3775,21 @@ dependencies = [ "enum-iterator", "filter-parser", "flatten-serde-json", + "flume", "fst", "fxhash", "geoutils", "grenad", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "heed", "hf-hub", "indexmap", "insta", - "itertools 0.13.0", + "itertools 0.14.0", "json-depth-checker", "levenshtein_automata", "liquid", + "lru", "maplit", "md5", "meili-snap", @@ -3651,13 +3800,12 @@ dependencies = [ "once_cell", "ordered-float", "rand", - "raw-collections", "rayon", "rayon-par-bridge", "rhai", "roaring", "rstar", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "serde", "serde_json", "slice-group-by", @@ -3665,7 +3813,7 @@ dependencies = [ "smallvec", "smartstring", "tempfile", - "thiserror", + "thiserror 2.0.9", "thread_local", "tiktoken-rs", "time", @@ -3674,6 +3822,7 @@ dependencies = [ "uell", "ureq", "url", + "utoipa", "uuid", ] @@ -3717,6 +3866,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "0.8.11" @@ -3729,6 +3887,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.52.0", +] + [[package]] name = "monostate" version = "0.1.9" @@ -3758,9 +3927,9 @@ checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11" [[package]] name = "mutually_exclusive_features" -version = "0.0.3" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d02c0b00610773bb7fc61d85e13d86c7858cbdf00e1a120bfc41bc055dbaa0e" +checksum = "e94e1e6445d314f972ff7395df2de295fe51b71821694f0b0e1e79c4f12c8577" [[package]] name = "nohash" @@ -3809,21 +3978,34 @@ dependencies = [ ] [[package]] -name = "num-bigint" +name = "num" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "autocfg", "num-integer", "num-traits", ] [[package]] name = "num-complex" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6" +checksum = 
"73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" dependencies = [ "bytemuck", "num-traits", @@ -3837,19 +4019,40 @@ checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" dependencies = [ "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", "num-traits", ] [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", "libm", @@ -3861,7 +4064,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] @@ -3913,13 +4116,14 @@ dependencies = [ [[package]] name = "obkv" version = "0.3.0" -source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae4512a8f418ac322335255a72361b9ac927e106f4d7fe6ab4d8ac59cb01f7a9" [[package]] name = "once_cell" -version = "1.19.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "cde51589ab56b20a6f686b2c68f7a0bd6add753d697abf720d63f8db3ab7b1ad" [[package]] name = "onig" @@ -3957,9 +4161,9 @@ checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" [[package]] name = "ordered-float" -version = "4.2.1" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ff2cf528c6c03d9ed653d6c4ce1dc0582dc4af309790ad92f07c1cd551b0be" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" dependencies = [ "num-traits", ] @@ -4058,7 +4262,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.12.0" +version = "1.14.0" dependencies = [ "big_s", "serde_json", @@ -4070,7 +4274,7 @@ version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1acb4a4365a13f749a93f1a094a7805e5cfa0955373a9de860d962eaa3a5fe5a" dependencies = [ - "thiserror", + "thiserror 1.0.69", "ucd-trie", ] @@ -4182,9 +4386,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" 
[[package]] name = "pin-utils" @@ -4277,7 +4481,6 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2", "quote", - "syn 1.0.109", "version_check", ] @@ -4307,7 +4510,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "hex", "lazy_static", "procfs-core", @@ -4320,7 +4523,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "hex", ] @@ -4338,7 +4541,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -4391,7 +4594,7 @@ dependencies = [ "quinn-udp", "rustc-hash 1.1.0", "rustls", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -4405,10 +4608,10 @@ dependencies = [ "bytes", "rand", "ring", - "rustc-hash 2.0.0", + "rustc-hash 2.1.0", "rustls", "slab", - "thiserror", + "thiserror 1.0.69", "tinyvec", "tracing", ] @@ -4481,19 +4684,6 @@ dependencies = [ "rand", ] -[[package]] -name = "raw-collections" -version = "0.1.0" -source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a" -dependencies = [ - "allocator-api2", - "bitpacking", - "bumpalo", - "hashbrown 0.15.1", - "serde", - "serde_json", -] - [[package]] name = "raw-cpuid" version = "10.7.0" @@ -4575,14 +4765,14 @@ checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ "getrandom", "redox_syscall 0.2.16", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "regex" -version = "1.10.5" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -4592,9 +4782,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.6" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -4609,9 +4799,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rend" @@ -4624,16 +4814,16 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", "futures-core", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body", "http-body-util", "hyper", @@ -4657,6 +4847,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", + "tower", "tower-service", "url", "wasm-bindgen", @@ -4664,7 +4855,7 @@ dependencies = [ "wasm-streams", "web-sys", 
"webpki-roots", - "winreg", + "windows-registry", ] [[package]] @@ -4673,7 +4864,7 @@ version = "1.20.0" source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "ahash 0.8.11", - "bitflags 2.6.0", + "bitflags 2.9.0", "instant", "num-traits", "once_cell", @@ -4696,15 +4887,14 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.8" +version = "0.17.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", "cfg-if", "getrandom", "libc", - "spin", "untrusted", "windows-sys 0.52.0", ] @@ -4740,8 +4930,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#8ff028e484fb6192a0acf5a669eaf18c30cada6e" +version = "0.10.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652edd001c53df0b3f96a36a8dc93fce6866988efc16808235653c6bcac8bf2" dependencies = [ "bytemuck", "byteorder", @@ -4750,9 +4941,9 @@ dependencies = [ [[package]] name = "rstar" -version = "0.12.0" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "133315eb94c7b1e8d0cb097e5a710d850263372fd028fff18969de708afc7008" +checksum = "421400d13ccfd26dfa5858199c30a5d76f9c54e0dba7575273025b43c5175dbb" dependencies = [ "heapless", "num-traits", @@ -4790,9 +4981,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustc-hash" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" [[package]] name = "rustc_version" @@ -4809,7 +5000,7 @@ version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.9.0", "errno", "libc", "linux-raw-sys", @@ -4818,9 +5009,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.11" +version = "0.23.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4828ea528154ae444e5a642dbb7d5623354030dc9822b83fd9bb79683c7399d0" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" dependencies = [ "log", "once_cell", @@ -4833,25 +5024,24 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.1", "rustls-pki-types", ] [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" [[package]] name = "rustls-webpki" -version = "0.102.5" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f9a6fccd794a42c2c105b513a2f62bc3fd8f3ba57a4593677ceb0bd035164d78" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ "ring", "rustls-pki-types", @@ -4860,9 +5050,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" @@ -4903,15 +5093,15 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "segment" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bdca318192c89bb31bffa2ef8e9e9898bc80f15a78db2fdd41cd051f1b41d01" +checksum = "1dd0f21b6eb87a45a7cce06075a29ccdb42658a6eb84bf40c8fc179479630609" dependencies = [ "async-trait", "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 2.0.9", "time", ] @@ -4932,9 +5122,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.214" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] @@ -4950,9 +5140,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.214" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", @@ -4961,9 +5151,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "indexmap", "itoa", @@ -4983,9 +5173,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" dependencies = [ "serde", ] @@ -5085,7 +5275,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -5183,6 +5373,9 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] [[package]] name = "spm_precompiled" @@ -5298,6 +5491,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "synchronoise" @@ -5325,25 +5521,24 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec7dddc5f0fee506baf8b9fdb989e242f17e4b11c61dfbb0635b705217199eea" dependencies = [ - "bitflags 
2.6.0", + "bitflags 2.9.0", "byteorder", "enum-as-inner", "libc", - "thiserror", + "thiserror 1.0.69", "walkdir", ] [[package]] name = "sysinfo" -version = "0.30.13" +version = "0.33.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a5b4ddaee55fb2bea2bf0e5000747e5f5c0de765e5a5ff87f4cd106439f4bb3" +checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" dependencies = [ - "cfg-if", "core-foundation-sys", "libc", + "memchr", "ntapi", - "once_cell", "rayon", "windows", ] @@ -5356,9 +5551,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.41" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6" dependencies = [ "filetime", "libc", @@ -5376,12 +5571,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.14.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", + "getrandom", "once_cell", "rustix", "windows-sys 0.52.0", @@ -5407,18 +5603,38 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.61" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" +dependencies = [ + "thiserror-impl 2.0.9", ] [[package]] name = "thiserror-impl" -version = "1.0.61" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" dependencies = [ "proc-macro2", "quote", @@ -5437,24 +5653,25 @@ dependencies = [ [[package]] name = "tiktoken-rs" -version = "0.5.9" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c314e7ce51440f9e8f5a497394682a57b7c323d0f4d0a6b1b13c429056e0e234" +checksum = "44075987ee2486402f0808505dd65692163d243a337fc54363d49afac41087f6" dependencies = [ "anyhow", "base64 0.21.7", "bstr", - "fancy-regex 0.12.0", + "fancy-regex", "lazy_static", "parking_lot", + "regex", "rustc-hash 1.1.0", ] [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -5475,9 +5692,9 @@ checksum = 
"ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -5492,6 +5709,16 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -5541,7 +5768,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror", + "thiserror 1.0.69", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -5549,28 +5776,27 @@ dependencies = [ [[package]] name = "tokio" -version = "1.38.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", "libc", - "mio", - "num_cpus", + "mio 1.0.3", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2 0.5.5", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", @@ -5603,21 +5829,21 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.14" +version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +checksum = "a1ed1f98e3fdc28d6d910e6737ae6ab1a93bf1985935a1193e68f93eeb68d24e" dependencies = [ "serde", "serde_spanned", "toml_datetime", - "toml_edit 0.22.15", + "toml_edit 0.22.22", ] [[package]] name = "toml_datetime" -version = "0.6.6" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" dependencies = [ "serde", ] @@ -5635,27 +5861,27 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.15" +version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59a3a72298453f564e2b111fa896f8d07fabb36f51f06d7e875fc5e0b5a3ef1" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ "indexmap", "serde", "serde_spanned", "toml_datetime", - "winnow 0.6.13", + "winnow 0.6.22", ] [[package]] name = "tower" -version = "0.4.13" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "pin-project", "pin-project-lite", + "sync_wrapper", "tokio", "tower-layer", "tower-service", @@ -5663,21 +5889,21 @@ dependencies = [ [[package]] name 
= "tower-layer" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -5687,9 +5913,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.11" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" +checksum = "54a9f5c1aca50ebebf074ee665b9f99f2e84906dcf6b993a0d0090edb835166d" dependencies = [ "actix-web", "mutually_exclusive_features", @@ -5700,9 +5926,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", @@ -5711,9 +5937,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -5721,9 +5947,9 @@ dependencies = [ [[package]] name = "tracing-error" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +checksum = "8b1581020d7a273442f5b45074a6a57d5757ad0a47dac0e9f0bd57b81936f3db" dependencies = [ "tracing", "tracing-subscriber", @@ -5742,9 +5968,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -5752,9 +5978,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "nu-ansi-term", "serde", @@ -5810,13 +6036,40 @@ dependencies = [ "bumpalo", ] +[[package]] +name = "ug" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4eef2ebfc18c67a6dbcacd9d8a4d85e0568cc58c82515552382312c2730ea13" +dependencies = [ + "half 2.4.1", + "num", + "serde", + "serde_json", + "thiserror 1.0.69", 
+] + +[[package]] +name = "ug-cuda" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c4dcab280ad0ef3957e153a82dcad608c954d02cf253b695322f502d1f8902e" +dependencies = [ + "cudarc", + "half 2.4.1", + "serde", + "serde_json", + "thiserror 1.0.69", + "ug", +] + [[package]] name = "unescaper" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c878a167baa8afd137494101a688ef8c67125089ff2249284bd2b5f9bfedb815" dependencies = [ - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5828,12 +6081,6 @@ dependencies = [ "version_check", ] -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - [[package]] name = "unicode-blocks" version = "0.1.9" @@ -5848,9 +6095,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] @@ -5890,9 +6137,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.10.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72139d247e5f97a3eff96229a7ae85ead5328a39efe76f8bf5a06313d505b6ea" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" dependencies = [ "base64 0.22.1", "flate2", @@ -5909,9 +6156,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.2" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -5925,12 +6172,24 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + [[package]] name = "utf8-width" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.1" @@ -5938,10 +6197,47 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] -name = "uuid" -version = "1.10.0" +name = "utoipa" +version = "5.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "435c6f69ef38c9017b4b4eea965dfb91e71e53d869e896db40d1cf2441dd75c0" +dependencies = [ + "indexmap", + "serde", + "serde_json", + "utoipa-gen", +] + +[[package]] +name = "utoipa-gen" +version = "5.3.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a77d306bc75294fd52f3e99b13ece67c02c1a2789190a6f31d32f736624326f7" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn 2.0.87", + "uuid", +] + +[[package]] +name = "utoipa-scalar" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59559e1509172f6b26c1cdbc7247c4ddd1ac6560fe94b584f81ee489b141f719" +dependencies = [ + "actix-web", + "serde", + "serde_json", + "utoipa", +] + +[[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", "serde", @@ -5961,24 +6257,24 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vergen" -version = "9.0.0" +version = "9.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c32e7318e93a9ac53693b6caccfb05ff22e04a44c7cf8a279051f24c09da286f" +checksum = "31f25fc8f8f05df455c7941e87f093ad22522a9ff33d7a027774815acf6f0639" dependencies = [ "anyhow", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "rustversion", "vergen-lib", ] [[package]] name = "vergen-git2" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a62c52cd2b2b8b7ec75fc20111b3022ac3ff83e4fc14b9497cfcfd39c54f9c67" +checksum = "5e63e069d8749fead1e3bab7a9d79e8fb90516b2ec66fc2243a798ecdc1a31d7" dependencies = [ "anyhow", - "derive_builder 0.20.0", + "derive_builder 0.20.2", "git2", "rustversion", "time", @@ -5988,13 +6284,12 @@ dependencies = [ [[package]] name = "vergen-lib" -version = "0.1.3" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e06bee42361e43b60f363bad49d63798d0f42fb1768091812270eca00c784720" +checksum = "c0c767e6751c09fc85cde58722cf2f1007e80e4c8d5a4321fc90d83dc54ca147" dependencies = [ "anyhow", - "derive_builder 0.20.0", - "getset", + "derive_builder 0.20.2", "rustversion", ] @@ -6016,9 +6311,9 @@ dependencies = [ [[package]] name = "wana_kana" -version = "3.0.0" +version = "4.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "477976a5c56fb7b014795df5a2ce08d2de8bcd4d5980844c5bd3978a7fd1c30b" +checksum = "a74666202acfcb4f9b995be2e3e9f7f530deb65e05a1407b8d0b30c9c451238a" dependencies = [ "fnv", "itertools 0.10.5", @@ -6181,21 +6476,85 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows" -version = "0.52.0" +version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" dependencies = [ "windows-core", - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] name = "windows-core" -version = "0.52.0" +version = "0.57.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" dependencies = [ - "windows-targets 0.52.4", + "windows-implement", + "windows-interface", + "windows-result 0.1.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result 0.2.0", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result 0.2.0", + "windows-targets 0.52.6", ] [[package]] @@ -6222,7 +6581,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.6", ] [[package]] @@ -6257,17 +6616,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -6284,9 +6644,9 @@ checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" @@ -6302,9 +6662,9 @@ checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" @@ -6320,9 +6680,15 @@ checksum = 
"622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" @@ -6338,9 +6704,9 @@ checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" @@ -6356,9 +6722,9 @@ checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" @@ -6374,9 +6740,9 @@ checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" @@ -6392,9 +6758,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" +version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" @@ -6407,35 +6773,25 @@ dependencies = [ [[package]] name = "winnow" -version = "0.6.13" +version = "0.6.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +checksum = "39281189af81c07ec09db316b302a3e67bf9bd7cbf6c820b50e35fee9c2fa980" dependencies = [ "memchr", ] -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "wiremock" -version = "0.6.0" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec874e1eef0df2dcac546057fe5e29186f09c378181cd7b635b4b7bcc98e9d81" +checksum = "7fff469918e7ca034884c7fd8f93fe27bacb7fcb599fd879df6c7b429a29b646" dependencies = [ "assert-json-diff", "async-trait", - "base64 0.21.7", + "base64 0.22.1", "deadpool", "futures", - "http 1.1.0", + "http 1.2.0", "http-body-util", "hyper", "hyper-util", @@ -6448,6 +6804,18 @@ dependencies = [ "url", ] +[[package]] +name = 
"write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "wyz" version = "0.5.1" @@ -6470,7 +6838,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.12.0" +version = "1.14.0" dependencies = [ "anyhow", "build-info", @@ -6505,14 +6873,14 @@ checksum = "b0144f1a16a199846cb21024da74edd930b43443463292f536b7110b4855b5c6" dependencies = [ "form_urlencoded", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "yoke" -version = "0.7.3" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e71b2e4f287f467794c671e2b8f8a5f3716b3c829079a1c44740148eff07e4" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -6522,9 +6890,9 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.3" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e6936f0cce458098a201c245a11bef556c6a0181129c7034d10d76d1ec3a2b8" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", @@ -6593,6 +6961,28 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "zip" version = "1.1.4" @@ -6605,14 +6995,14 @@ dependencies = [ "displaydoc", "indexmap", "num_enum", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "zip" -version = "2.1.3" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "775a2b471036342aa69bc5a602bc889cb0a06cda00477d0c69566757d5553d39" +checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45" dependencies = [ "aes", "arbitrary", @@ -6630,7 +7020,7 @@ dependencies = [ "pbkdf2", "rand", "sha1", - "thiserror", + "thiserror 2.0.9", "time", "zeroize", "zopfli", diff --git a/Cargo.toml b/Cargo.toml index 5e53dbfa5..b023a778e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.12.0" +version = "1.14.0" authors = [ "Quentin de Quelen ", "ClƩment Renault ", @@ -36,6 +36,12 @@ license = "MIT" [profile.release] codegen-units = 1 +# We now compile heed without the NDEBUG define for better performance. +# However, we still enable debug assertions for a better detection of +# disk corruption on the cloud or in OSS. 
+[profile.release.package.heed] +debug-assertions = true + [profile.dev.package.flate2] opt-level = 3 @@ -43,6 +49,3 @@ opt-level = 3 opt-level = 3 [profile.dev.package.roaring] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/Dockerfile b/Dockerfile index 04557df59..5a9a4691f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Compile -FROM rust:1.79.0-alpine3.20 AS compiler +FROM rust:1.85-alpine3.20 AS compiler RUN apk add -q --no-cache build-base openssl-dev diff --git a/LICENSE b/LICENSE index 76a573977..686f24931 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019-2024 Meili SAS +Copyright (c) 2019-2025 Meili SAS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 4be92d439..d85942584 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,13 @@

[README badge row: "Dependency status", "License"; the "Bors enabled" badge is removed (-) and replaced (+) by a "Merge Queues enabled" badge]

+ [new ProductHunt banner image, assets/ph-banner.png: "Meilisearch AI-powered search general availability announcement on ProductHunt"]

⚔ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow šŸ”

@@ -58,6 +64,7 @@ See the list of all our example apps in our [demos repository](https://github.co - **[Multi-Tenancy](https://www.meilisearch.com/docs/learn/security/multitenancy_tenant_tokens?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** personalize search results for any number of application tenants - **Highly Customizable:** customize Meilisearch to your specific needs or use our out-of-the-box and hassle-free presets - **[RESTful API](https://www.meilisearch.com/docs/reference/api/overview?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** integrate Meilisearch in your technical stack with our plugins and SDKs +- **AI-ready:** works out of the box with [langchain](https://www.meilisearch.com/with/langchain) and the [model context protocol](https://github.com/meilisearch/meilisearch-mcp) - **Easy to install, deploy, and maintain** ## šŸ“– Documentation diff --git a/assets/grafana-dashboard.json b/assets/grafana-dashboard.json index 2cfa85a46..027afcbba 100644 --- a/assets/grafana-dashboard.json +++ b/assets/grafana-dashboard.json @@ -1403,6 +1403,104 @@ "title": "Number of tasks by indexes", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 29, + "interval": "5s", + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "8.1.4", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "exemplar": true, + "expr": "meilisearch_task_queue_latency_seconds{instance=\"$instance\", job=\"$job\"}", + "interval": "", + "legendFormat": "{{value}} ", + "range": true, + "refId": "A" + } + ], + "title": "Task queue latency", + "type": "timeseries" + }, { "collapsed": true, "datasource": { diff --git a/assets/ph-banner.png b/assets/ph-banner.png new file mode 100644 index 000000000..399c7fd10 Binary files /dev/null and b/assets/ph-banner.png differ diff --git a/bors.toml b/bors.toml deleted file mode 100644 index 96e9ef65e..000000000 --- a/bors.toml +++ /dev/null @@ -1,11 +0,0 @@ -status = [ - 'Tests on ubuntu-20.04', - 'Tests on macos-13', - 'Tests on windows-2022', - 'Run Clippy', - 'Run Rustfmt', - 'Run tests in debug', -] -pr_status = ['Milestone Check'] -# 3 hours timeout -timeout-sec = 10800 diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index eec30ea3f..a2cddd554 100644 --- a/crates/benchmarks/Cargo.toml +++ 
b/crates/benchmarks/Cargo.toml @@ -11,27 +11,27 @@ edition.workspace = true license.workspace = true [dependencies] -anyhow = "1.0.86" +anyhow = "1.0.95" bumpalo = "3.16.0" -csv = "1.3.0" +csv = "1.3.1" memmap2 = "0.9.5" milli = { path = "../milli" } mimalloc = { version = "0.1.43", default-features = false } -serde_json = { version = "1.0.120", features = ["preserve_order"] } -tempfile = "3.14.0" +serde_json = { version = "1.0.135", features = ["preserve_order"] } +tempfile = "3.15.0" [dev-dependencies] criterion = { version = "0.5.1", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.10.6" +roaring = "0.10.10" [build-dependencies] -anyhow = "1.0.86" -bytes = "1.6.0" +anyhow = "1.0.95" +bytes = "1.9.0" convert_case = "0.6.0" -flate2 = "1.0.30" -reqwest = { version = "0.12.5", features = ["blocking", "rustls-tls"], default-features = false } +flate2 = "1.0.35" +reqwest = { version = "0.12.12", features = ["blocking", "rustls-tls"], default-features = false } [features] default = ["milli/all-tokenizations"] diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 2f33c3454..9199c3877 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -8,14 +8,16 @@ use bumpalo::Bump; use criterion::{criterion_group, criterion_main, Criterion}; use milli::documents::PrimaryKey; use milli::heed::{EnvOpenOptions, RwTxn}; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::Index; +use milli::{FilterableAttributesRule, Index}; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -33,10 +35,11 @@ fn setup_dir(path: impl AsRef) { fn setup_index() -> Index { let path = "benches.mmdb"; setup_dir(path); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(100); - Index::new(options, path).unwrap() + Index::new(options, path, true).unwrap() } fn setup_settings<'t>( @@ -55,7 +58,8 @@ fn setup_settings<'t>( let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + let filterable_fields = + filterable_fields.iter().map(|s| FilterableAttributesRule::Field(s.to_string())).collect(); builder.set_filterable_fields(filterable_fields); let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); @@ -136,10 +140,9 @@ fn indexing_songs_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -150,13 +153,14 @@ fn indexing_songs_default(c: &mut Criterion) { None, &mut 
new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -164,7 +168,7 @@ fn indexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -202,10 +206,9 @@ fn reindexing_songs_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -216,13 +219,14 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -230,7 +234,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -246,10 +250,9 @@ fn reindexing_songs_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -260,13 +263,14 @@ fn reindexing_songs_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -274,7 +278,7 @@ fn reindexing_songs_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -314,10 +318,9 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -328,13 +331,14 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), 
config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -342,7 +346,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -390,10 +394,9 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -404,13 +407,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -418,7 +422,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -434,10 +438,9 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -448,13 +451,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -462,7 +466,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -474,10 +478,9 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -488,13 +491,14 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -502,7 
+506,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -540,11 +544,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -555,13 +558,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -569,7 +573,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -607,10 +611,9 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -621,13 +624,14 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -635,7 +639,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -673,10 +677,9 @@ fn indexing_wiki(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -687,13 +690,14 @@ fn indexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -701,7 +705,7 @@ fn indexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + 
&Progress::default(), ) .unwrap(); @@ -738,10 +742,9 @@ fn reindexing_wiki(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -752,13 +755,14 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -766,7 +770,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -782,10 +786,9 @@ fn reindexing_wiki(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -796,13 +799,14 @@ fn reindexing_wiki(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -810,7 +814,7 @@ fn reindexing_wiki(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -849,10 +853,9 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -863,13 +866,14 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -877,7 +881,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -924,11 +928,10 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = 
db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -939,13 +942,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -953,7 +957,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -969,11 +973,10 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -984,13 +987,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -998,7 +1002,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1010,11 +1014,10 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1025,13 +1028,14 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1039,7 +1043,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1077,10 +1081,9 @@ fn indexing_movies_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + 
let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1091,13 +1094,14 @@ fn indexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1105,7 +1109,7 @@ fn indexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1142,10 +1146,9 @@ fn reindexing_movies_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1156,13 +1159,14 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1170,7 +1174,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1186,10 +1190,9 @@ fn reindexing_movies_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1200,13 +1203,14 @@ fn reindexing_movies_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1214,7 +1218,7 @@ fn reindexing_movies_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1253,10 +1257,9 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = - indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - indexer.add_documents(&documents).unwrap(); + 
indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1267,13 +1270,14 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1281,7 +1285,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -1321,6 +1325,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { } create_dir_all(conf.database_name).unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(100); - let index = Index::new(options, conf.database_name).unwrap(); + let index = Index::new(options, conf.database_name, true).unwrap(); let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); @@ -98,8 +100,8 @@ pub fn base_setup(conf: &Conf) -> Index { let mut new_fields_ids_map = db_fields_ids_map.clone(); let documents = documents_from(conf.dataset, conf.dataset_format); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -110,13 +112,14 @@ pub fn base_setup(conf: &Conf) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -124,7 +127,7 @@ pub fn base_setup(conf: &Conf) -> Index { &document_changes, EmbeddingConfigs::default(), &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/build-info/Cargo.toml b/crates/build-info/Cargo.toml index c24dffe5c..f8ede756e 100644 --- a/crates/build-info/Cargo.toml +++ b/crates/build-info/Cargo.toml @@ -11,8 +11,8 @@ license.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -time = { version = "0.3.36", features = ["parsing"] } +time = { version = "0.3.37", features = ["parsing"] } [build-dependencies] -anyhow = "1.0.86" -vergen-git2 = "1.0.0" +anyhow = "1.0.95" +vergen-git2 = "1.0.2" diff --git a/crates/dump/Cargo.toml b/crates/dump/Cargo.toml index f9d2a9a0b..5c427916c 100644 --- a/crates/dump/Cargo.toml +++ b/crates/dump/Cargo.toml @@ -11,21 +11,21 @@ readme.workspace = true license.workspace = true [dependencies] -anyhow = "1.0.86" -flate2 = "1.0.30" -http = "1.1.0" +anyhow = "1.0.95" +flate2 = "1.0.35" +http = "1.2.0" meilisearch-types = { path = "../meilisearch-types" } -once_cell = "1.19.0" -regex = "1.10.5" -roaring = { version = "0.10.6", features = ["serde"] } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } -tar = "0.4.41" -tempfile = "3.10.1" -thiserror = "1.0.61" -time = { version = "0.3.36", features = 
["serde-well-known", "formatting", "parsing", "macros"] } -tracing = "0.1.40" -uuid = { version = "1.10.0", features = ["serde", "v4"] } +once_cell = "1.20.2" +regex = "1.11.1" +roaring = { version = "0.10.10", features = ["serde"] } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order"] } +tar = "0.4.43" +tempfile = "3.15.0" +thiserror = "2.0.9" +time = { version = "0.3.37", features = ["serde-well-known", "formatting", "parsing", "macros"] } +tracing = "0.1.41" +uuid = { version = "1.11.0", features = ["serde", "v4"] } [dev-dependencies] big_s = "1.0.2" diff --git a/crates/dump/README.md b/crates/dump/README.md index 3537f188e..42d84ec80 100644 --- a/crates/dump/README.md +++ b/crates/dump/README.md @@ -10,8 +10,10 @@ dump ā”œā”€ā”€ instance-uid.uuid ā”œā”€ā”€ keys.jsonl ā”œā”€ā”€ metadata.json -└── tasks - ā”œā”€ā”€ update_files - │ └── [task_id].jsonl +ā”œā”€ā”€ tasks +│ ā”œā”€ā”€ update_files +│ │ └── [task_id].jsonl +│ └── queue.jsonl +└── batches └── queue.jsonl -``` \ No newline at end of file +``` diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 8bed7f0d4..4e2d6ac2f 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -141,6 +141,9 @@ pub enum KindDump { instance_uid: Option, }, SnapshotCreation, + UpgradeDatabase { + from: (u32, u32, u32), + }, } impl From for TaskDump { @@ -210,6 +213,9 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, + KindWithContent::UpgradeDatabase { from: version } => { + KindDump::UpgradeDatabase { from: version } + } } } } @@ -222,14 +228,16 @@ pub(crate) mod test { use big_s::S; use maplit::{btreemap, btreeset}; + use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats}; use meilisearch_types::facet_values_sort::FacetValuesSort; - use meilisearch_types::features::RuntimeTogglableFeatures; + use meilisearch_types::features::{Network, Remote, RuntimeTogglableFeatures}; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::{Action, Key}; - use meilisearch_types::milli; use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::{self, FilterableAttributesRule}; use meilisearch_types::settings::{Checked, FacetingSettings, Settings}; - use meilisearch_types::tasks::{Details, Status}; + use meilisearch_types::task_view::DetailsView; + use meilisearch_types::tasks::{Details, Kind, Status}; use serde_json::{json, Map, Value}; use time::macros::datetime; use uuid::Uuid; @@ -271,7 +279,10 @@ pub(crate) mod test { let settings = Settings { displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(), searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(), - filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), + filterable_attributes: Setting::Set(vec![ + FilterableAttributesRule::Field(S("race")), + FilterableAttributesRule::Field(S("age")), + ]), sortable_attributes: Setting::Set(btreeset! 
{ S("age") }), ranking_rules: Setting::NotSet, stop_words: Setting::NotSet, @@ -292,11 +303,39 @@ pub(crate) mod test { embedders: Setting::NotSet, search_cutoff_ms: Setting::NotSet, localized_attributes: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: std::marker::PhantomData, }; settings.check() } + pub fn create_test_batches() -> Vec { + vec![Batch { + uid: 0, + details: DetailsView { + received_documents: Some(12), + indexed_documents: Some(Some(10)), + ..DetailsView::default() + }, + progress: None, + stats: BatchStats { + total_nb_tasks: 1, + status: maplit::btreemap! { Status::Succeeded => 1 }, + types: maplit::btreemap! { Kind::DocumentAdditionOrUpdate => 1 }, + index_uids: maplit::btreemap! { "doggo".to_string() => 1 }, + progress_trace: Default::default(), + write_channel_congestion: None, + }, + enqueued_at: Some(BatchEnqueuedAt { + earliest: datetime!(2022-11-11 0:00 UTC), + oldest: datetime!(2022-11-11 0:00 UTC), + }), + started_at: datetime!(2022-11-20 0:00 UTC), + finished_at: Some(datetime!(2022-11-21 0:00 UTC)), + }] + } + pub fn create_test_tasks() -> Vec<(TaskDump, Option>)> { vec![ ( @@ -419,6 +458,15 @@ pub(crate) mod test { index.flush().unwrap(); index.settings(&settings).unwrap(); + // ========== pushing the batch queue + let batches = create_test_batches(); + + let mut batch_queue = dump.create_batches_queue().unwrap(); + for batch in &batches { + batch_queue.push_batch(batch).unwrap(); + } + batch_queue.flush().unwrap(); + // ========== pushing the task queue let tasks = create_test_tasks(); @@ -447,6 +495,10 @@ pub(crate) mod test { dump.create_experimental_features(features).unwrap(); + // ========== network + let network = create_test_network(); + dump.create_network(network).unwrap(); + // create the dump let mut file = tempfile::tempfile().unwrap(); dump.persist_to(&mut file).unwrap(); @@ -456,7 +508,14 @@ pub(crate) mod test { } fn create_test_features() -> RuntimeTogglableFeatures { - RuntimeTogglableFeatures { vector_store: true, ..Default::default() } + RuntimeTogglableFeatures::default() + } + + fn create_test_network() -> Network { + Network { + local: Some("myself".to_string()), + remotes: maplit::btreemap! 
{"other".to_string() => Remote { url: "http://test".to_string(), search_api_key: Some("apiKey".to_string()) }}, + } } #[test] @@ -507,5 +566,9 @@ pub(crate) mod test { // ==== checking the features let expected = create_test_features(); assert_eq!(dump.features().unwrap().unwrap(), expected); + + // ==== checking the network + let expected = create_test_network(); + assert_eq!(&expected, dump.network().unwrap().unwrap()); } } diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index 785542cce..6b63e7c6b 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -196,6 +196,10 @@ impl CompatV5ToV6 { pub fn features(&self) -> Result> { Ok(None) } + + pub fn network(&self) -> Result> { + Ok(None) + } } pub enum CompatIndexV5ToV6 { @@ -318,7 +322,16 @@ impl From> for v6::Settings { v6::Settings { displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(), searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(), - filterable_attributes: settings.filterable_attributes.into(), + filterable_attributes: match settings.filterable_attributes { + v5::settings::Setting::Set(filterable_attributes) => v6::Setting::Set( + filterable_attributes + .into_iter() + .map(v6::FilterableAttributesRule::Field) + .collect(), + ), + v5::settings::Setting::Reset => v6::Setting::Reset, + v5::settings::Setting::NotSet => v6::Setting::NotSet, + }, sortable_attributes: settings.sortable_attributes.into(), ranking_rules: { match settings.ranking_rules { @@ -382,6 +395,8 @@ impl From> for v6::Settings { embedders: v6::Setting::NotSet, localized_attributes: v6::Setting::NotSet, search_cutoff_ms: v6::Setting::NotSet, + facet_search: v6::Setting::NotSet, + prefix_search: v6::Setting::NotSet, _kind: std::marker::PhantomData, } } diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index f9660972a..2b4440ab7 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -23,6 +23,7 @@ mod v6; pub type Document = serde_json::Map; pub type UpdateFile = dyn Iterator>; +#[allow(clippy::large_enum_variant)] pub enum DumpReader { Current(V6Reader), Compat(CompatV5ToV6), @@ -101,6 +102,13 @@ impl DumpReader { } } + pub fn batches(&mut self) -> Result> + '_>> { + match self { + DumpReader::Current(current) => Ok(current.batches()), + DumpReader::Compat(_compat) => Ok(Box::new(std::iter::empty())), + } + } + pub fn keys(&mut self) -> Result> + '_>> { match self { DumpReader::Current(current) => Ok(current.keys()), @@ -114,6 +122,13 @@ impl DumpReader { DumpReader::Compat(compat) => compat.features(), } } + + pub fn network(&self) -> Result> { + match self { + DumpReader::Current(current) => Ok(current.network()), + DumpReader::Compat(compat) => compat.network(), + } + } } impl From for DumpReader { @@ -219,6 +234,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00"); insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -327,10 +346,8 @@ pub(crate) mod test { } } - assert_eq!( - dump.features().unwrap().unwrap(), - RuntimeTogglableFeatures { vector_store: true, 
..Default::default() } - ); + assert_eq!(dump.features().unwrap().unwrap(), RuntimeTogglableFeatures::default()); + assert_eq!(dump.network().unwrap(), None); } #[test] @@ -342,6 +359,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2023-07-06 7:10:27.21958 +00:00:00"); insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -373,10 +394,28 @@ pub(crate) mod test { assert_eq!(test.documents().unwrap().count(), 1); - assert_eq!( - dump.features().unwrap().unwrap(), - RuntimeTogglableFeatures { vector_store: true, ..Default::default() } - ); + assert_eq!(dump.features().unwrap().unwrap(), RuntimeTogglableFeatures::default()); + } + + #[test] + fn import_dump_v6_network() { + let dump = File::open("tests/assets/v6-with-network.dump").unwrap(); + let dump = DumpReader::open(dump).unwrap(); + + // top level infos + insta::assert_snapshot!(dump.date().unwrap(), @"2025-01-29 15:45:32.738676 +00:00:00"); + insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None"); + + // network + + let network = dump.network().unwrap().unwrap(); + insta::assert_snapshot!(network.local.as_ref().unwrap(), @"ms-0"); + insta::assert_snapshot!(network.remotes.get("ms-0").as_ref().unwrap().url, @"http://localhost:7700"); + insta::assert_snapshot!(network.remotes.get("ms-0").as_ref().unwrap().search_api_key.is_none(), @"true"); + insta::assert_snapshot!(network.remotes.get("ms-1").as_ref().unwrap().url, @"http://localhost:7701"); + insta::assert_snapshot!(network.remotes.get("ms-1").as_ref().unwrap().search_api_key.is_none(), @"true"); + insta::assert_snapshot!(network.remotes.get("ms-2").as_ref().unwrap().url, @"http://ms-5679.example.meilisearch.io"); + insta::assert_snapshot!(network.remotes.get("ms-2").as_ref().unwrap().search_api_key.as_ref().unwrap(), @"foo"); } #[test] @@ -388,6 +427,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-04 15:55:10.344982459 +00:00:00"); insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -468,6 +511,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-06 12:53:49.131989609 +00:00:00"); insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d"); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -545,6 +592,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-07 11:39:03.709153554 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + 
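With the reader-side accessors added above, importing a dump can now also replay batches and restore the network configuration. A hedged sketch (error plumbing via `anyhow` and the scheduler calls are assumptions; the accessor names and return shapes are the ones shown in `reader/mod.rs`, and older dumps simply yield an empty batch iterator and no network config):

```rust
use dump::DumpReader;

fn import(file: std::fs::File) -> anyhow::Result<()> {
    let mut dump = DumpReader::open(file)?;

    // Batches only exist in dumps produced by v1.13 and later;
    // older formats yield an empty iterator.
    for batch in dump.batches()? {
        let _batch = batch?;
        // ...re-insert the batch into the scheduler's batch queue...
    }

    // network.json is optional as well.
    if let Some(_network) = dump.network()? {
        // ...restore the inter-instance network configuration...
    }

    Ok(())
}
```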
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -638,6 +689,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-09 20:27:59.904096267 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -731,6 +786,10 @@ pub(crate) mod test { insta::assert_snapshot!(dump.date().unwrap(), @"2023-01-30 16:26:09.247261 +00:00:00"); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); @@ -807,6 +866,10 @@ pub(crate) mod test { assert_eq!(dump.date(), None); assert_eq!(dump.instance_uid().unwrap(), None); + // batches didn't exists at the time + let batches = dump.batches().unwrap().collect::>>().unwrap(); + meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]"); + // tasks let tasks = dump.tasks().unwrap().collect::>>().unwrap(); let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip(); diff --git a/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap b/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap index 77694a629..3c770a6eb 100644 --- a/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap +++ b/crates/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-5.snap @@ -1,5 +1,5 @@ --- -source: dump/src/reader/mod.rs +source: crates/dump/src/reader/mod.rs expression: vector_index.settings().unwrap() --- { @@ -49,6 +49,7 @@ expression: vector_index.settings().unwrap() "source": "huggingFace", "model": "BAAI/bge-base-en-v1.5", "revision": "617ca489d9e86b49b8167676d8220688b99db36e", + "pooling": "forceMean", "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}" } }, diff --git a/crates/dump/src/reader/v4/meta.rs b/crates/dump/src/reader/v4/meta.rs index 2daea68a4..9b26eba25 100644 --- a/crates/dump/src/reader/v4/meta.rs +++ b/crates/dump/src/reader/v4/meta.rs @@ -108,7 +108,7 @@ where /// not supported on untagged enums. struct StarOrVisitor(PhantomData); - impl<'de, T, FE> Visitor<'de> for StarOrVisitor + impl Visitor<'_> for StarOrVisitor where T: FromStr, FE: Display, diff --git a/crates/dump/src/reader/v4/tasks.rs b/crates/dump/src/reader/v4/tasks.rs index a701d837d..8ae3f77b1 100644 --- a/crates/dump/src/reader/v4/tasks.rs +++ b/crates/dump/src/reader/v4/tasks.rs @@ -99,7 +99,7 @@ impl Task { /// Return true when a task is finished. /// A task is finished when its last state is either `Succeeded` or `Failed`. pub fn is_finished(&self) -> bool { - self.events.last().map_or(false, |event| { + self.events.last().is_some_and(|event| { matches!(event, TaskEvent::Succeded { .. } | TaskEvent::Failed { .. 
}) }) } diff --git a/crates/dump/src/reader/v5/meta.rs b/crates/dump/src/reader/v5/meta.rs index 2daea68a4..9b26eba25 100644 --- a/crates/dump/src/reader/v5/meta.rs +++ b/crates/dump/src/reader/v5/meta.rs @@ -108,7 +108,7 @@ where /// not supported on untagged enums. struct StarOrVisitor(PhantomData); - impl<'de, T, FE> Visitor<'de> for StarOrVisitor + impl Visitor<'_> for StarOrVisitor where T: FromStr, FE: Display, diff --git a/crates/dump/src/reader/v5/tasks.rs b/crates/dump/src/reader/v5/tasks.rs index 8dfb2d0b0..a7352bf0c 100644 --- a/crates/dump/src/reader/v5/tasks.rs +++ b/crates/dump/src/reader/v5/tasks.rs @@ -114,7 +114,7 @@ impl Task { /// Return true when a task is finished. /// A task is finished when its last state is either `Succeeded` or `Failed`. pub fn is_finished(&self) -> bool { - self.events.last().map_or(false, |event| { + self.events.last().is_some_and(|event| { matches!(event, TaskEvent::Succeeded { .. } | TaskEvent::Failed { .. }) }) } diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 50b9751a2..0b4ba5bdd 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -3,6 +3,7 @@ use std::io::{BufRead, BufReader, ErrorKind}; use std::path::Path; pub use meilisearch_types::milli; +use meilisearch_types::milli::vector::hf::OverridePooling; use tempfile::TempDir; use time::OffsetDateTime; use tracing::debug; @@ -18,8 +19,10 @@ pub type Checked = meilisearch_types::settings::Checked; pub type Unchecked = meilisearch_types::settings::Unchecked; pub type Task = crate::TaskDump; +pub type Batch = meilisearch_types::batches::Batch; pub type Key = meilisearch_types::keys::Key; pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures; +pub type Network = meilisearch_types::features::Network; // ===== Other types to clarify the code of the compat module // everything related to the tasks @@ -43,13 +46,17 @@ pub type ResponseError = meilisearch_types::error::ResponseError; pub type Code = meilisearch_types::error::Code; pub type RankingRuleView = meilisearch_types::settings::RankingRuleView; +pub type FilterableAttributesRule = meilisearch_types::milli::FilterableAttributesRule; + pub struct V6Reader { dump: TempDir, instance_uid: Option, metadata: Metadata, tasks: BufReader, + batches: Option>, keys: BufReader, features: Option, + network: Option, } impl V6Reader { @@ -77,13 +84,38 @@ impl V6Reader { } else { None }; + let batches = match File::open(dump.path().join("batches").join("queue.jsonl")) { + Ok(file) => Some(BufReader::new(file)), + // The batch file was only introduced during the v1.13, anything prior to that won't have batches + Err(err) if err.kind() == ErrorKind::NotFound => None, + Err(e) => return Err(e.into()), + }; + + let network_file = match fs::read(dump.path().join("network.json")) { + Ok(network_file) => Some(network_file), + Err(error) => match error.kind() { + // Allows the file to be missing, this will only result in all experimental features disabled. + ErrorKind::NotFound => { + debug!("`network.json` not found in dump"); + None + } + _ => return Err(error.into()), + }, + }; + let network = if let Some(network_file) = network_file { + Some(serde_json::from_reader(&*network_file)?) 
+ } else { + None + }; Ok(V6Reader { metadata: serde_json::from_reader(&*meta_file)?, instance_uid, tasks: BufReader::new(File::open(dump.path().join("tasks").join("queue.jsonl"))?), + batches, keys: BufReader::new(File::open(dump.path().join("keys.jsonl"))?), features, + network, dump, }) } @@ -124,7 +156,7 @@ impl V6Reader { &mut self, ) -> Box>)>> + '_> { Box::new((&mut self.tasks).lines().map(|line| -> Result<_> { - let task: Task = serde_json::from_str(&line?).unwrap(); + let task: Task = serde_json::from_str(&line?)?; let update_file_path = self .dump @@ -136,8 +168,7 @@ impl V6Reader { if update_file_path.exists() { Ok(( task, - Some(Box::new(UpdateFile::new(&update_file_path).unwrap()) - as Box), + Some(Box::new(UpdateFile::new(&update_file_path)?) as Box), )) } else { Ok((task, None)) @@ -145,6 +176,16 @@ impl V6Reader { })) } + pub fn batches(&mut self) -> Box> + '_> { + match self.batches.as_mut() { + Some(batches) => Box::new((batches).lines().map(|line| -> Result<_> { + let batch = serde_json::from_str(&line?)?; + Ok(batch) + })), + None => Box::new(std::iter::empty()) as Box> + '_>, + } + } + pub fn keys(&mut self) -> Box> + '_> { Box::new( (&mut self.keys).lines().map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }), @@ -154,6 +195,10 @@ impl V6Reader { pub fn features(&self) -> Option { self.features } + + pub fn network(&self) -> Option<&Network> { + self.network.as_ref() + } } pub struct UpdateFile { @@ -210,7 +255,29 @@ impl V6IndexReader { } pub fn settings(&mut self) -> Result> { - let settings: Settings = serde_json::from_reader(&mut self.settings)?; + let mut settings: Settings = serde_json::from_reader(&mut self.settings)?; + patch_embedders(&mut settings); Ok(settings.check()) } } + +fn patch_embedders(settings: &mut Settings) { + if let Setting::Set(embedders) = &mut settings.embedders { + for settings in embedders.values_mut() { + let Setting::Set(settings) = &mut settings.inner else { + continue; + }; + if settings.source != Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace) + { + continue; + } + settings.pooling = match settings.pooling { + Setting::Set(pooling) => Setting::Set(pooling), + // if the pooling for a hugging face embedder is not set, force it to `forceMean` + // for backward compatibility with v1.13 + // dumps created in v1.14 and up will have the setting set for hugging face embedders + Setting::Reset | Setting::NotSet => Setting::Set(OverridePooling::ForceMean), + }; + } + } +} diff --git a/crates/dump/src/writer.rs b/crates/dump/src/writer.rs index 3ee51cabf..63b006b5c 100644 --- a/crates/dump/src/writer.rs +++ b/crates/dump/src/writer.rs @@ -4,7 +4,8 @@ use std::path::PathBuf; use flate2::write::GzEncoder; use flate2::Compression; -use meilisearch_types::features::RuntimeTogglableFeatures; +use meilisearch_types::batches::Batch; +use meilisearch_types::features::{Network, RuntimeTogglableFeatures}; use meilisearch_types::keys::Key; use meilisearch_types::settings::{Checked, Settings}; use serde_json::{Map, Value}; @@ -54,6 +55,10 @@ impl DumpWriter { TaskWriter::new(self.dir.path().join("tasks")) } + pub fn create_batches_queue(&self) -> Result { + BatchWriter::new(self.dir.path().join("batches")) + } + pub fn create_experimental_features(&self, features: RuntimeTogglableFeatures) -> Result<()> { Ok(std::fs::write( self.dir.path().join("experimental-features.json"), @@ -61,6 +66,10 @@ impl DumpWriter { )?) 
} + pub fn create_network(&self, network: Network) -> Result<()> { + Ok(std::fs::write(self.dir.path().join("network.json"), serde_json::to_string(&network)?)?) + } + pub fn persist_to(self, mut writer: impl Write) -> Result<()> { let gz_encoder = GzEncoder::new(&mut writer, Compression::default()); let mut tar_encoder = tar::Builder::new(gz_encoder); @@ -84,7 +93,7 @@ impl KeyWriter { } pub fn push_key(&mut self, key: &Key) -> Result<()> { - self.keys.write_all(&serde_json::to_vec(key)?)?; + serde_json::to_writer(&mut self.keys, &key)?; self.keys.write_all(b"\n")?; Ok(()) } @@ -114,7 +123,7 @@ impl TaskWriter { /// Pushes tasks in the dump. /// If the tasks has an associated `update_file` it'll use the `task_id` as its name. pub fn push_task(&mut self, task: &TaskDump) -> Result { - self.queue.write_all(&serde_json::to_vec(task)?)?; + serde_json::to_writer(&mut self.queue, &task)?; self.queue.write_all(b"\n")?; Ok(UpdateFile::new(self.update_files.join(format!("{}.jsonl", task.uid)))) @@ -126,6 +135,30 @@ impl TaskWriter { } } +pub struct BatchWriter { + queue: BufWriter, +} + +impl BatchWriter { + pub(crate) fn new(path: PathBuf) -> Result { + std::fs::create_dir(&path)?; + let queue = File::create(path.join("queue.jsonl"))?; + Ok(BatchWriter { queue: BufWriter::new(queue) }) + } + + /// Pushes batches in the dump. + pub fn push_batch(&mut self, batch: &Batch) -> Result<()> { + serde_json::to_writer(&mut self.queue, &batch)?; + self.queue.write_all(b"\n")?; + Ok(()) + } + + pub fn flush(mut self) -> Result<()> { + self.queue.flush()?; + Ok(()) + } +} + pub struct UpdateFile { path: PathBuf, writer: Option>, @@ -137,8 +170,8 @@ impl UpdateFile { } pub fn push_document(&mut self, document: &Document) -> Result<()> { - if let Some(writer) = self.writer.as_mut() { - writer.write_all(&serde_json::to_vec(document)?)?; + if let Some(mut writer) = self.writer.as_mut() { + serde_json::to_writer(&mut writer, &document)?; writer.write_all(b"\n")?; } else { let file = File::create(&self.path).unwrap(); @@ -205,8 +238,8 @@ pub(crate) mod test { use super::*; use crate::reader::Document; use crate::test::{ - create_test_api_keys, create_test_documents, create_test_dump, create_test_instance_uid, - create_test_settings, create_test_tasks, + create_test_api_keys, create_test_batches, create_test_documents, create_test_dump, + create_test_instance_uid, create_test_settings, create_test_tasks, }; fn create_directory_hierarchy(dir: &Path) -> String { @@ -281,8 +314,10 @@ pub(crate) mod test { let dump_path = dump.path(); // ==== checking global file hierarchy (we want to be sure there isn't too many files or too few) - insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r###" + insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r" . 
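On the writer side, the counterpart is the `BatchWriter` and `create_network` additions above. A hedged usage sketch (the `DumpWriter` constructor argument and the fixture values are assumptions; the method names are the ones introduced in `writer.rs`):

```rust
// Assumed to run inside the dump-creation code path, with `batches` and
// `network` built elsewhere.
let dump = DumpWriter::new(None)?; // constructor assumed from existing usage

let mut batch_queue = dump.create_batches_queue()?; // creates batches/queue.jsonl
for batch in &batches {
    batch_queue.push_batch(batch)?; // one JSON object per line
}
batch_queue.flush()?;

dump.create_network(network)?; // writes network.json at the dump root

let mut file = tempfile::tempfile()?;
dump.persist_to(&mut file)?; // tar + gzip, unchanged
```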
+ ā”œ---- batches/ + │ ā””---- queue.jsonl ā”œ---- indexes/ │ ā””---- doggos/ │ │ ā”œ---- documents.jsonl @@ -295,8 +330,9 @@ pub(crate) mod test { ā”œ---- experimental-features.json ā”œ---- instance_uid.uuid ā”œ---- keys.jsonl - ā””---- metadata.json - "###); + ā”œ---- metadata.json + ā””---- network.json + "); // ==== checking the top level infos let metadata = fs::read_to_string(dump_path.join("metadata.json")).unwrap(); @@ -349,6 +385,16 @@ pub(crate) mod test { } } + // ==== checking the batch queue + let batches_queue = fs::read_to_string(dump_path.join("batches/queue.jsonl")).unwrap(); + for (batch, expected) in batches_queue.lines().zip(create_test_batches()) { + let mut batch = serde_json::from_str::(batch).unwrap(); + if batch.details.settings == Some(Box::new(Settings::::default())) { + batch.details.settings = None; + } + assert_eq!(batch, expected, "{batch:#?}{expected:#?}"); + } + // ==== checking the keys let keys = fs::read_to_string(dump_path.join("keys.jsonl")).unwrap(); for (key, expected) in keys.lines().zip(create_test_api_keys()) { diff --git a/crates/dump/tests/assets/v6-with-network.dump b/crates/dump/tests/assets/v6-with-network.dump new file mode 100644 index 000000000..4d0d9ddc9 Binary files /dev/null and b/crates/dump/tests/assets/v6-with-network.dump differ diff --git a/crates/file-store/Cargo.toml b/crates/file-store/Cargo.toml index 08b7bb717..66ea65336 100644 --- a/crates/file-store/Cargo.toml +++ b/crates/file-store/Cargo.toml @@ -11,7 +11,7 @@ edition.workspace = true license.workspace = true [dependencies] -tempfile = "3.10.1" -thiserror = "1.0.61" -tracing = "0.1.40" -uuid = { version = "1.10.0", features = ["serde", "v4"] } +tempfile = "3.15.0" +thiserror = "2.0.9" +tracing = "0.1.41" +uuid = { version = "1.11.0", features = ["serde", "v4"] } diff --git a/crates/file-store/src/lib.rs b/crates/file-store/src/lib.rs index c8b3849ab..39ed9482b 100644 --- a/crates/file-store/src/lib.rs +++ b/crates/file-store/src/lib.rs @@ -136,6 +136,14 @@ pub struct File { } impl File { + pub fn from_parts(path: PathBuf, file: Option) -> Self { + Self { path, file } + } + + pub fn into_parts(self) -> (PathBuf, Option) { + (self.path, self.file) + } + pub fn dry_file() -> Result { Ok(Self { path: PathBuf::new(), file: None }) } diff --git a/crates/filter-parser/Cargo.toml b/crates/filter-parser/Cargo.toml index 6725a35d1..2657315a4 100644 --- a/crates/filter-parser/Cargo.toml +++ b/crates/filter-parser/Cargo.toml @@ -17,4 +17,5 @@ nom_locate = "4.2.0" unescaper = "0.1.5" [dev-dependencies] -insta = "1.39.0" +# fixed version due to format breakages in v1.40 +insta = "=1.39.0" diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 04b6dc266..0fc007bf1 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -30,6 +30,25 @@ pub enum Condition<'a> { StartsWith { keyword: Token<'a>, word: Token<'a> }, } +impl Condition<'_> { + pub fn operator(&self) -> &str { + match self { + Condition::GreaterThan(_) => ">", + Condition::GreaterThanOrEqual(_) => ">=", + Condition::Equal(_) => "=", + Condition::NotEqual(_) => "!=", + Condition::Null => "IS NULL", + Condition::Empty => "IS EMPTY", + Condition::Exists => "EXISTS", + Condition::LowerThan(_) => "<", + Condition::LowerThanOrEqual(_) => "<=", + Condition::Between { .. } => "TO", + Condition::Contains { .. } => "CONTAINS", + Condition::StartsWith { .. } => "STARTS WITH", + } + } +} + /// condition = value ("==" | ">" ...) 
value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index 122396b87..855ce983e 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -35,7 +35,7 @@ impl NomErrorExt for nom::Err { pub fn cut_with_err<'a, O>( mut parser: impl FnMut(Span<'a>) -> IResult<'a, O>, mut with: impl FnMut(Error<'a>) -> Error<'a>, -) -> impl FnMut(Span<'a>) -> IResult { +) -> impl FnMut(Span<'a>) -> IResult<'a, O> { move |input| match parser.parse(input) { Err(nom::Err::Error(e)) => Err(nom::Err::Failure(with(e))), rest => rest, @@ -121,7 +121,7 @@ impl<'a> ParseError> for Error<'a> { } } -impl<'a> Display for Error<'a> { +impl Display for Error<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let input = self.context.fragment(); // When printing our error message we want to escape all `\n` to be sure we keep our format with the diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index cfe009acb..938702103 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -80,7 +80,7 @@ pub struct Token<'a> { value: Option, } -impl<'a> PartialEq for Token<'a> { +impl PartialEq for Token<'_> { fn eq(&self, other: &Self) -> bool { self.span.fragment() == other.span.fragment() } @@ -179,6 +179,26 @@ impl<'a> FilterCondition<'a> { } } + pub fn fids(&self, depth: usize) -> Box + '_> { + if depth == 0 { + return Box::new(std::iter::empty()); + } + match self { + FilterCondition::Condition { fid, .. } | FilterCondition::In { fid, .. } => { + Box::new(std::iter::once(fid)) + } + FilterCondition::Not(filter) => { + let depth = depth.saturating_sub(1); + filter.fids(depth) + } + FilterCondition::And(subfilters) | FilterCondition::Or(subfilters) => { + let depth = depth.saturating_sub(1); + Box::new(subfilters.iter().flat_map(move |f| f.fids(depth))) + } + _ => Box::new(std::iter::empty()), + } + } + /// Returns the first token found at the specified depth, `None` if no token at this depth. 
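The `Condition::operator()` helper added in `condition.rs` maps a parsed condition back to its surface operator, which is useful for error messages and introspection. A small sketch of the intended use; the `op` field name on `FilterCondition::Condition` is assumed from the rest of the crate:

```rust
use filter_parser::FilterCondition;

let filter = FilterCondition::parse("price > 10").unwrap().unwrap();
if let FilterCondition::Condition { op, .. } = filter {
    assert_eq!(op.operator(), ">"); // Condition::GreaterThan renders as ">"
}
```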
pub fn token_at_depth(&self, depth: usize) -> Option<&Token> { match self { @@ -206,7 +226,7 @@ impl<'a> FilterCondition<'a> { } } - pub fn parse(input: &'a str) -> Result, Error> { + pub fn parse(input: &'a str) -> Result, Error<'a>> { if input.trim().is_empty() { return Ok(None); } @@ -507,7 +527,7 @@ pub fn parse_filter(input: Span) -> IResult { terminated(|input| parse_expression(input, 0), eof)(input) } -impl<'a> std::fmt::Display for FilterCondition<'a> { +impl std::fmt::Display for FilterCondition<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { FilterCondition::Not(filter) => { @@ -556,7 +576,8 @@ impl<'a> std::fmt::Display for FilterCondition<'a> { } } } -impl<'a> std::fmt::Display for Condition<'a> { + +impl std::fmt::Display for Condition<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Condition::GreaterThan(token) => write!(f, "> {token}"), @@ -574,7 +595,8 @@ impl<'a> std::fmt::Display for Condition<'a> { } } } -impl<'a> std::fmt::Display for Token<'a> { + +impl std::fmt::Display for Token<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{{{}}}", self.value()) } @@ -978,6 +1000,43 @@ pub mod tests { assert!(filter.token_at_depth(3).is_none()); } + #[test] + fn fids() { + let filter = Fc::parse("field = value").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(MAX_FILTER_DEPTH).collect(); + assert_eq!(fids.len(), 1); + assert_eq!(fids[0].value(), "field"); + + let filter = Fc::parse("field IN [1, 2, 3]").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(MAX_FILTER_DEPTH).collect(); + assert_eq!(fids.len(), 1); + assert_eq!(fids[0].value(), "field"); + + let filter = Fc::parse("field != value").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(MAX_FILTER_DEPTH).collect(); + assert_eq!(fids.len(), 1); + assert_eq!(fids[0].value(), "field"); + + let filter = Fc::parse("field1 = value1 AND field2 = value2").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(MAX_FILTER_DEPTH).collect(); + assert_eq!(fids.len(), 2); + assert!(fids[0].value() == "field1"); + assert!(fids[1].value() == "field2"); + + let filter = Fc::parse("field1 = value1 OR field2 = value2").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(MAX_FILTER_DEPTH).collect(); + assert_eq!(fids.len(), 2); + assert!(fids[0].value() == "field1"); + assert!(fids[1].value() == "field2"); + + let depth = 2; + let filter = + Fc::parse("field1 = value1 AND (field2 = value2 OR field3 = value3)").unwrap().unwrap(); + let fids: Vec<_> = filter.fids(depth).collect(); + assert_eq!(fids.len(), 1); + assert_eq!(fids[0].value(), "field1"); + } + #[test] fn token_from_str() { let s = "test string that should not be parsed"; diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index 5912f6900..98cac39fe 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -52,7 +52,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { } // word = (alphanumeric | _ | - | .)+ except for reserved keywords -pub fn word_not_keyword<'a>(input: Span<'a>) -> IResult> { +pub fn word_not_keyword<'a>(input: Span<'a>) -> IResult<'a, Token<'a>> { let (input, word): (_, Token<'a>) = take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; if is_keyword(word.value()) { diff --git a/crates/fuzzers/Cargo.toml b/crates/fuzzers/Cargo.toml index 86a0f779d..a838350ba 100644 --- a/crates/fuzzers/Cargo.toml +++ b/crates/fuzzers/Cargo.toml @@ -11,12 +11,12 @@ edition.workspace = 
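The `fids` tests above pin down how the depth argument bounds the walk into nested expressions. A compact restatement of the two behaviours they check:

```rust
use filter_parser::FilterCondition;

let filter = FilterCondition::parse("field1 = value1 AND (field2 = value2 OR field3 = value3)")
    .unwrap()
    .unwrap();

// A generous depth reaches every field in the tree...
let all: Vec<_> = filter.fids(usize::MAX).map(|t| t.value()).collect();
assert_eq!(all, ["field1", "field2", "field3"]);

// ...while depth 2 stops before descending into the parenthesised OR.
let shallow: Vec<_> = filter.fids(2).map(|t| t.value()).collect();
assert_eq!(shallow, ["field1"]);
```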
true license.workspace = true [dependencies] -arbitrary = { version = "1.3.2", features = ["derive"] } +arbitrary = { version = "1.4.1", features = ["derive"] } bumpalo = "3.16.0" -clap = { version = "4.5.9", features = ["derive"] } +clap = { version = "4.5.24", features = ["derive"] } either = "1.13.0" -fastrand = "2.1.0" +fastrand = "2.3.0" milli = { path = "../milli" } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } -tempfile = "3.10.1" +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order"] } +tempfile = "3.15.0" diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index f335938b9..4df989b51 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -10,8 +10,9 @@ use either::Either; use fuzzers::Operation; use milli::documents::mmap_from_objects; use milli::heed::EnvOpenOptions; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig}; +use milli::update::IndexerConfig; use milli::vector::EmbeddingConfigs; use milli::Index; use serde_json::Value; @@ -56,13 +57,14 @@ fn main() { let opt = opt.clone(); let handle = std::thread::spawn(move || { - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(1024 * 1024 * 1024 * 1024); let tempdir = match opt.path { Some(path) => TempDir::new_in(path).unwrap(), None => TempDir::new().unwrap(), }; - let index = Index::new(options, tempdir.path()).unwrap(); + let index = Index::new(options, tempdir.path(), true).unwrap(); let indexer_config = IndexerConfig::default(); std::thread::scope(|s| { @@ -88,9 +90,7 @@ fn main() { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new( - IndexDocumentsMethod::ReplaceDocuments, - ); + let mut indexer = indexer::DocumentOperation::new(); let mut operations = Vec::new(); for op in batch.0 { @@ -114,7 +114,7 @@ fn main() { for op in &operations { match op { Either::Left(documents) => { - indexer.add_documents(documents).unwrap() + indexer.replace_documents(documents).unwrap() } Either::Right(ids) => indexer.delete_documents(ids), } @@ -128,13 +128,14 @@ fn main() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -142,7 +143,7 @@ fn main() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 657dd6dfe..37b3ea835 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -11,41 +11,42 @@ edition.workspace = true license.workspace = true [dependencies] -anyhow = "1.0.86" +anyhow = "1.0.95" bincode = "1.3.3" -csv = "1.3.0" -derive_builder = "0.20.0" +bumpalo = "3.16.0" +bumparaw-collections = "0.1.4" +convert_case = "0.6.0" +csv = "1.3.1" +derive_builder = "0.20.2" dump = { path = "../dump" } enum-iterator = "2.1.0" file-store = { path = "../file-store" } -flate2 = "1.0.30" +flate2 = "1.0.35" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = 
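Both the benches' `base_setup` and the fuzzer above open the index the same new way: the heed environment is switched to `read_txn_without_tls` mode and `Index::new` takes an extra boolean whose meaning is not spelled out in these hunks (every call site here passes `true`). A minimal sketch of the migrated setup, mirroring the fuzzer:

```rust
use milli::heed::EnvOpenOptions;
use milli::Index;
use tempfile::TempDir;

let tempdir = TempDir::new().unwrap();

let options = EnvOpenOptions::new();
let mut options = options.read_txn_without_tls(); // new: read transactions without TLS
options.map_size(1024 * 1024 * 1024 * 1024); // same 1 TiB map size as the fuzzer

// The trailing boolean is new in this diff.
let index = Index::new(options, tempdir.path(), true).unwrap();
```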
"../meilisearch-types" } +memmap2 = "0.9.5" page_size = "0.6.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } +roaring = { version = "0.10.10", features = ["serde"] } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.138", features = ["preserve_order"] } synchronoise = "1.0.1" -tempfile = "3.10.1" -thiserror = "1.0.61" -memmap2 = "0.9.4" -time = { version = "0.3.36", features = [ +tempfile = "3.15.0" +thiserror = "2.0.9" +time = { version = "0.3.37", features = [ "serde-well-known", "formatting", "parsing", "macros", ] } -tracing = "0.1.40" -ureq = "2.10.0" -uuid = { version = "1.10.0", features = ["serde", "v4"] } -bumpalo = "3.16.0" +tracing = "0.1.41" +ureq = "2.12.1" +uuid = { version = "1.11.0", features = ["serde", "v4"] } [dev-dependencies] -arroy = "0.5.0" big_s = "1.0.2" -crossbeam = "0.8.4" -insta = { version = "1.39.0", features = ["json", "redactions"] } +crossbeam-channel = "0.5.14" +# fixed version due to format breakages in v1.40 +insta = { version = "=1.39.0", features = ["json", "redactions"] } maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/crates/index-scheduler/src/autobatcher.rs b/crates/index-scheduler/src/autobatcher.rs deleted file mode 100644 index 0f6aa8a3a..000000000 --- a/crates/index-scheduler/src/autobatcher.rs +++ /dev/null @@ -1,1019 +0,0 @@ -/*! -The autobatcher is responsible for combining the next enqueued -tasks affecting a single index into a [batch](crate::batch::Batch). - -The main function of the autobatcher is [`next_autobatch`]. -*/ - -use std::ops::ControlFlow::{self, Break, Continue}; - -use meilisearch_types::milli::update::IndexDocumentsMethod::{ - self, ReplaceDocuments, UpdateDocuments, -}; -use meilisearch_types::tasks::TaskId; - -use crate::KindWithContent; - -/// Succinctly describes a task's [`Kind`](meilisearch_types::tasks::Kind) -/// for the purpose of simplifying the implementation of the autobatcher. -/// -/// Only the non-prioritised tasks that can be grouped in a batch have a corresponding [`AutobatchKind`] -enum AutobatchKind { - DocumentImport { - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option, - }, - DocumentEdition, - DocumentDeletion { - by_filter: bool, - }, - DocumentClear, - Settings { - allow_index_creation: bool, - }, - IndexCreation, - IndexDeletion, - IndexUpdate, - IndexSwap, -} - -impl AutobatchKind { - #[rustfmt::skip] - fn allow_index_creation(&self) -> Option { - match self { - AutobatchKind::DocumentImport { allow_index_creation, .. } - | AutobatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), - _ => None, - } - } - - fn primary_key(&self) -> Option> { - match self { - AutobatchKind::DocumentImport { primary_key, .. } => Some(primary_key.as_deref()), - _ => None, - } - } -} - -impl From for AutobatchKind { - fn from(kind: KindWithContent) -> Self { - match kind { - KindWithContent::DocumentAdditionOrUpdate { - method, - allow_index_creation, - primary_key, - .. - } => AutobatchKind::DocumentImport { method, allow_index_creation, primary_key }, - KindWithContent::DocumentEdition { .. } => AutobatchKind::DocumentEdition, - KindWithContent::DocumentDeletion { .. 
} => { - AutobatchKind::DocumentDeletion { by_filter: false } - } - KindWithContent::DocumentClear { .. } => AutobatchKind::DocumentClear, - KindWithContent::DocumentDeletionByFilter { .. } => { - AutobatchKind::DocumentDeletion { by_filter: true } - } - KindWithContent::SettingsUpdate { allow_index_creation, is_deletion, .. } => { - AutobatchKind::Settings { - allow_index_creation: allow_index_creation && !is_deletion, - } - } - KindWithContent::IndexDeletion { .. } => AutobatchKind::IndexDeletion, - KindWithContent::IndexCreation { .. } => AutobatchKind::IndexCreation, - KindWithContent::IndexUpdate { .. } => AutobatchKind::IndexUpdate, - KindWithContent::IndexSwap { .. } => AutobatchKind::IndexSwap, - KindWithContent::TaskCancelation { .. } - | KindWithContent::TaskDeletion { .. } - | KindWithContent::DumpCreation { .. } - | KindWithContent::SnapshotCreation => { - panic!("The autobatcher should never be called with tasks that don't apply to an index.") - } - } - } -} - -#[derive(Debug)] -pub enum BatchKind { - DocumentClear { - ids: Vec, - }, - DocumentOperation { - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option, - operation_ids: Vec, - }, - DocumentEdition { - id: TaskId, - }, - DocumentDeletion { - deletion_ids: Vec, - includes_by_filter: bool, - }, - ClearAndSettings { - other: Vec, - allow_index_creation: bool, - settings_ids: Vec, - }, - SettingsAndDocumentOperation { - settings_ids: Vec, - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option, - operation_ids: Vec, - }, - Settings { - allow_index_creation: bool, - settings_ids: Vec, - }, - IndexDeletion { - ids: Vec, - }, - IndexCreation { - id: TaskId, - }, - IndexUpdate { - id: TaskId, - }, - IndexSwap { - id: TaskId, - }, -} - -impl BatchKind { - #[rustfmt::skip] - fn allow_index_creation(&self) -> Option { - match self { - BatchKind::DocumentOperation { allow_index_creation, .. } - | BatchKind::ClearAndSettings { allow_index_creation, .. } - | BatchKind::SettingsAndDocumentOperation { allow_index_creation, .. } - | BatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), - _ => None, - } - } - - fn primary_key(&self) -> Option> { - match self { - BatchKind::DocumentOperation { primary_key, .. } - | BatchKind::SettingsAndDocumentOperation { primary_key, .. } => { - Some(primary_key.as_deref()) - } - _ => None, - } - } -} - -impl BatchKind { - /// Returns a `ControlFlow::Break` if you must stop right now. - /// The boolean tell you if an index has been created by the batched task. - /// To ease the writing of the code. `true` can be returned when you don't need to create an index - /// but false can't be returned if you needs to create an index. 
- // TODO use an AutoBatchKind as input - pub fn new( - task_id: TaskId, - kind: KindWithContent, - primary_key: Option<&str>, - ) -> (ControlFlow, bool) { - use AutobatchKind as K; - - match AutobatchKind::from(kind) { - K::IndexCreation => (Break(BatchKind::IndexCreation { id: task_id }), true), - K::IndexDeletion => (Break(BatchKind::IndexDeletion { ids: vec![task_id] }), false), - K::IndexUpdate => (Break(BatchKind::IndexUpdate { id: task_id }), false), - K::IndexSwap => (Break(BatchKind::IndexSwap { id: task_id }), false), - K::DocumentClear => (Continue(BatchKind::DocumentClear { ids: vec![task_id] }), false), - K::DocumentImport { method, allow_index_creation, primary_key: pk } - if primary_key.is_none() || pk.is_none() || primary_key == pk.as_deref() => - { - ( - Continue(BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key: pk, - operation_ids: vec![task_id], - }), - allow_index_creation, - ) - } - // if the primary key set in the task was different than ours we should stop and make this batch fail asap. - K::DocumentImport { method, allow_index_creation, primary_key } => ( - Break(BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids: vec![task_id], - }), - allow_index_creation, - ), - K::DocumentEdition => (Break(BatchKind::DocumentEdition { id: task_id }), false), - K::DocumentDeletion { by_filter: includes_by_filter } => ( - Continue(BatchKind::DocumentDeletion { - deletion_ids: vec![task_id], - includes_by_filter, - }), - false, - ), - K::Settings { allow_index_creation } => ( - Continue(BatchKind::Settings { allow_index_creation, settings_ids: vec![task_id] }), - allow_index_creation, - ), - } - } - - /// Returns a `ControlFlow::Break` if you must stop right now. - /// The boolean tell you if an index has been created by the batched task. - /// To ease the writing of the code. `true` can be returned when you don't need to create an index - /// but false can't be returned if you needs to create an index. - #[rustfmt::skip] - fn accumulate(self, id: TaskId, kind: AutobatchKind, index_already_exists: bool, primary_key: Option<&str>) -> ControlFlow { - use AutobatchKind as K; - - match (self, kind) { - // We don't batch any of these operations - (this, K::IndexCreation | K::IndexUpdate | K::IndexSwap | K::DocumentEdition) => Break(this), - // We must not batch tasks that don't have the same index creation rights if the index doesn't already exists. - (this, kind) if !index_already_exists && this.allow_index_creation() == Some(false) && kind.allow_index_creation() == Some(true) => { - Break(this) - }, - // NOTE: We need to negate the whole condition since we're checking if we need to break instead of continue. - // I wrote it this way because it's easier to understand than the other way around. - (this, kind) if !( - // 1. If both task don't interact with primary key -> we can continue - (this.primary_key().is_none() && kind.primary_key().is_none()) || - // 2. 
Else -> - ( - // 2.1 If we already have a primary-key -> - ( - primary_key.is_some() && - // 2.1.1 If the task we're trying to accumulate have a pk it must be equal to our primary key - // 2.1.2 If the task don't have a primary-key -> we can continue - kind.primary_key().map_or(true, |pk| pk == primary_key) - ) || - // 2.2 If we don't have a primary-key -> - ( - // 2.2.1 If both the batch and the task have a primary key they should be equal - // 2.2.2 If the batch is set to Some(None), the task should be too - // 2.2.3 If the batch is set to None -> we can continue - this.primary_key().zip(kind.primary_key()).map_or(true, |(this, kind)| this == kind) - ) - ) - - ) // closing the negation - - => { - Break(this) - }, - // The index deletion can batch with everything but must stop after - ( - BatchKind::DocumentClear { mut ids } - | BatchKind::DocumentDeletion { deletion_ids: mut ids, includes_by_filter: _ } - | BatchKind::DocumentOperation { method: _, allow_index_creation: _, primary_key: _, operation_ids: mut ids } - | BatchKind::Settings { allow_index_creation: _, settings_ids: mut ids }, - K::IndexDeletion, - ) => { - ids.push(id); - Break(BatchKind::IndexDeletion { ids }) - } - ( - BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other } - | BatchKind::SettingsAndDocumentOperation { operation_ids: mut ids, method: _, allow_index_creation: _, primary_key: _, settings_ids: mut other }, - K::IndexDeletion, - ) => { - ids.push(id); - ids.append(&mut other); - Break(BatchKind::IndexDeletion { ids }) - } - - ( - BatchKind::DocumentClear { mut ids }, - K::DocumentClear | K::DocumentDeletion { by_filter: _ }, - ) => { - ids.push(id); - Continue(BatchKind::DocumentClear { ids }) - } - ( - this @ BatchKind::DocumentClear { .. }, - K::DocumentImport { .. } | K::Settings { .. }, - ) => Break(this), - ( - BatchKind::DocumentOperation { method: _, allow_index_creation: _, primary_key: _, mut operation_ids }, - K::DocumentClear, - ) => { - operation_ids.push(id); - Continue(BatchKind::DocumentClear { ids: operation_ids }) - } - - // we can autobatch the same kind of document additions / updates - ( - BatchKind::DocumentOperation { method: ReplaceDocuments, allow_index_creation, primary_key: _, mut operation_ids }, - K::DocumentImport { method: ReplaceDocuments, primary_key: pk, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::DocumentOperation { - method: ReplaceDocuments, - allow_index_creation, - operation_ids, - primary_key: pk, - }) - } - ( - BatchKind::DocumentOperation { method: UpdateDocuments, allow_index_creation, primary_key: _, mut operation_ids }, - K::DocumentImport { method: UpdateDocuments, primary_key: pk, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::DocumentOperation { - method: UpdateDocuments, - allow_index_creation, - primary_key: pk, - operation_ids, - }) - } - ( - BatchKind::DocumentOperation { method, allow_index_creation, primary_key, mut operation_ids }, - K::DocumentDeletion { by_filter: false }, - ) => { - operation_ids.push(id); - - Continue(BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids, - }) - } - // We can't batch a document operation with a delete by filter - ( - this @ BatchKind::DocumentOperation { .. }, - K::DocumentDeletion { by_filter: true }, - ) => { - Break(this) - } - // but we can't autobatch documents if it's not the same kind - // this match branch MUST be AFTER the previous one - ( - this @ BatchKind::DocumentOperation { .. 
}, - K::DocumentImport { .. }, - ) => Break(this), - - ( - BatchKind::DocumentOperation { method, allow_index_creation, primary_key, operation_ids }, - K::Settings { .. }, - ) => Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids: vec![id], - method, - allow_index_creation, - primary_key, - operation_ids, - }), - - (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: _ }, K::DocumentClear) => { - deletion_ids.push(id); - Continue(BatchKind::DocumentClear { ids: deletion_ids }) - } - // we can't autobatch the deletion and import if the document deletion contained a filter - ( - this @ BatchKind::DocumentDeletion { deletion_ids: _, includes_by_filter: true }, - K::DocumentImport { .. } - ) => Break(this), - // we can autobatch the deletion and import if the index already exists - ( - BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: false }, - K::DocumentImport { method, allow_index_creation, primary_key } - ) if index_already_exists => { - deletion_ids.push(id); - - Continue(BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids: deletion_ids, - }) - } - // we can autobatch the deletion and import if both can't create an index - ( - BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: false }, - K::DocumentImport { method, allow_index_creation, primary_key } - ) if !allow_index_creation => { - deletion_ids.push(id); - - Continue(BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids: deletion_ids, - }) - } - // we can't autobatch a deletion and an import if the index does not exists but would be created by an addition - ( - this @ BatchKind::DocumentDeletion { .. }, - K::DocumentImport { .. } - ) => { - Break(this) - } - (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter }, K::DocumentDeletion { by_filter }) => { - deletion_ids.push(id); - Continue(BatchKind::DocumentDeletion { deletion_ids, includes_by_filter: includes_by_filter | by_filter }) - } - (this @ BatchKind::DocumentDeletion { .. }, K::Settings { .. }) => Break(this), - - ( - BatchKind::Settings { settings_ids, allow_index_creation }, - K::DocumentClear, - ) => Continue(BatchKind::ClearAndSettings { - settings_ids, - allow_index_creation, - other: vec![id], - }), - ( - this @ BatchKind::Settings { .. }, - K::DocumentImport { .. } | K::DocumentDeletion { .. }, - ) => Break(this), - ( - BatchKind::Settings { mut settings_ids, allow_index_creation }, - K::Settings { .. }, - ) => { - settings_ids.push(id); - Continue(BatchKind::Settings { - allow_index_creation, - settings_ids, - }) - } - - ( - BatchKind::ClearAndSettings { mut other, settings_ids, allow_index_creation }, - K::DocumentClear, - ) => { - other.push(id); - Continue(BatchKind::ClearAndSettings { - other, - settings_ids, - allow_index_creation, - }) - } - (this @ BatchKind::ClearAndSettings { .. }, K::DocumentImport { .. }) => Break(this), - ( - BatchKind::ClearAndSettings { - mut other, - settings_ids, - allow_index_creation, - }, - K::DocumentDeletion { .. }, - ) => { - other.push(id); - Continue(BatchKind::ClearAndSettings { - other, - settings_ids, - allow_index_creation, - }) - } - ( - BatchKind::ClearAndSettings { mut settings_ids, other, allow_index_creation }, - K::Settings { .. 
}, - ) => { - settings_ids.push(id); - Continue(BatchKind::ClearAndSettings { - other, - settings_ids, - allow_index_creation, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: _, mut operation_ids, allow_index_creation, primary_key: _ }, - K::DocumentClear, - ) => { - operation_ids.push(id); - Continue(BatchKind::ClearAndSettings { - settings_ids, - other: operation_ids, - allow_index_creation, - }) - } - - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: ReplaceDocuments, mut operation_ids, allow_index_creation, primary_key: _}, - K::DocumentImport { method: ReplaceDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: ReplaceDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - ( - BatchKind::SettingsAndDocumentOperation { settings_ids, method: UpdateDocuments, allow_index_creation, primary_key: _, mut operation_ids }, - K::DocumentImport { method: UpdateDocuments, primary_key: pk2, .. }, - ) => { - operation_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method: UpdateDocuments, - allow_index_creation, - primary_key: pk2, - operation_ids, - }) - } - // But we can't batch a settings and a doc op with another doc op - // this MUST be AFTER the two previous branch - ( - this @ BatchKind::SettingsAndDocumentOperation { .. }, - K::DocumentDeletion { .. } | K::DocumentImport { .. }, - ) => Break(this), - ( - BatchKind::SettingsAndDocumentOperation { mut settings_ids, method, allow_index_creation,primary_key, operation_ids }, - K::Settings { .. }, - ) => { - settings_ids.push(id); - Continue(BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - }) - } - ( - BatchKind::IndexCreation { .. } - | BatchKind::IndexDeletion { .. } - | BatchKind::IndexUpdate { .. } - | BatchKind::IndexSwap { .. } - | BatchKind::DocumentEdition { .. }, - _, - ) => { - unreachable!() - } - } - } -} - -/// Create a batch from an ordered list of tasks. -/// -/// ## Preconditions -/// 1. The tasks must be enqueued and given in the order in which they were enqueued -/// 2. The tasks must not be prioritised tasks (e.g. task cancellation, dump, snapshot, task deletion) -/// 3. The tasks must all be related to the same index -/// -/// ## Return -/// `None` if the list of tasks is empty. Otherwise, an [`AutoBatch`] that represents -/// a subset of the given tasks. -pub fn autobatch( - enqueued: Vec<(TaskId, KindWithContent)>, - index_already_exists: bool, - primary_key: Option<&str>, -) -> Option<(BatchKind, bool)> { - let mut enqueued = enqueued.into_iter(); - let (id, kind) = enqueued.next()?; - - // index_exist will keep track of if the index should exist at this point after the tasks we batched. - let mut index_exist = index_already_exists; - - let (mut acc, must_create_index) = match BatchKind::new(id, kind, primary_key) { - (Continue(acc), create) => (acc, create), - (Break(acc), create) => return Some((acc, create)), - }; - - // if an index has been created in the previous step we can consider it as existing. 
- index_exist |= must_create_index; - - for (id, kind) in enqueued { - acc = match acc.accumulate(id, kind.into(), index_exist, primary_key) { - Continue(acc) => acc, - Break(acc) => return Some((acc, must_create_index)), - }; - } - - Some((acc, must_create_index)) -} - -#[cfg(test)] -mod tests { - use meilisearch_types::tasks::IndexSwap; - use uuid::Uuid; - - use super::*; - use crate::debug_snapshot; - - fn autobatch_from( - index_already_exists: bool, - primary_key: Option<&str>, - input: impl IntoIterator, - ) -> Option<(BatchKind, bool)> { - autobatch( - input.into_iter().enumerate().map(|(id, kind)| (id as TaskId, kind)).collect(), - index_already_exists, - primary_key, - ) - } - - fn doc_imp( - method: IndexDocumentsMethod, - allow_index_creation: bool, - primary_key: Option<&str>, - ) -> KindWithContent { - KindWithContent::DocumentAdditionOrUpdate { - index_uid: String::from("doggo"), - primary_key: primary_key.map(|pk| pk.to_string()), - method, - content_file: Uuid::new_v4(), - documents_count: 0, - allow_index_creation, - } - } - - fn doc_del() -> KindWithContent { - KindWithContent::DocumentDeletion { - index_uid: String::from("doggo"), - documents_ids: Vec::new(), - } - } - - fn doc_del_fil() -> KindWithContent { - KindWithContent::DocumentDeletionByFilter { - index_uid: String::from("doggo"), - filter_expr: serde_json::json!("cuteness > 100"), - } - } - - fn doc_clr() -> KindWithContent { - KindWithContent::DocumentClear { index_uid: String::from("doggo") } - } - - fn settings(allow_index_creation: bool) -> KindWithContent { - KindWithContent::SettingsUpdate { - index_uid: String::from("doggo"), - new_settings: Default::default(), - is_deletion: false, - allow_index_creation, - } - } - - fn idx_create() -> KindWithContent { - KindWithContent::IndexCreation { index_uid: String::from("doggo"), primary_key: None } - } - - fn idx_update() -> KindWithContent { - KindWithContent::IndexUpdate { index_uid: String::from("doggo"), primary_key: None } - } - - fn idx_del() -> KindWithContent { - KindWithContent::IndexDeletion { index_uid: String::from("doggo") } - } - - fn idx_swap() -> KindWithContent { - KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: (String::from("doggo"), String::from("catto")) }], - } - } - - #[test] - fn autobatch_simple_operation_together() { - // we can autobatch one or multiple `ReplaceDocuments` together. - // if the index exists. - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp( ReplaceDocuments, false , None), doc_imp(ReplaceDocuments, false , None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); - - // if it doesn't exists. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - - // we can autobatch one or multiple `UpdateDocuments` together. - // if the index exists. - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); - - // if it doesn't exists. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); - - // we can autobatch one or multiple DocumentDeletion together - debug_snapshot!(autobatch_from(true, None, [doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_del(), doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del(), doc_del(), doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: false }, false))"); - - // we can autobatch one or multiple DocumentDeletionByFilter together - debug_snapshot!(autobatch_from(true, None, [doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_del_fil(), doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del_fil(), doc_del_fil(), doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: true }, false))"); - - // we can autobatch one or multiple Settings together - debug_snapshot!(autobatch_from(true, None, [settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [settings(true), settings(true), settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))"); - - debug_snapshot!(autobatch_from(false,None, [settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(false,None, [settings(true), settings(true), settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0, 1, 2] }, true))"); - 
debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))"); - - // We can autobatch document addition with document deletion - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] 
}, true))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - // And the other way around - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - 
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); - - // But we can't autobatch document addition with document deletion by filter - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); - 
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); - debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); - // And the other way around - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - } - - #[test] - fn simple_document_operation_dont_autobatch_with_other() { - // addition, updates and deletion by filter can't batch together - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - 
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_create()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_create()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_create()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_create()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_update()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_update()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_update()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_update()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_swap()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_swap()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_swap()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_swap()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); - } - - #[test] - fn document_addition_batch_with_settings() { - // simple case - 
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - - // multiple settings and doc addition - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [2, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - - // addition and setting unordered - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1, 3], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 2] }, true))"); - - // We ensure this kind of batch doesn't batch with forbidden operations - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), 
@"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_swap()]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - } - - #[test] - fn clear_and_additions() { - // these two doesn't need to batch - debug_snapshot!(autobatch_from(true, None, [doc_clr(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentClear { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_clr(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentClear { ids: [0] }, false))"); - - // Basic use case - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); - - // This batch kind doesn't mix with other document addition - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); - - // But you can batch multiple clear together - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr(), doc_clr(), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2, 3, 4] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr(), doc_clr(), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2, 3, 4] }, true))"); - } - - #[test] - fn clear_and_additions_and_settings() { - // A clear don't need to autobatch the settings that happens AFTER there is no documents - debug_snapshot!(autobatch_from(true, None, [doc_clr(), settings(true)]), @"Some((DocumentClear { ids: [0] }, false))"); - - debug_snapshot!(autobatch_from(true, None, [settings(true), doc_clr(), settings(true)]), @"Some((ClearAndSettings { other: [1], allow_index_creation: true, settings_ids: [0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, 
[doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((ClearAndSettings { other: [0, 2], allow_index_creation: true, settings_ids: [1] }, true))"); - } - - #[test] - fn anything_and_index_deletion() { - // The `IndexDeletion` doesn't batch with anything that happens AFTER. - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_del()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_del_fil()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_clr()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), settings(true)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(true, None, [idx_del(), settings(false)]), @"Some((IndexDeletion { ids: [0] }, false))"); - - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_del()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_del_fil()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_clr()]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), settings(true)]), @"Some((IndexDeletion { ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [idx_del(), settings(false)]), @"Some((IndexDeletion { ids: [0] }, false))"); - - // The index deletion can accept almost any type of `BatchKind` and transform it to an `IndexDeletion`. 
- // First, the basic cases - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_del_fil(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); - - // Then the mixed cases. - // The index already exists, whatever is the right of the tasks it shouldn't change the result. 
- debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - - // When the index doesn't exists yet it's more complicated. - // Either the first task we encounter create it, in which case we can create a big batch with everything. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // The right of the tasks following isn't really important. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, true))"); - // Or, the second case; the first task doesn't create the index and thus we wants to batch it with only tasks that can't create an index. - // that can be a second task that don't have the right to create an index. Or anything that can't create an index like an index deletion, document deletion, document clear, etc. - // All theses tasks are going to throw an error `Index doesn't exist` once the batch is processed. - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 2, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(false), doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [1, 3, 0, 2] }, false))"); - // The third and final case is when the first task doesn't create an index but is directly followed by a task creating an index. In this case we can't batch whit what - // follows because we first need to process the erronous batch. 
- debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments,false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), settings(true), doc_clr(), idx_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - } - - #[test] - fn allowed_and_disallowed_index_creation() { - // `DocumentImport` can't be mixed with those disallowed to do so except if the index already exists. - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((SettingsAndDocumentOperation { settings_ids: [1], method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: 
[0] }, true))"); - debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); - - // batch deletion and addition - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); - } - - #[test] - fn autobatch_primary_key() { - // ==> If I have a pk - // With a single update - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - - // With a multiple updates - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other"))]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), 
@"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, 
true))"###); - debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - - // ==> If I don't have a pk - // With a single update - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); - - // With a multiple updates - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id"))]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); - debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); - } -} diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs deleted file mode 100644 index 630471790..000000000 --- a/crates/index-scheduler/src/batch.rs +++ /dev/null @@ -1,1906 +0,0 @@ -/*! -This module handles the creation and processing of batch operations. - -A batch is a combination of multiple tasks that can be processed at once. -Executing a batch operation should always be functionally equivalent to -executing each of its tasks' operations individually and in order. - -For example, if the user sends two tasks: -1. import documents X -2. import documents Y - -We can combine the two tasks in a single batch: -1. import documents X and Y - -Processing this batch is functionally equivalent to processing the two -tasks individually, but should be much faster since we are only performing -one indexing operation. 
-*/ - -use std::collections::{BTreeSet, HashMap, HashSet}; -use std::ffi::OsStr; -use std::fmt; -use std::fs::{self, File}; -use std::io::BufWriter; -use std::sync::atomic::{self, AtomicU64}; -use std::time::Duration; - -use bumpalo::collections::CollectIn; -use bumpalo::Bump; -use dump::IndexMetadata; -use meilisearch_types::batches::BatchId; -use meilisearch_types::error::Code; -use meilisearch_types::heed::{RoTxn, RwTxn}; -use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey}; -use meilisearch_types::milli::heed::CompactionOption; -use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; -use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings}; -use meilisearch_types::milli::vector::parsed_vectors::{ - ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, -}; -use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder}; -use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked}; -use meilisearch_types::tasks::{ - Details, IndexSwap, Kind, KindWithContent, Status, Task, TaskProgress, -}; -use meilisearch_types::{compression, Index, VERSION_FILE_NAME}; -use roaring::RoaringBitmap; -use time::macros::format_description; -use time::OffsetDateTime; -use uuid::Uuid; - -use crate::autobatcher::{self, BatchKind}; -use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch}; -use crate::{Error, IndexScheduler, Result, TaskId}; - -/// Represents a combination of tasks that can all be processed at the same time. -/// -/// A batch contains the set of tasks that it represents (accessible through -/// [`self.ids()`](Batch::ids)), as well as additional information on how to -/// be processed. -#[derive(Debug)] -pub(crate) enum Batch { - TaskCancelation { - /// The task cancelation itself. - task: Task, - }, - TaskDeletions(Vec), - SnapshotCreation(Vec), - Dump(Task), - IndexOperation { - op: IndexOperation, - must_create_index: bool, - }, - IndexCreation { - index_uid: String, - primary_key: Option, - task: Task, - }, - IndexUpdate { - index_uid: String, - primary_key: Option, - task: Task, - }, - IndexDeletion { - index_uid: String, - tasks: Vec, - index_has_been_created: bool, - }, - IndexSwap { - task: Task, - }, -} - -#[derive(Debug)] -pub(crate) enum DocumentOperation { - Add(Uuid), - Delete(Vec), -} - -/// A [batch](Batch) that combines multiple tasks operating on an index. -#[derive(Debug)] -pub(crate) enum IndexOperation { - DocumentOperation { - index_uid: String, - primary_key: Option, - method: IndexDocumentsMethod, - documents_counts: Vec, - operations: Vec, - tasks: Vec, - }, - DocumentEdition { - index_uid: String, - task: Task, - }, - DocumentDeletion { - index_uid: String, - tasks: Vec, - }, - DocumentClear { - index_uid: String, - tasks: Vec, - }, - Settings { - index_uid: String, - // The boolean indicates if it's a settings deletion or creation. - settings: Vec<(bool, Settings)>, - tasks: Vec, - }, - DocumentClearAndSetting { - index_uid: String, - cleared_tasks: Vec, - - // The boolean indicates if it's a settings deletion or creation. - settings: Vec<(bool, Settings)>, - settings_tasks: Vec, - }, - SettingsAndDocumentOperation { - index_uid: String, - - primary_key: Option, - method: IndexDocumentsMethod, - documents_counts: Vec, - operations: Vec, - document_import_tasks: Vec, - - // The boolean indicates if it's a settings deletion or creation. 
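The `Batch` docs above note that the task ids of a batch are accessible through `ids()`, defined just below. As a rough sketch of what that accessor boils down to for a multi-task variant, assuming the `roaring` crate the scheduler already depends on and a bare stand-in `Task` type:

```rust
use roaring::RoaringBitmap;

// Bare stand-in for illustration; the real `Task` carries much more state.
struct Task {
    uid: u32,
}

/// Collect the uid of every task in the batch into a bitmap, roughly what
/// `Batch::ids` does for its multi-task variants.
fn ids(tasks: &[Task]) -> RoaringBitmap {
    tasks.iter().map(|task| task.uid).collect()
}

fn main() {
    let tasks = [Task { uid: 3 }, Task { uid: 7 }];
    assert_eq!(ids(&tasks).len(), 2);
}
```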
- settings: Vec<(bool, Settings)>, - settings_tasks: Vec, - }, -} - -impl Batch { - /// Return the task ids associated with this batch. - pub fn ids(&self) -> RoaringBitmap { - match self { - Batch::TaskCancelation { task, .. } - | Batch::Dump(task) - | Batch::IndexCreation { task, .. } - | Batch::IndexUpdate { task, .. } => { - RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() - } - Batch::SnapshotCreation(tasks) - | Batch::TaskDeletions(tasks) - | Batch::IndexDeletion { tasks, .. } => { - RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) - } - Batch::IndexOperation { op, .. } => match op { - IndexOperation::DocumentOperation { tasks, .. } - | IndexOperation::Settings { tasks, .. } - | IndexOperation::DocumentDeletion { tasks, .. } - | IndexOperation::DocumentClear { tasks, .. } => { - RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) - } - IndexOperation::DocumentEdition { task, .. } => { - RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() - } - IndexOperation::SettingsAndDocumentOperation { - document_import_tasks: tasks, - settings_tasks: other, - .. - } - | IndexOperation::DocumentClearAndSetting { - cleared_tasks: tasks, - settings_tasks: other, - .. - } => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)), - }, - Batch::IndexSwap { task } => { - RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() - } - } - } - - /// Return the index UID associated with this batch - pub fn index_uid(&self) -> Option<&str> { - use Batch::*; - match self { - TaskCancelation { .. } - | TaskDeletions(_) - | SnapshotCreation(_) - | Dump(_) - | IndexSwap { .. } => None, - IndexOperation { op, .. } => Some(op.index_uid()), - IndexCreation { index_uid, .. } - | IndexUpdate { index_uid, .. } - | IndexDeletion { index_uid, .. } => Some(index_uid), - } - } -} - -impl fmt::Display for Batch { - /// A text used when we debug the profiling reports. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let index_uid = self.index_uid(); - let tasks = self.ids(); - match self { - Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?, - Batch::TaskDeletions(_) => f.write_str("TaskDeletion")?, - Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?, - Batch::Dump(_) => f.write_str("Dump")?, - Batch::IndexOperation { op, .. } => write!(f, "{op}")?, - Batch::IndexCreation { .. } => f.write_str("IndexCreation")?, - Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?, - Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?, - Batch::IndexSwap { .. } => f.write_str("IndexSwap")?, - }; - match index_uid { - Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")), - None => f.write_fmt(format_args!(" from tasks: {tasks:?}")), - } - } -} - -impl IndexOperation { - pub fn index_uid(&self) -> &str { - match self { - IndexOperation::DocumentOperation { index_uid, .. } - | IndexOperation::DocumentEdition { index_uid, .. } - | IndexOperation::DocumentDeletion { index_uid, .. } - | IndexOperation::DocumentClear { index_uid, .. } - | IndexOperation::Settings { index_uid, .. } - | IndexOperation::DocumentClearAndSetting { index_uid, .. } - | IndexOperation::SettingsAndDocumentOperation { index_uid, .. } => index_uid, - } - } -} - -impl fmt::Display for IndexOperation { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - IndexOperation::DocumentOperation { .. 
} => { - f.write_str("IndexOperation::DocumentOperation") - } - IndexOperation::DocumentEdition { .. } => { - f.write_str("IndexOperation::DocumentEdition") - } - IndexOperation::DocumentDeletion { .. } => { - f.write_str("IndexOperation::DocumentDeletion") - } - IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"), - IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"), - IndexOperation::DocumentClearAndSetting { .. } => { - f.write_str("IndexOperation::DocumentClearAndSetting") - } - IndexOperation::SettingsAndDocumentOperation { .. } => { - f.write_str("IndexOperation::SettingsAndDocumentOperation") - } - } - } -} - -impl IndexScheduler { - /// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`]. - /// - /// ## Arguments - /// - `rtxn`: read transaction - /// - `index_uid`: name of the index affected by the operations of the autobatch - /// - `batch`: the result of the autobatcher - pub(crate) fn create_next_batch_index( - &self, - rtxn: &RoTxn, - index_uid: String, - batch: BatchKind, - current_batch: &mut ProcessingBatch, - must_create_index: bool, - ) -> Result> { - match batch { - BatchKind::DocumentClear { ids } => Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentClear { - tasks: self.get_existing_tasks_for_processing_batch( - rtxn, - current_batch, - ids, - )?, - index_uid, - }, - must_create_index, - })), - BatchKind::DocumentEdition { id } => { - let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - match &task.kind { - KindWithContent::DocumentEdition { index_uid, .. } => { - Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentEdition { - index_uid: index_uid.clone(), - task, - }, - must_create_index: false, - })) - } - _ => unreachable!(), - } - } - BatchKind::DocumentOperation { method, operation_ids, .. } => { - let tasks = self.get_existing_tasks_for_processing_batch( - rtxn, - current_batch, - operation_ids, - )?; - let primary_key = tasks - .iter() - .find_map(|task| match task.kind { - KindWithContent::DocumentAdditionOrUpdate { ref primary_key, .. } => { - // we want to stop on the first document addition - Some(primary_key.clone()) - } - KindWithContent::DocumentDeletion { .. } => None, - _ => unreachable!(), - }) - .flatten(); - - let mut documents_counts = Vec::new(); - let mut operations = Vec::new(); - - for task in tasks.iter() { - match task.kind { - KindWithContent::DocumentAdditionOrUpdate { - content_file, - documents_count, - .. - } => { - documents_counts.push(documents_count); - operations.push(DocumentOperation::Add(content_file)); - } - KindWithContent::DocumentDeletion { ref documents_ids, .. } => { - documents_counts.push(documents_ids.len() as u64); - operations.push(DocumentOperation::Delete(documents_ids.clone())); - } - _ => unreachable!(), - } - } - - Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - tasks, - }, - must_create_index, - })) - } - BatchKind::DocumentDeletion { deletion_ids, includes_by_filter: _ } => { - let tasks = self.get_existing_tasks_for_processing_batch( - rtxn, - current_batch, - deletion_ids, - )?; - - Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentDeletion { index_uid, tasks }, - must_create_index, - })) - } - BatchKind::Settings { settings_ids, .. 
} => { - let tasks = self.get_existing_tasks_for_processing_batch( - rtxn, - current_batch, - settings_ids, - )?; - - let mut settings = Vec::new(); - for task in &tasks { - match task.kind { - KindWithContent::SettingsUpdate { - ref new_settings, is_deletion, .. - } => settings.push((is_deletion, *new_settings.clone())), - _ => unreachable!(), - } - } - - Ok(Some(Batch::IndexOperation { - op: IndexOperation::Settings { index_uid, settings, tasks }, - must_create_index, - })) - } - BatchKind::ClearAndSettings { other, settings_ids, allow_index_creation } => { - let (index_uid, settings, settings_tasks) = match self - .create_next_batch_index( - rtxn, - index_uid, - BatchKind::Settings { settings_ids, allow_index_creation }, - current_batch, - must_create_index, - )? - .unwrap() - { - Batch::IndexOperation { - op: IndexOperation::Settings { index_uid, settings, tasks, .. }, - .. - } => (index_uid, settings, tasks), - _ => unreachable!(), - }; - let (index_uid, cleared_tasks) = match self - .create_next_batch_index( - rtxn, - index_uid, - BatchKind::DocumentClear { ids: other }, - current_batch, - must_create_index, - )? - .unwrap() - { - Batch::IndexOperation { - op: IndexOperation::DocumentClear { index_uid, tasks }, - .. - } => (index_uid, tasks), - _ => unreachable!(), - }; - - Ok(Some(Batch::IndexOperation { - op: IndexOperation::DocumentClearAndSetting { - index_uid, - cleared_tasks, - settings, - settings_tasks, - }, - must_create_index, - })) - } - BatchKind::SettingsAndDocumentOperation { - settings_ids, - method, - allow_index_creation, - primary_key, - operation_ids, - } => { - let settings = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::Settings { settings_ids, allow_index_creation }, - current_batch, - must_create_index, - )?; - - let document_import = self.create_next_batch_index( - rtxn, - index_uid.clone(), - BatchKind::DocumentOperation { - method, - allow_index_creation, - primary_key, - operation_ids, - }, - current_batch, - must_create_index, - )?; - - match (document_import, settings) { - ( - Some(Batch::IndexOperation { - op: - IndexOperation::DocumentOperation { - primary_key, - documents_counts, - operations, - tasks: document_import_tasks, - .. - }, - .. - }), - Some(Batch::IndexOperation { - op: IndexOperation::Settings { settings, tasks: settings_tasks, .. }, - .. - }), - ) => Ok(Some(Batch::IndexOperation { - op: IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - }, - must_create_index, - })), - _ => unreachable!(), - } - } - BatchKind::IndexCreation { id } => { - let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - let (index_uid, primary_key) = match &task.kind { - KindWithContent::IndexCreation { index_uid, primary_key } => { - (index_uid.clone(), primary_key.clone()) - } - _ => unreachable!(), - }; - Ok(Some(Batch::IndexCreation { index_uid, primary_key, task })) - } - BatchKind::IndexUpdate { id } => { - let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - let primary_key = match &task.kind { - KindWithContent::IndexUpdate { primary_key, .. 
} => primary_key.clone(), - _ => unreachable!(), - }; - Ok(Some(Batch::IndexUpdate { index_uid, primary_key, task })) - } - BatchKind::IndexDeletion { ids } => Ok(Some(Batch::IndexDeletion { - index_uid, - index_has_been_created: must_create_index, - tasks: self.get_existing_tasks_for_processing_batch(rtxn, current_batch, ids)?, - })), - BatchKind::IndexSwap { id } => { - let mut task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - Ok(Some(Batch::IndexSwap { task })) - } - } - } - - /// Create the next batch to be processed; - /// 1. We get the *last* task to cancel. - /// 2. We get the *next* task to delete. - /// 3. We get the *next* snapshot to process. - /// 4. We get the *next* dump to process. - /// 5. We get the *next* tasks to process for a specific index. - #[tracing::instrument(level = "trace", skip(self, rtxn), target = "indexing::scheduler")] - pub(crate) fn create_next_batch( - &self, - rtxn: &RoTxn, - ) -> Result> { - #[cfg(test)] - self.maybe_fail(crate::tests::FailureLocation::InsideCreateBatch)?; - - let batch_id = self.next_batch_id(rtxn)?; - let mut current_batch = ProcessingBatch::new(batch_id); - - let enqueued = &self.get_status(rtxn, Status::Enqueued)?; - let to_cancel = self.get_kind(rtxn, Kind::TaskCancelation)? & enqueued; - - // 1. we get the last task to cancel. - if let Some(task_id) = to_cancel.max() { - let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - return Ok(Some((Batch::TaskCancelation { task }, current_batch))); - } - - // 2. we get the next task to delete - let to_delete = self.get_kind(rtxn, Kind::TaskDeletion)? & enqueued; - if !to_delete.is_empty() { - let mut tasks = self.get_existing_tasks(rtxn, to_delete)?; - current_batch.processing(&mut tasks); - return Ok(Some((Batch::TaskDeletions(tasks), current_batch))); - } - - // 3. we batch the snapshot. - let to_snapshot = self.get_kind(rtxn, Kind::SnapshotCreation)? & enqueued; - if !to_snapshot.is_empty() { - let mut tasks = self.get_existing_tasks(rtxn, to_snapshot)?; - current_batch.processing(&mut tasks); - return Ok(Some((Batch::SnapshotCreation(tasks), current_batch))); - } - - // 4. we batch the dumps. - let to_dump = self.get_kind(rtxn, Kind::DumpCreation)? & enqueued; - if let Some(to_dump) = to_dump.min() { - let mut task = self.get_task(rtxn, to_dump)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - return Ok(Some((Batch::Dump(task), current_batch))); - } - - // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. - let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; - let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - current_batch.processing(Some(&mut task)); - - // If the task is not associated with any index, verify that it is an index swap and - // create the batch directly. 
Otherwise, get the index name associated with the task - // and use the autobatcher to batch the enqueued tasks associated with it - - let index_name = if let Some(&index_name) = task.indexes().first() { - index_name - } else { - assert!(matches!(&task.kind, KindWithContent::IndexSwap { swaps } if swaps.is_empty())); - return Ok(Some((Batch::IndexSwap { task }, current_batch))); - }; - - let index_already_exists = self.index_mapper.exists(rtxn, index_name)?; - let mut primary_key = None; - if index_already_exists { - let index = self.index_mapper.index(rtxn, index_name)?; - let rtxn = index.read_txn()?; - primary_key = index.primary_key(&rtxn)?.map(|pk| pk.to_string()); - } - - let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued; - - // If autobatching is disabled we only take one task at a time. - // Otherwise, we take only a maximum of tasks to create batches. - let tasks_limit = - if self.autobatching_enabled { self.max_number_of_batched_tasks } else { 1 }; - - let enqueued = index_tasks - .into_iter() - .take(tasks_limit) - .map(|task_id| { - self.get_task(rtxn, task_id) - .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) - .map(|task| (task.uid, task.kind)) - }) - .collect::>>()?; - - if let Some((batchkind, create_index)) = - autobatcher::autobatch(enqueued, index_already_exists, primary_key.as_deref()) - { - return Ok(self - .create_next_batch_index( - rtxn, - index_name.to_string(), - batchkind, - &mut current_batch, - create_index, - )? - .map(|batch| (batch, current_batch))); - } - - // If we found no tasks then we were notified for something that got autobatched - // somehow and there is nothing to do. - Ok(None) - } - - /// Apply the operation associated with the given batch. - /// - /// ## Return - /// The list of tasks that were processed. The metadata of each task in the returned - /// list is updated accordingly, with the exception of the its date fields - /// [`finished_at`](meilisearch_types::tasks::Task::finished_at) and [`started_at`](meilisearch_types::tasks::Task::started_at). - #[tracing::instrument(level = "trace", skip(self, batch), target = "indexing::scheduler", fields(batch=batch.to_string()))] - pub(crate) fn process_batch( - &self, - batch: Batch, - current_batch: &mut ProcessingBatch, - ) -> Result> { - #[cfg(test)] - { - self.maybe_fail(crate::tests::FailureLocation::InsideProcessBatch)?; - self.maybe_fail(crate::tests::FailureLocation::PanicInsideProcessBatch)?; - self.breakpoint(crate::Breakpoint::InsideProcessBatch); - } - - match batch { - Batch::TaskCancelation { mut task } => { - // 1. Retrieve the tasks that matched the query at enqueue-time. - let matched_tasks = - if let KindWithContent::TaskCancelation { tasks, query: _ } = &task.kind { - tasks - } else { - unreachable!() - }; - - let rtxn = self.env.read_txn()?; - let mut canceled_tasks = - self.cancel_matched_tasks(&rtxn, task.uid, current_batch, matched_tasks)?; - - task.status = Status::Succeeded; - match &mut task.details { - Some(Details::TaskCancelation { - matched_tasks: _, - canceled_tasks: canceled_tasks_details, - original_filter: _, - }) => { - *canceled_tasks_details = Some(canceled_tasks.len() as u64); - } - _ => unreachable!(), - } - - canceled_tasks.push(task); - - Ok(canceled_tasks) - } - Batch::TaskDeletions(mut tasks) => { - // 1. Retrieve the tasks that matched the query at enqueue-time. 
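The numbered list in the `create_next_batch` documentation above fixes a strict priority between task kinds: cancelations first, then task deletions, snapshots, dumps, and finally regular index tasks. A compact sketch of that ordering, using an illustrative `Kind` enum rather than the scheduler's real one:

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
enum Kind {
    TaskCancelation,
    TaskDeletion,
    SnapshotCreation,
    DumpCreation,
    IndexTask,
}

/// Return the kind of batch to build next from the enqueued task kinds.
fn next_batch_kind(enqueued: &[Kind]) -> Option<Kind> {
    // The order of this list encodes the scheduling priority.
    [
        Kind::TaskCancelation,
        Kind::TaskDeletion,
        Kind::SnapshotCreation,
        Kind::DumpCreation,
        Kind::IndexTask,
    ]
    .into_iter()
    .find(|kind| enqueued.contains(kind))
}

fn main() {
    let enqueued = [Kind::IndexTask, Kind::DumpCreation, Kind::TaskCancelation];
    // Even though it was enqueued last in this example, the cancelation wins.
    assert_eq!(next_batch_kind(&enqueued), Some(Kind::TaskCancelation));
}
```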
- let mut matched_tasks = RoaringBitmap::new(); - - for task in tasks.iter() { - if let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind { - matched_tasks |= tasks; - } else { - unreachable!() - } - } - - let mut wtxn = self.env.write_txn()?; - let mut deleted_tasks = self.delete_matched_tasks(&mut wtxn, &matched_tasks)?; - wtxn.commit()?; - - for task in tasks.iter_mut() { - task.status = Status::Succeeded; - let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind else { - unreachable!() - }; - - let deleted_tasks_count = deleted_tasks.intersection_len(tasks); - deleted_tasks -= tasks; - - match &mut task.details { - Some(Details::TaskDeletion { - matched_tasks: _, - deleted_tasks, - original_filter: _, - }) => { - *deleted_tasks = Some(deleted_tasks_count); - } - _ => unreachable!(), - } - } - Ok(tasks) - } - Batch::SnapshotCreation(mut tasks) => { - fs::create_dir_all(&self.snapshots_path)?; - let temp_snapshot_dir = tempfile::tempdir()?; - - // 1. Snapshot the version file. - let dst = temp_snapshot_dir.path().join(VERSION_FILE_NAME); - fs::copy(&self.version_file_path, dst)?; - - // 2. Snapshot the index-scheduler LMDB env - // - // When we call copy_to_file, LMDB opens a read transaction by itself, - // we can't provide our own. It is an issue as we would like to know - // the update files to copy but new ones can be enqueued between the copy - // of the env and the new transaction we open to retrieve the enqueued tasks. - // So we prefer opening a new transaction after copying the env and copy more - // update files than not enough. - // - // Note that there cannot be any update files deleted between those - // two read operations as the task processing is synchronous. - - // 2.1 First copy the LMDB env of the index-scheduler - let dst = temp_snapshot_dir.path().join("tasks"); - fs::create_dir_all(&dst)?; - self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; - - // 2.2 Create a read transaction on the index-scheduler - let rtxn = self.env.read_txn()?; - - // 2.3 Create the update files directory - let update_files_dir = temp_snapshot_dir.path().join("update_files"); - fs::create_dir_all(&update_files_dir)?; - - // 2.4 Only copy the update files of the enqueued tasks - for task_id in self.get_status(&rtxn, Status::Enqueued)? { - let task = self.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - if let Some(content_uuid) = task.content_uuid() { - let src = self.file_store.get_update_path(content_uuid); - let dst = update_files_dir.join(content_uuid.to_string()); - fs::copy(src, dst)?; - } - } - - // 3. Snapshot every indexes - for result in self.index_mapper.index_mapping.iter(&rtxn)? { - let (name, uuid) = result?; - let index = self.index_mapper.index(&rtxn, name)?; - let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string()); - fs::create_dir_all(&dst)?; - index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; - } - - drop(rtxn); - - // 4. Snapshot the auth LMDB env - let dst = temp_snapshot_dir.path().join("auth"); - fs::create_dir_all(&dst)?; - // TODO We can't use the open_auth_store_env function here but we should - let auth = unsafe { - milli::heed::EnvOpenOptions::new() - .map_size(1024 * 1024 * 1024) // 1 GiB - .max_dbs(2) - .open(&self.auth_path) - }?; - auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; - - // 5. 
Copy and tarball the flat snapshot - // 5.1 Find the original name of the database - // TODO find a better way to get this path - let mut base_path = self.env.path().to_owned(); - base_path.pop(); - let db_name = base_path.file_name().and_then(OsStr::to_str).unwrap_or("data.ms"); - - // 5.2 Tarball the content of the snapshot in a tempfile with a .snapshot extension - let snapshot_path = self.snapshots_path.join(format!("{}.snapshot", db_name)); - let temp_snapshot_file = tempfile::NamedTempFile::new_in(&self.snapshots_path)?; - compression::to_tar_gz(temp_snapshot_dir.path(), temp_snapshot_file.path())?; - let file = temp_snapshot_file.persist(snapshot_path)?; - - // 5.3 Change the permission to make the snapshot readonly - let mut permissions = file.metadata()?.permissions(); - permissions.set_readonly(true); - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - #[allow(clippy::non_octal_unix_permissions)] - // rwxrwxrwx - permissions.set_mode(0b100100100); - } - - file.set_permissions(permissions)?; - - for task in &mut tasks { - task.status = Status::Succeeded; - } - - Ok(tasks) - } - Batch::Dump(mut task) => { - let started_at = OffsetDateTime::now_utc(); - let (keys, instance_uid) = - if let KindWithContent::DumpCreation { keys, instance_uid } = &task.kind { - (keys, instance_uid) - } else { - unreachable!(); - }; - let dump = dump::DumpWriter::new(*instance_uid)?; - - // 1. dump the keys - let mut dump_keys = dump.create_keys()?; - for key in keys { - dump_keys.push_key(key)?; - } - dump_keys.flush()?; - - let rtxn = self.env.read_txn()?; - - // 2. dump the tasks - let mut dump_tasks = dump.create_tasks_queue()?; - for ret in self.all_tasks.iter(&rtxn)? { - if self.must_stop_processing.get() { - return Err(Error::AbortedTask); - } - - let (_, mut t) = ret?; - let status = t.status; - let content_file = t.content_uuid(); - - // In the case we're dumping ourselves we want to be marked as finished - // to not loop over ourselves indefinitely. - if t.uid == task.uid { - let finished_at = OffsetDateTime::now_utc(); - - // We're going to fake the date because we don't know if everything is going to go well. - // But we need to dump the task as finished and successful. - // If something fail everything will be set appropriately in the end. - t.status = Status::Succeeded; - t.started_at = Some(started_at); - t.finished_at = Some(finished_at); - } - let mut dump_content_file = dump_tasks.push_task(&t.into())?; - - // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. - if let Some(content_file) = content_file { - if self.must_stop_processing.get() { - return Err(Error::AbortedTask); - } - if status == Status::Enqueued { - let content_file = self.file_store.get_update(content_file)?; - - let reader = DocumentsBatchReader::from_reader(content_file) - .map_err(milli::Error::from)?; - - let (mut cursor, documents_batch_index) = - reader.into_cursor_and_fields_index(); - - while let Some(doc) = - cursor.next_document().map_err(milli::Error::from)? - { - dump_content_file - .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; - } - dump_content_file.flush()?; - } - } - } - dump_tasks.flush()?; - - // 3. 
Dump the indexes - self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> { - let rtxn = index.read_txn()?; - let metadata = IndexMetadata { - uid: uid.to_owned(), - primary_key: index.primary_key(&rtxn)?.map(String::from), - created_at: index.created_at(&rtxn)?, - updated_at: index.updated_at(&rtxn)?, - }; - let mut index_dumper = dump.create_index(uid, &metadata)?; - - let fields_ids_map = index.fields_ids_map(&rtxn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; - - // 3.1. Dump the documents - for ret in index.all_documents(&rtxn)? { - if self.must_stop_processing.get() { - return Err(Error::AbortedTask); - } - - let (id, doc) = ret?; - - let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?; - - 'inject_vectors: { - let embeddings = index.embeddings(&rtxn, id)?; - - if embeddings.is_empty() { - break 'inject_vectors; - } - - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME.to_owned()) - .or_insert(serde_json::Value::Object(Default::default())); - - let serde_json::Value::Object(vectors) = vectors else { - return Err(milli::Error::UserError( - milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&rtxn, std::iter::once(id)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={id}") - } - }, - value: vectors.clone(), - }, - ) - .into()); - }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); - - let embeddings = ExplicitVectors { - embeddings: Some( - VectorOrArrayOfVectors::from_array_of_vectors(embeddings), - ), - regenerate: !user_provided, - }; - vectors.insert( - embedder_name, - serde_json::to_value(embeddings).unwrap(), - ); - } - } - - index_dumper.push_document(&document)?; - } - - // 3.2. Dump the settings - let settings = meilisearch_types::settings::settings( - index, - &rtxn, - meilisearch_types::settings::SecretPolicy::RevealSecrets, - )?; - index_dumper.settings(&settings)?; - Ok(()) - })?; - - // 4. Dump experimental feature settings - let features = self.features().runtime_features(); - dump.create_experimental_features(features)?; - - let dump_uid = started_at.format(format_description!( - "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" - )).unwrap(); - - if self.must_stop_processing.get() { - return Err(Error::AbortedTask); - } - let path = self.dumps_path.join(format!("{}.dump", dump_uid)); - let file = File::create(path)?; - dump.persist_to(BufWriter::new(file))?; - - // if we reached this step we can tell the scheduler we succeeded to dump ourselves. - task.status = Status::Succeeded; - task.details = Some(Details::Dump { dump_uid: Some(dump_uid) }); - Ok(vec![task]) - } - Batch::IndexOperation { op, must_create_index } => { - let index_uid = op.index_uid().to_string(); - let index = if must_create_index { - // create the index if it doesn't already exist - let wtxn = self.env.write_txn()?; - self.index_mapper.create_index(wtxn, &index_uid, None)? - } else { - let rtxn = self.env.read_txn()?; - self.index_mapper.index(&rtxn, &index_uid)? 
- }; - - // the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick - self.index_mapper - .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); - - let mut index_wtxn = index.write_txn()?; - let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?; - index_wtxn.commit()?; - - // if the update processed successfully, we're going to store the new - // stats of the index. Since the tasks have already been processed and - // this is a non-critical operation. If it fails, we should not fail - // the entire batch. - let res = || -> Result<()> { - let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; - let mut wtxn = self.env.write_txn()?; - self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; - wtxn.commit()?; - Ok(()) - }(); - - match res { - Ok(_) => (), - Err(e) => tracing::error!( - error = &e as &dyn std::error::Error, - "Could not write the stats of the index" - ), - } - - Ok(tasks) - } - Batch::IndexCreation { index_uid, primary_key, task } => { - let wtxn = self.env.write_txn()?; - if self.index_mapper.exists(&wtxn, &index_uid)? { - return Err(Error::IndexAlreadyExists(index_uid)); - } - self.index_mapper.create_index(wtxn, &index_uid, None)?; - - self.process_batch( - Batch::IndexUpdate { index_uid, primary_key, task }, - current_batch, - ) - } - Batch::IndexUpdate { index_uid, primary_key, mut task } => { - let rtxn = self.env.read_txn()?; - let index = self.index_mapper.index(&rtxn, &index_uid)?; - - if let Some(primary_key) = primary_key.clone() { - let mut index_wtxn = index.write_txn()?; - let mut builder = MilliSettings::new( - &mut index_wtxn, - &index, - self.index_mapper.indexer_config(), - ); - builder.set_primary_key(primary_key); - let must_stop_processing = self.must_stop_processing.clone(); - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; - index_wtxn.commit()?; - } - - // drop rtxn before starting a new wtxn on the same db - rtxn.commit()?; - - task.status = Status::Succeeded; - task.details = Some(Details::IndexInfo { primary_key }); - - // if the update processed successfully, we're going to store the new - // stats of the index. Since the tasks have already been processed and - // this is a non-critical operation. If it fails, we should not fail - // the entire batch. - let res = || -> Result<()> { - let mut wtxn = self.env.write_txn()?; - let index_rtxn = index.read_txn()?; - let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?; - self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; - wtxn.commit()?; - Ok(()) - }(); - - match res { - Ok(_) => (), - Err(e) => tracing::error!( - error = &e as &dyn std::error::Error, - "Could not write the stats of the index" - ), - } - - Ok(vec![task]) - } - Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { - let wtxn = self.env.write_txn()?; - - // it's possible that the index doesn't exist - let number_of_documents = || -> Result { - let index = self.index_mapper.index(&wtxn, &index_uid)?; - let index_rtxn = index.read_txn()?; - Ok(index.number_of_documents(&index_rtxn)?) - }() - .unwrap_or_default(); - - // The write transaction is directly owned and committed inside. 
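Both the `IndexOperation` and `IndexUpdate` arms above use the same trick to refresh index stats without endangering an already-successful batch: run the fallible follow-up inside an immediately-invoked closure and merely log its error. A minimal sketch of that pattern, with a hypothetical `refresh_stats` helper standing in for the real stats code:

```rust
use std::io;

// Hypothetical stand-in for the real stats computation and storage.
fn refresh_stats() -> io::Result<()> {
    // ... recompute and persist the index stats here ...
    Ok(())
}

fn finish_batch() {
    // Run the fallible follow-up in an immediately-invoked closure so every `?`
    // inside it is caught here instead of bubbling up and failing the batch.
    let res = (|| -> io::Result<()> {
        refresh_stats()?;
        Ok(())
    })();

    if let Err(e) = res {
        // The tasks were already processed successfully, so only log the problem.
        eprintln!("could not write the stats of the index: {e}");
    }
}

fn main() {
    finish_batch();
}
```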
- match self.index_mapper.delete_index(wtxn, &index_uid) { - Ok(()) => (), - Err(Error::IndexNotFound(_)) if index_has_been_created => (), - Err(e) => return Err(e), - } - - // We set all the tasks details to the default value. - for task in &mut tasks { - task.status = Status::Succeeded; - task.details = match &task.kind { - KindWithContent::IndexDeletion { .. } => { - Some(Details::ClearAll { deleted_documents: Some(number_of_documents) }) - } - otherwise => otherwise.default_finished_details(), - }; - } - - Ok(tasks) - } - Batch::IndexSwap { mut task } => { - let mut wtxn = self.env.write_txn()?; - let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind { - swaps - } else { - unreachable!() - }; - let mut not_found_indexes = BTreeSet::new(); - for IndexSwap { indexes: (lhs, rhs) } in swaps { - for index in [lhs, rhs] { - let index_exists = self.index_mapper.index_exists(&wtxn, index)?; - if !index_exists { - not_found_indexes.insert(index); - } - } - } - if !not_found_indexes.is_empty() { - if not_found_indexes.len() == 1 { - return Err(Error::SwapIndexNotFound( - not_found_indexes.into_iter().next().unwrap().clone(), - )); - } else { - return Err(Error::SwapIndexesNotFound( - not_found_indexes.into_iter().cloned().collect(), - )); - } - } - for swap in swaps { - self.apply_index_swap(&mut wtxn, task.uid, &swap.indexes.0, &swap.indexes.1)?; - } - wtxn.commit()?; - task.status = Status::Succeeded; - Ok(vec![task]) - } - } - } - - /// Swap the index `lhs` with the index `rhs`. - fn apply_index_swap(&self, wtxn: &mut RwTxn, task_id: u32, lhs: &str, rhs: &str) -> Result<()> { - // 1. Verify that both lhs and rhs are existing indexes - let index_lhs_exists = self.index_mapper.index_exists(wtxn, lhs)?; - if !index_lhs_exists { - return Err(Error::IndexNotFound(lhs.to_owned())); - } - let index_rhs_exists = self.index_mapper.index_exists(wtxn, rhs)?; - if !index_rhs_exists { - return Err(Error::IndexNotFound(rhs.to_owned())); - } - - // 2. Get the task set for index = name that appeared before the index swap task - let mut index_lhs_task_ids = self.index_tasks(wtxn, lhs)?; - index_lhs_task_ids.remove_range(task_id..); - let mut index_rhs_task_ids = self.index_tasks(wtxn, rhs)?; - index_rhs_task_ids.remove_range(task_id..); - - // 3. before_name -> new_name in the task's KindWithContent - for task_id in &index_lhs_task_ids | &index_rhs_task_ids { - let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - swap_index_uid_in_task(&mut task, (lhs, rhs)); - self.all_tasks.put(wtxn, &task_id, &task)?; - } - - // 4. remove the task from indexuid = before_name - // 5. add the task to indexuid = after_name - self.update_index(wtxn, lhs, |lhs_tasks| { - *lhs_tasks -= &index_lhs_task_ids; - *lhs_tasks |= &index_rhs_task_ids; - })?; - self.update_index(wtxn, rhs, |rhs_tasks| { - *rhs_tasks -= &index_rhs_task_ids; - *rhs_tasks |= &index_lhs_task_ids; - })?; - - // 6. Swap in the index mapper - self.index_mapper.swap(wtxn, lhs, rhs)?; - - Ok(()) - } - - /// Process the index operation on the given index. - /// - /// ## Return - /// The list of processed tasks. 
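As a rough sketch of the task-set bookkeeping performed by `apply_index_swap` above, here is the same idea expressed on a plain map of sets instead of the LMDB reverse indexes: only the tasks enqueued before the swap task itself change hands. The types and the panicking lookups are illustrative simplifications; the real code returns `IndexNotFound` instead.

```rust
use std::collections::{BTreeMap, BTreeSet};

fn swap_tasks(
    index_tasks: &mut BTreeMap<&'static str, BTreeSet<u32>>,
    swap_task_id: u32,
    lhs: &'static str,
    rhs: &'static str,
) {
    // Keep only the tasks enqueued before the swap task (step 2 above).
    let lhs_ids: BTreeSet<u32> =
        index_tasks[lhs].iter().copied().filter(|&id| id < swap_task_id).collect();
    let rhs_ids: BTreeSet<u32> =
        index_tasks[rhs].iter().copied().filter(|&id| id < swap_task_id).collect();

    // Move each set under the other index name (steps 4 and 5 above).
    let lhs_set = index_tasks.get_mut(lhs).unwrap();
    for id in &lhs_ids {
        lhs_set.remove(id);
    }
    lhs_set.extend(&rhs_ids);

    let rhs_set = index_tasks.get_mut(rhs).unwrap();
    for id in &rhs_ids {
        rhs_set.remove(id);
    }
    rhs_set.extend(&lhs_ids);
}

fn main() {
    let mut index_tasks = BTreeMap::from([
        ("catto", BTreeSet::from([0u32, 1])),
        ("doggo", BTreeSet::from([2u32])),
    ]);
    swap_tasks(&mut index_tasks, 3, "catto", "doggo");
    assert_eq!(index_tasks["catto"], BTreeSet::from([2]));
    assert_eq!(index_tasks["doggo"], BTreeSet::from([0, 1]));
}
```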
- #[tracing::instrument( - level = "trace", - skip(self, index_wtxn, index), - target = "indexing::scheduler" - )] - fn apply_index_operation<'i>( - &self, - index_wtxn: &mut RwTxn<'i>, - index: &'i Index, - operation: IndexOperation, - ) -> Result> { - let indexer_alloc = Bump::new(); - - let started_processing_at = std::time::Instant::now(); - let secs_since_started_processing_at = AtomicU64::new(0); - const PRINT_SECS_DELTA: u64 = 5; - - let processing_tasks = self.processing_tasks.clone(); - let must_stop_processing = self.must_stop_processing.clone(); - let send_progress = |progress| { - let now = std::time::Instant::now(); - let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed); - let previous = started_processing_at + Duration::from_secs(elapsed); - let elapsed = now - previous; - - if elapsed.as_secs() < PRINT_SECS_DELTA { - return; - } - - secs_since_started_processing_at - .store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed); - - let TaskProgress { - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps, - } = processing_tasks.write().unwrap().update_progress(progress); - - tracing::info!( - current_step, - finished_steps, - total_steps, - finished_substeps, - total_substeps - ); - }; - - match operation { - IndexOperation::DocumentClear { mut tasks, .. } => { - let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?; - - let mut first_clear_found = false; - for task in &mut tasks { - task.status = Status::Succeeded; - // The first document clear will effectively delete every documents - // in the database but the next ones will clear 0 documents. - task.details = match &task.kind { - KindWithContent::DocumentClear { .. } => { - let count = if first_clear_found { 0 } else { count }; - first_clear_found = true; - Some(Details::ClearAll { deleted_documents: Some(count) }) - } - otherwise => otherwise.default_details(), - }; - } - - Ok(tasks) - } - IndexOperation::DocumentOperation { - index_uid: _, - primary_key, - method, - documents_counts: _, - operations, - mut tasks, - } => { - // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. - // this is made difficult by the fact we're doing private clones of the index scheduler and sending it - // to a fresh thread. - let mut content_files = Vec::new(); - for operation in &operations { - if let DocumentOperation::Add(content_uuid) = operation { - let content_file = self.file_store.get_update(*content_uuid)?; - let mmap = unsafe { memmap2::Mmap::map(&content_file)? 
}; - if !mmap.is_empty() { - content_files.push(mmap); - } - } - } - - let rtxn = index.read_txn()?; - let db_fields_ids_map = index.fields_ids_map(&rtxn)?; - let mut new_fields_ids_map = db_fields_ids_map.clone(); - - let mut content_files_iter = content_files.iter(); - let mut indexer = indexer::DocumentOperation::new(method); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; - for operation in operations { - match operation { - DocumentOperation::Add(_content_uuid) => { - let mmap = content_files_iter.next().unwrap(); - indexer.add_documents(mmap)?; - } - DocumentOperation::Delete(document_ids) => { - let document_ids: bumpalo::collections::vec::Vec<_> = document_ids - .iter() - .map(|s| &*indexer_alloc.alloc_str(s)) - .collect_in(&indexer_alloc); - indexer.delete_documents(document_ids.into_bump_slice()); - } - } - } - - let local_pool; - let indexer_config = self.index_mapper.indexer_config(); - let pool = match &indexer_config.thread_pool { - Some(pool) => pool, - None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); - &local_pool - } - }; - - let (document_changes, operation_stats, primary_key) = indexer.into_changes( - &indexer_alloc, - index, - &rtxn, - primary_key.as_deref(), - &mut new_fields_ids_map, - &|| must_stop_processing.get(), - &send_progress, - )?; - - let mut addition = 0; - for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { - addition += stats.document_count; - match stats.error { - Some(error) => { - task.status = Status::Failed; - task.error = Some(milli::Error::UserError(error).into()); - } - None => task.status = Status::Succeeded, - } - - task.details = match task.details { - Some(Details::DocumentAdditionOrUpdate { received_documents, .. }) => { - Some(Details::DocumentAdditionOrUpdate { - received_documents, - indexed_documents: Some(stats.document_count), - }) - } - Some(Details::DocumentDeletion { provided_ids, .. }) => { - Some(Details::DocumentDeletion { - provided_ids, - deleted_documents: Some(stats.document_count), - }) - } - _ => { - // In the case of a `documentAdditionOrUpdate` or `DocumentDeletion` - // the details MUST be set to either addition or deletion - unreachable!(); - } - } - } - - if tasks.iter().any(|res| res.error.is_none()) { - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - primary_key, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; - - tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); - } - - Ok(tasks) - } - IndexOperation::DocumentEdition { mut task, .. } => { - let (filter, code) = if let KindWithContent::DocumentEdition { - filter_expr, - context: _, - function, - .. - } = &task.kind - { - (filter_expr, function) - } else { - unreachable!() - }; - - let candidates = match filter.as_ref().map(Filter::from_json) { - Some(Ok(Some(filter))) => { - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { - Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter) - } - e => e.into(), - })? - } - None | Some(Ok(None)) => index.documents_ids(index_wtxn)?, - Some(Err(e)) => return Err(e.into()), - }; - - let (original_filter, context, function) = if let Some(Details::DocumentEdition { - original_filter, - context, - function, - .. 
- }) = task.details - { - (original_filter, context, function) - } else { - // In the case of a `documentEdition` the details MUST be set - unreachable!(); - }; - - if candidates.is_empty() { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentEdition { - original_filter, - context, - function, - deleted_documents: Some(0), - edited_documents: Some(0), - }); - - return Ok(vec![task]); - } - - let rtxn = index.read_txn()?; - let db_fields_ids_map = index.fields_ids_map(&rtxn)?; - let mut new_fields_ids_map = db_fields_ids_map.clone(); - // candidates not empty => index not empty => a primary key is set - let primary_key = index.primary_key(&rtxn)?.unwrap(); - - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; - - let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; - - if task.error.is_none() { - let local_pool; - let indexer_config = self.index_mapper.indexer_config(); - let pool = match &indexer_config.thread_pool { - Some(pool) => pool, - None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); - &local_pool - } - }; - - pool.install(|| { - let indexer = - UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = indexer.into_changes(&primary_key)?; - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; - - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // cannot change primary key in DocumentEdition - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - )?; - - Result::Ok(()) - }) - .unwrap()?; - - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); - } - - match result_count { - Ok((deleted_documents, edited_documents)) => { - task.status = Status::Succeeded; - task.details = Some(Details::DocumentEdition { - original_filter, - context, - function, - deleted_documents: Some(deleted_documents), - edited_documents: Some(edited_documents), - }); - } - Err(e) => { - task.status = Status::Failed; - task.details = Some(Details::DocumentEdition { - original_filter, - context, - function, - deleted_documents: Some(0), - edited_documents: Some(0), - }); - task.error = Some(e.into()); - } - } - - Ok(vec![task]) - } - IndexOperation::DocumentDeletion { mut tasks, index_uid: _ } => { - let mut to_delete = RoaringBitmap::new(); - let external_documents_ids = index.external_documents_ids(); - - for task in tasks.iter_mut() { - let before = to_delete.len(); - task.status = Status::Succeeded; - - match &task.kind { - KindWithContent::DocumentDeletion { index_uid: _, documents_ids } => { - for id in documents_ids { - if let Some(id) = external_documents_ids.get(index_wtxn, id)? 
{ - to_delete.insert(id); - } - } - let will_be_removed = to_delete.len() - before; - task.details = Some(Details::DocumentDeletion { - provided_ids: documents_ids.len(), - deleted_documents: Some(will_be_removed), - }); - } - KindWithContent::DocumentDeletionByFilter { index_uid: _, filter_expr } => { - let before = to_delete.len(); - let filter = match Filter::from_json(filter_expr) { - Ok(filter) => filter, - Err(err) => { - // theorically, this should be catched by deserr before reaching the index-scheduler and cannot happens - task.status = Status::Failed; - task.error = match err { - milli::Error::UserError( - milli::UserError::InvalidFilterExpression { .. }, - ) => Some( - Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter) - .into(), - ), - e => Some(e.into()), - }; - None - } - }; - if let Some(filter) = filter { - let candidates = - filter.evaluate(index_wtxn, index).map_err(|err| match err { - milli::Error::UserError( - milli::UserError::InvalidFilter(_), - ) => Error::from(err) - .with_custom_error_code(Code::InvalidDocumentFilter), - e => e.into(), - }); - match candidates { - Ok(candidates) => to_delete |= candidates, - Err(err) => { - task.status = Status::Failed; - task.error = Some(err.into()); - } - }; - } - let will_be_removed = to_delete.len() - before; - if let Some(Details::DocumentDeletionByFilter { - original_filter: _, - deleted_documents, - }) = &mut task.details - { - *deleted_documents = Some(will_be_removed); - } else { - // In the case of a `documentDeleteByFilter` the details MUST be set - unreachable!() - } - } - _ => unreachable!(), - } - } - - if to_delete.is_empty() { - return Ok(tasks); - } - - let rtxn = index.read_txn()?; - let db_fields_ids_map = index.fields_ids_map(&rtxn)?; - let mut new_fields_ids_map = db_fields_ids_map.clone(); - - // to_delete not empty => index not empty => primary key set - let primary_key = index.primary_key(&rtxn)?.unwrap(); - - let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) - .map_err(milli::Error::from)?; - - if !tasks.iter().all(|res| res.error.is_some()) { - let local_pool; - let indexer_config = self.index_mapper.indexer_config(); - let pool = match &indexer_config.thread_pool { - Some(pool) => pool, - None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); - &local_pool - } - }; - - let mut indexer = indexer::DocumentDeletion::new(); - indexer.delete_documents_by_docids(to_delete); - let document_changes = indexer.into_changes(&indexer_alloc, primary_key); - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; - - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // document deletion never changes primary key - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; - - // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); - } - - Ok(tasks) - } - IndexOperation::Settings { index_uid: _, settings, mut tasks } => { - let indexer_config = self.index_mapper.indexer_config(); - let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config); - - for (task, (_, settings)) in tasks.iter_mut().zip(settings) { - let checked_settings = settings.clone().check(); - task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) }); - 
apply_settings_to_builder(&checked_settings, &mut builder); - - // We can apply the status right now and if an update fail later - // the whole batch will be marked as failed. - task.status = Status::Succeeded; - } - - builder.execute( - |indexing_step| tracing::debug!(update = ?indexing_step), - || must_stop_processing.get(), - )?; - - Ok(tasks) - } - IndexOperation::SettingsAndDocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - document_import_tasks, - settings, - settings_tasks, - } => { - let settings_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::Settings { - index_uid: index_uid.clone(), - settings, - tasks: settings_tasks, - }, - )?; - - let mut import_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::DocumentOperation { - index_uid, - primary_key, - method, - documents_counts, - operations, - tasks: document_import_tasks, - }, - )?; - - let mut tasks = settings_tasks; - tasks.append(&mut import_tasks); - Ok(tasks) - } - IndexOperation::DocumentClearAndSetting { - index_uid, - cleared_tasks, - settings, - settings_tasks, - } => { - let mut import_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::DocumentClear { - index_uid: index_uid.clone(), - tasks: cleared_tasks, - }, - )?; - - let settings_tasks = self.apply_index_operation( - index_wtxn, - index, - IndexOperation::Settings { index_uid, settings, tasks: settings_tasks }, - )?; - - let mut tasks = settings_tasks; - tasks.append(&mut import_tasks); - Ok(tasks) - } - } - } - - /// Delete each given task from all the databases (if it is deleteable). - /// - /// Return the number of tasks that were actually deleted. - fn delete_matched_tasks( - &self, - wtxn: &mut RwTxn, - matched_tasks: &RoaringBitmap, - ) -> Result { - // 1. Remove from this list the tasks that we are not allowed to delete - let enqueued_tasks = self.get_status(wtxn, Status::Enqueued)?; - let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone(); - - let all_task_ids = self.all_task_ids(wtxn)?; - let mut to_delete_tasks = all_task_ids & matched_tasks; - to_delete_tasks -= processing_tasks; - to_delete_tasks -= enqueued_tasks; - - // 2. We now have a list of tasks to delete, delete them - - let mut affected_indexes = HashSet::new(); - let mut affected_statuses = HashSet::new(); - let mut affected_kinds = HashSet::new(); - let mut affected_canceled_by = RoaringBitmap::new(); - // The tasks that have been removed *per batches*. - let mut affected_batches: HashMap = HashMap::new(); - - for task_id in to_delete_tasks.iter() { - let task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; - - affected_indexes.extend(task.indexes().into_iter().map(|x| x.to_owned())); - affected_statuses.insert(task.status); - affected_kinds.insert(task.kind.as_kind()); - // Note: don't delete the persisted task data since - // we can only delete succeeded, failed, and canceled tasks. - // In each of those cases, the persisted data is supposed to - // have been deleted already. 
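The first step of `delete_matched_tasks` above narrows the matched set down to tasks that may actually be deleted: anything still enqueued or still processing is protected. Expressed on its own, assuming the `roaring` crate the scheduler already uses, that filtering is plain bitmap arithmetic:

```rust
use roaring::RoaringBitmap;

fn deletable_tasks(
    all_tasks: &RoaringBitmap,
    matched: &RoaringBitmap,
    enqueued: &RoaringBitmap,
    processing: &RoaringBitmap,
) -> RoaringBitmap {
    // Start from the tasks that both exist and were matched by the query...
    let mut to_delete = all_tasks & matched;
    // ...then protect anything still processing or still enqueued.
    to_delete -= processing;
    to_delete -= enqueued;
    to_delete
}

fn main() {
    let all: RoaringBitmap = (0u32..4).collect();
    let matched: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let enqueued: RoaringBitmap = [3u32].into_iter().collect();
    let processing: RoaringBitmap = [2u32].into_iter().collect();
    let expected: RoaringBitmap = [1u32].into_iter().collect();
    assert_eq!(deletable_tasks(&all, &matched, &enqueued, &processing), expected);
}
```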
- utils::remove_task_datetime(wtxn, self.enqueued_at, task.enqueued_at, task.uid)?; - if let Some(started_at) = task.started_at { - utils::remove_task_datetime(wtxn, self.started_at, started_at, task.uid)?; - } - if let Some(finished_at) = task.finished_at { - utils::remove_task_datetime(wtxn, self.finished_at, finished_at, task.uid)?; - } - if let Some(canceled_by) = task.canceled_by { - affected_canceled_by.insert(canceled_by); - } - if let Some(batch_uid) = task.batch_uid { - affected_batches.entry(batch_uid).or_default().insert(task_id); - } - } - - for index in affected_indexes.iter() { - self.update_index(wtxn, index, |bitmap| *bitmap -= &to_delete_tasks)?; - } - - for status in affected_statuses.iter() { - self.update_status(wtxn, *status, |bitmap| *bitmap -= &to_delete_tasks)?; - } - - for kind in affected_kinds.iter() { - self.update_kind(wtxn, *kind, |bitmap| *bitmap -= &to_delete_tasks)?; - } - - for task in to_delete_tasks.iter() { - self.all_tasks.delete(wtxn, &task)?; - } - for canceled_by in affected_canceled_by { - if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? { - tasks -= &to_delete_tasks; - if tasks.is_empty() { - self.canceled_by.delete(wtxn, &canceled_by)?; - } else { - self.canceled_by.put(wtxn, &canceled_by, &tasks)?; - } - } - } - for (batch_id, to_delete_tasks) in affected_batches { - if let Some(mut tasks) = self.batch_to_tasks_mapping.get(wtxn, &batch_id)? { - tasks -= &to_delete_tasks; - // We must remove the batch entirely - if tasks.is_empty() { - self.all_batches.delete(wtxn, &batch_id)?; - self.batch_to_tasks_mapping.delete(wtxn, &batch_id)?; - } - // Anyway, we must remove the batch from all its reverse indexes. - // The only way to do that is to check - - for index in affected_indexes.iter() { - let index_tasks = self.index_tasks(wtxn, index)?; - let remaining_index_tasks = index_tasks & &tasks; - if remaining_index_tasks.is_empty() { - self.update_batch_index(wtxn, index, |bitmap| { - bitmap.remove(batch_id); - })?; - } - } - - for status in affected_statuses.iter() { - let status_tasks = self.get_status(wtxn, *status)?; - let remaining_status_tasks = status_tasks & &tasks; - if remaining_status_tasks.is_empty() { - self.update_batch_status(wtxn, *status, |bitmap| { - bitmap.remove(batch_id); - })?; - } - } - - for kind in affected_kinds.iter() { - let kind_tasks = self.get_kind(wtxn, *kind)?; - let remaining_kind_tasks = kind_tasks & &tasks; - if remaining_kind_tasks.is_empty() { - self.update_batch_kind(wtxn, *kind, |bitmap| { - bitmap.remove(batch_id); - })?; - } - } - } - } - - Ok(to_delete_tasks) - } - - /// Cancel each given task from all the databases (if it is cancelable). - /// - /// Returns the list of tasks that matched the filter and must be written in the database. - fn cancel_matched_tasks( - &self, - rtxn: &RoTxn, - cancel_task_id: TaskId, - current_batch: &mut ProcessingBatch, - matched_tasks: &RoaringBitmap, - ) -> Result> { - // 1. Remove from this list the tasks that we are not allowed to cancel - // Notice that only the _enqueued_ ones are cancelable and we should - // have already aborted the indexation of the _processing_ ones - let cancelable_tasks = self.get_status(rtxn, Status::Enqueued)?; - let tasks_to_cancel = cancelable_tasks & matched_tasks; - - // 2. 
We now have a list of tasks to cancel, cancel them - let mut tasks = self.get_existing_tasks(rtxn, tasks_to_cancel.iter())?; - - for task in tasks.iter_mut() { - task.status = Status::Canceled; - task.canceled_by = Some(cancel_task_id); - task.details = task.details.as_ref().map(|d| d.to_failed()); - current_batch.processing(Some(task)); - } - - Ok(tasks) - } -} diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs new file mode 100644 index 000000000..ca26e50c8 --- /dev/null +++ b/crates/index-scheduler/src/dump.rs @@ -0,0 +1,304 @@ +use std::collections::HashMap; +use std::io; + +use dump::{KindDump, TaskDump, UpdateFile}; +use meilisearch_types::batches::{Batch, BatchId}; +use meilisearch_types::heed::RwTxn; +use meilisearch_types::milli; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use roaring::RoaringBitmap; +use uuid::Uuid; + +use crate::{utils, Error, IndexScheduler, Result}; + +pub struct Dump<'a> { + index_scheduler: &'a IndexScheduler, + wtxn: RwTxn<'a>, + + batch_to_task_mapping: HashMap, + + indexes: HashMap, + statuses: HashMap, + kinds: HashMap, + + batch_indexes: HashMap, + batch_statuses: HashMap, + batch_kinds: HashMap, +} + +impl<'a> Dump<'a> { + pub(crate) fn new(index_scheduler: &'a mut IndexScheduler) -> Result { + // While loading a dump no one should be able to access the scheduler thus I can block everything. + let wtxn = index_scheduler.env.write_txn()?; + + Ok(Dump { + index_scheduler, + wtxn, + batch_to_task_mapping: HashMap::new(), + indexes: HashMap::new(), + statuses: HashMap::new(), + kinds: HashMap::new(), + batch_indexes: HashMap::new(), + batch_statuses: HashMap::new(), + batch_kinds: HashMap::new(), + }) + } + + /// Register a new batch coming from a dump in the scheduler. + /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. + pub fn register_dumped_batch(&mut self, batch: Batch) -> Result<()> { + self.index_scheduler.queue.batches.all_batches.put(&mut self.wtxn, &batch.uid, &batch)?; + if let Some(enqueued_at) = batch.enqueued_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.enqueued_at, + enqueued_at.earliest, + batch.uid, + )?; + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.enqueued_at, + enqueued_at.oldest, + batch.uid, + )?; + } + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.started_at, + batch.started_at, + batch.uid, + )?; + if let Some(finished_at) = batch.finished_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.batches.finished_at, + finished_at, + batch.uid, + )?; + } + + for index in batch.stats.index_uids.keys() { + match self.batch_indexes.get_mut(index) { + Some(bitmap) => { + bitmap.insert(batch.uid); + } + None => { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(batch.uid); + self.batch_indexes.insert(index.to_string(), bitmap); + } + }; + } + + for status in batch.stats.status.keys() { + self.batch_statuses.entry(*status).or_default().insert(batch.uid); + } + for kind in batch.stats.types.keys() { + self.batch_kinds.entry(*kind).or_default().insert(batch.uid); + } + + Ok(()) + } + + /// Register a new task coming from a dump in the scheduler. + /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. 
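`register_dumped_batch` above fills in-memory reverse indexes (per index, per status, per kind) that `finish` later flushes to LMDB in one pass. A small sketch of that `entry(..).or_default().insert(..)` bookkeeping, with a simplified `Status` enum standing in for the real one and assuming the `roaring` crate:

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

// Simplified stand-in for the real status enum.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
enum Status {
    Enqueued,
    Succeeded,
    Failed,
}

/// Record one batch uid under every status it contains, roughly what
/// `register_dumped_batch` does for its `batch_statuses` map.
fn record_batch(
    batch_statuses: &mut HashMap<Status, RoaringBitmap>,
    batch_uid: u32,
    statuses: &[Status],
) {
    for status in statuses {
        // `or_default` creates an empty bitmap the first time a status is seen.
        batch_statuses.entry(*status).or_default().insert(batch_uid);
    }
}

fn main() {
    let mut batch_statuses = HashMap::new();
    record_batch(&mut batch_statuses, 0, &[Status::Succeeded, Status::Failed]);
    record_batch(&mut batch_statuses, 1, &[Status::Enqueued]);
    assert_eq!(batch_statuses[&Status::Succeeded].len(), 1);
}
```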
+ pub fn register_dumped_task( + &mut self, + task: TaskDump, + content_file: Option>, + ) -> Result { + let task_has_no_docs = matches!(task.kind, KindDump::DocumentImport { documents_count, .. } if documents_count == 0); + + let content_uuid = match content_file { + Some(content_file) if task.status == Status::Enqueued => { + let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?; + let mut writer = io::BufWriter::new(file); + for doc in content_file { + let doc = doc?; + serde_json::to_writer(&mut writer, &doc).map_err(|e| { + Error::from_milli(milli::InternalError::SerdeJson(e).into(), None) + })?; + } + let file = writer.into_inner().map_err(|e| e.into_error())?; + file.persist()?; + + Some(uuid) + } + // If the task isn't `Enqueued` then just generate a recognisable `Uuid` + // in case we try to open it later. + _ if task.status != Status::Enqueued => Some(Uuid::nil()), + None if task.status == Status::Enqueued && task_has_no_docs => { + let (uuid, file) = self.index_scheduler.queue.create_update_file(false)?; + file.persist()?; + + Some(uuid) + } + _ => None, + }; + + let task = Task { + uid: task.uid, + batch_uid: task.batch_uid, + enqueued_at: task.enqueued_at, + started_at: task.started_at, + finished_at: task.finished_at, + error: task.error, + canceled_by: task.canceled_by, + details: task.details, + status: task.status, + kind: match task.kind { + KindDump::DocumentImport { + primary_key, + method, + documents_count, + allow_index_creation, + } => KindWithContent::DocumentAdditionOrUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + method, + content_file: content_uuid.ok_or(Error::CorruptedDump)?, + documents_count, + allow_index_creation, + }, + KindDump::DocumentDeletion { documents_ids } => KindWithContent::DocumentDeletion { + documents_ids, + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::DocumentDeletionByFilter { filter } => { + KindWithContent::DocumentDeletionByFilter { + filter_expr: filter, + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + } + } + KindDump::DocumentEdition { filter, context, function } => { + KindWithContent::DocumentEdition { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + filter_expr: filter, + context, + function, + } + } + KindDump::DocumentClear => KindWithContent::DocumentClear { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::Settings { settings, is_deletion, allow_index_creation } => { + KindWithContent::SettingsUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + new_settings: settings, + is_deletion, + allow_index_creation, + } + } + KindDump::IndexDeletion => KindWithContent::IndexDeletion { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + }, + KindDump::IndexCreation { primary_key } => KindWithContent::IndexCreation { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + }, + KindDump::IndexUpdate { primary_key } => KindWithContent::IndexUpdate { + index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, + primary_key, + }, + KindDump::IndexSwap { swaps } => KindWithContent::IndexSwap { swaps }, + KindDump::TaskCancelation { query, tasks } => { + KindWithContent::TaskCancelation { query, tasks } + } + KindDump::TasksDeletion { query, tasks } => { + KindWithContent::TaskDeletion { query, tasks } + } + KindDump::DumpCreation { keys, instance_uid } => { + KindWithContent::DumpCreation { keys, instance_uid } + } + KindDump::SnapshotCreation => 
KindWithContent::SnapshotCreation, + KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, + }, + }; + + self.index_scheduler.queue.tasks.all_tasks.put(&mut self.wtxn, &task.uid, &task)?; + if let Some(batch_id) = task.batch_uid { + self.batch_to_task_mapping.entry(batch_id).or_default().insert(task.uid); + } + + for index in task.indexes() { + match self.indexes.get_mut(index) { + Some(bitmap) => { + bitmap.insert(task.uid); + } + None => { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(task.uid); + self.indexes.insert(index.to_string(), bitmap); + } + }; + } + + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.tasks.enqueued_at, + task.enqueued_at, + task.uid, + )?; + + // we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change + if matches!(task.status, Status::Succeeded | Status::Failed | Status::Canceled) { + if let Some(started_at) = task.started_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.tasks.started_at, + started_at, + task.uid, + )?; + } + if let Some(finished_at) = task.finished_at { + utils::insert_task_datetime( + &mut self.wtxn, + self.index_scheduler.queue.tasks.finished_at, + finished_at, + task.uid, + )?; + } + } + + self.statuses.entry(task.status).or_default().insert(task.uid); + self.kinds.entry(task.kind.as_kind()).or_default().insert(task.uid); + + Ok(task) + } + + /// Commit all the changes and exit the importing dump state + pub fn finish(mut self) -> Result<()> { + for (batch_id, task_ids) in self.batch_to_task_mapping { + self.index_scheduler.queue.batch_to_tasks_mapping.put( + &mut self.wtxn, + &batch_id, + &task_ids, + )?; + } + + for (index, bitmap) in self.indexes { + self.index_scheduler.queue.tasks.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; + } + for (status, bitmap) in self.statuses { + self.index_scheduler.queue.tasks.put_status(&mut self.wtxn, status, &bitmap)?; + } + for (kind, bitmap) in self.kinds { + self.index_scheduler.queue.tasks.put_kind(&mut self.wtxn, kind, &bitmap)?; + } + + for (index, bitmap) in self.batch_indexes { + self.index_scheduler.queue.batches.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; + } + for (status, bitmap) in self.batch_statuses { + self.index_scheduler.queue.batches.put_status(&mut self.wtxn, status, &bitmap)?; + } + for (kind, bitmap) in self.batch_kinds { + self.index_scheduler.queue.batches.put_kind(&mut self.wtxn, kind, &bitmap)?; + } + + self.wtxn.commit()?; + self.index_scheduler.scheduler.wake_up.signal(); + + Ok(()) + } +} diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index f6a4ecc04..280127d04 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -104,11 +104,13 @@ pub enum Error { )] InvalidTaskCanceledBy { canceled_by: String }, #[error( - "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes." + "{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 400 bytes." )] InvalidIndexUid { index_uid: String }, #[error("Task `{0}` not found.")] TaskNotFound(TaskId), + #[error("Task `{0}` does not contain any documents. 
Only `documentAdditionOrUpdate` tasks with the statuses `enqueued` or `processing` contain documents")] + TaskFileNotFound(TaskId), #[error("Batch `{0}` not found.")] BatchNotFound(BatchId), #[error("Query parameters to filter the tasks to delete are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")] @@ -122,10 +124,13 @@ pub enum Error { Dump(#[from] dump::Error), #[error(transparent)] Heed(#[from] heed::Error), - #[error(transparent)] - Milli(#[from] milli::Error), - #[error("An unexpected crash occurred when processing the task.")] - ProcessBatchPanicked, + #[error("{}", match .index_uid { + Some(uid) if !uid.is_empty() => format!("Index `{}`: {error}", uid), + _ => format!("{error}") + })] + Milli { error: milli::Error, index_uid: Option }, + #[error("An unexpected crash occurred when processing the task: {0}")] + ProcessBatchPanicked(String), #[error(transparent)] FileStore(#[from] file_store::Error), #[error(transparent)] @@ -144,7 +149,9 @@ pub enum Error { #[error("Corrupted task queue.")] CorruptedTaskQueue, #[error(transparent)] - TaskDatabaseUpdate(Box), + DatabaseUpgrade(Box), + #[error(transparent)] + UnrecoverableError(Box), #[error(transparent)] HeedTransaction(heed::Error), @@ -184,14 +191,15 @@ impl Error { | Error::InvalidTaskCanceledBy { .. } | Error::InvalidIndexUid { .. } | Error::TaskNotFound(_) + | Error::TaskFileNotFound(_) | Error::BatchNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) - | Error::Milli(_) - | Error::ProcessBatchPanicked + | Error::Milli { .. } + | Error::ProcessBatchPanicked(_) | Error::FileStore(_) | Error::IoError(_) | Error::Persist(_) @@ -199,7 +207,8 @@ impl Error { | Error::Anyhow(_) => true, Error::CreateBatch(_) | Error::CorruptedTaskQueue - | Error::TaskDatabaseUpdate(_) + | Error::DatabaseUpgrade(_) + | Error::UnrecoverableError(_) | Error::HeedTransaction(_) => false, #[cfg(test)] Error::PlannedFailure => false, @@ -209,6 +218,20 @@ impl Error { pub fn with_custom_error_code(self, code: Code) -> Self { Self::WithCustomErrorCode(code, Box::new(self)) } + + pub fn from_milli(err: milli::Error, index_uid: Option) -> Self { + match err { + milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + milli::Error::UserError(milli::UserError::InvalidFilterExpression { .. }) => { + Self::Milli { error: err, index_uid } + .with_custom_error_code(Code::InvalidDocumentFilter) + } + _ => Self::Milli { error: err, index_uid }, + } + } } impl ErrorCode for Error { @@ -230,14 +253,15 @@ impl ErrorCode for Error { Error::InvalidTaskCanceledBy { .. } => Code::InvalidTaskCanceledBy, Error::InvalidIndexUid { .. } => Code::InvalidIndexUid, Error::TaskNotFound(_) => Code::TaskNotFound, + Error::TaskFileNotFound(_) => Code::TaskFileNotFound, Error::BatchNotFound(_) => Code::BatchNotFound, Error::TaskDeletionWithEmptyQuery => Code::MissingTaskFilters, Error::TaskCancelationWithEmptyQuery => Code::MissingTaskFilters, // TODO: not sure of the Code to use Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice, Error::Dump(e) => e.error_code(), - Error::Milli(e) => e.error_code(), - Error::ProcessBatchPanicked => Code::Internal, + Error::Milli { error, .. 
} => error.error_code(), + Error::ProcessBatchPanicked(_) => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), Error::FileStore(e) => e.error_code(), @@ -249,7 +273,8 @@ impl ErrorCode for Error { Error::Anyhow(_) => Code::Internal, Error::CorruptedTaskQueue => Code::Internal, Error::CorruptedDump => Code::Internal, - Error::TaskDatabaseUpdate(_) => Code::Internal, + Error::DatabaseUpgrade(_) => Code::Internal, + Error::UnrecoverableError(_) => Code::Internal, Error::CreateBatch(_) => Code::Internal, // This one should never be seen by the end user diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index f4ac80511..109e6b867 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -1,18 +1,29 @@ use std::sync::{Arc, RwLock}; -use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; +use meilisearch_types::features::{InstanceTogglableFeatures, Network, RuntimeTogglableFeatures}; use meilisearch_types::heed::types::{SerdeJson, Str}; -use meilisearch_types::heed::{Database, Env, RwTxn}; +use meilisearch_types::heed::{Database, Env, RwTxn, WithoutTls}; use crate::error::FeatureNotEnabledError; use crate::Result; -const EXPERIMENTAL_FEATURES: &str = "experimental-features"; +/// The number of database used by features +const NUMBER_OF_DATABASES: u32 = 1; +/// Database const names for the `FeatureData`. +mod db_name { + pub const EXPERIMENTAL_FEATURES: &str = "experimental-features"; +} + +mod db_keys { + pub const EXPERIMENTAL_FEATURES: &str = "experimental-features"; + pub const NETWORK: &str = "network"; +} #[derive(Clone)] pub(crate) struct FeatureData { persisted: Database>, runtime: Arc>, + network: Arc>, } #[derive(Debug, Clone, Copy)] @@ -56,19 +67,6 @@ impl RoFeatures { } } - pub fn check_vector(&self, disabled_action: &'static str) -> Result<()> { - if self.runtime.vector_store { - Ok(()) - } else { - Err(FeatureNotEnabledError { - disabled_action, - feature: "vector store", - issue_link: "https://github.com/meilisearch/product/discussions/677", - } - .into()) - } - } - pub fn check_edit_documents_by_function(&self, disabled_action: &'static str) -> Result<()> { if self.runtime.edit_documents_by_function { Ok(()) @@ -94,17 +92,62 @@ impl RoFeatures { .into()) } } + + pub fn check_network(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.network { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "network", + issue_link: "https://github.com/orgs/meilisearch/discussions/805", + } + .into()) + } + } + + pub fn check_get_task_documents_route(&self) -> Result<()> { + if self.runtime.get_task_documents_route { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action: "Getting the documents of an enqueued task", + feature: "get task documents route", + issue_link: "https://github.com/orgs/meilisearch/discussions/808", + } + .into()) + } + } + + pub fn check_composite_embedders(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.composite_embedders { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "composite embedders", + issue_link: "https://github.com/orgs/meilisearch/discussions/816", + } + .into()) + } + } } impl FeatureData { - pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result { - let mut wtxn = env.write_txn()?; - let runtime_features_db = env.create_database(&mut wtxn, 
Some(EXPERIMENTAL_FEATURES))?; - wtxn.commit()?; + pub(crate) const fn nb_db() -> u32 { + NUMBER_OF_DATABASES + } + + pub fn new( + env: &Env, + wtxn: &mut RwTxn, + instance_features: InstanceTogglableFeatures, + ) -> Result { + let runtime_features_db = + env.create_database(wtxn, Some(db_name::EXPERIMENTAL_FEATURES))?; - let txn = env.read_txn()?; let persisted_features: RuntimeTogglableFeatures = - runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default(); + runtime_features_db.get(wtxn, db_keys::EXPERIMENTAL_FEATURES)?.unwrap_or_default(); let InstanceTogglableFeatures { metrics, logs_route, contains_filter } = instance_features; let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures { metrics: metrics || persisted_features.metrics, @@ -113,7 +156,14 @@ impl FeatureData { ..persisted_features })); - Ok(Self { persisted: runtime_features_db, runtime }) + let network_db = runtime_features_db.remap_data_type::>(); + let network: Network = network_db.get(wtxn, db_keys::NETWORK)?.unwrap_or_default(); + + Ok(Self { + persisted: runtime_features_db, + runtime, + network: Arc::new(RwLock::new(network)), + }) } pub fn put_runtime_features( @@ -121,7 +171,7 @@ impl FeatureData { mut wtxn: RwTxn, features: RuntimeTogglableFeatures, ) -> Result<()> { - self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?; + self.persisted.put(&mut wtxn, db_keys::EXPERIMENTAL_FEATURES, &features)?; wtxn.commit()?; // safe to unwrap, the lock will only fail if: @@ -142,4 +192,21 @@ impl FeatureData { pub fn features(&self) -> RoFeatures { RoFeatures::new(self) } + + pub fn put_network(&self, mut wtxn: RwTxn, new_network: Network) -> Result<()> { + self.persisted.remap_data_type::>().put( + &mut wtxn, + db_keys::NETWORK, + &new_network, + )?; + wtxn.commit()?; + + let mut network = self.network.write().unwrap(); + *network = new_network; + Ok(()) + } + + pub fn network(&self) -> Network { + Network::clone(&*self.network.read().unwrap()) + } } diff --git a/crates/index-scheduler/src/index_mapper/index_map.rs b/crates/index-scheduler/src/index_mapper/index_map.rs index f8080d23b..acb95f249 100644 --- a/crates/index-scheduler/src/index_mapper/index_map.rs +++ b/crates/index-scheduler/src/index_mapper/index_map.rs @@ -1,16 +1,17 @@ use std::collections::BTreeMap; +use std::env::VarError; use std::path::Path; +use std::str::FromStr; use std::time::Duration; use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions}; -use meilisearch_types::milli::Index; +use meilisearch_types::milli::{Index, Result}; use time::OffsetDateTime; use uuid::Uuid; use super::IndexStatus::{self, Available, BeingDeleted, Closing, Missing}; +use crate::clamp_to_page_size; use crate::lru::{InsertionOutcome, LruMap}; -use crate::{clamp_to_page_size, Result}; - /// Keep an internally consistent view of the open indexes in memory. /// /// This view is made of an LRU cache that will evict the least frequently used indexes when new indexes are opened. 
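// A minimal standalone sketch (illustrative only, not part of this patch) of the
// eviction behaviour described in the comment above: a capacity-bounded map that
// drops an old entry (here, the least recently used one) when a new index is opened.
// The real implementation is the `LruMap` imported from `crate::lru` above, which
// additionally hands the evicted index back (see `InsertionOutcome::Evicted` below)
// so the caller can close its environment; the type here is a simplified stand-in.
use std::collections::VecDeque;

struct TinyLru<K: PartialEq, V> {
    cap: usize,
    // Front = most recently used, back = least recently used.
    entries: VecDeque<(K, V)>,
}

impl<K: PartialEq, V> TinyLru<K, V> {
    fn new(cap: usize) -> Self {
        Self { cap, entries: VecDeque::new() }
    }

    /// Insert a value, returning the evicted entry when the cache was already full.
    fn insert(&mut self, key: K, value: V) -> Option<(K, V)> {
        self.entries.push_front((key, value));
        if self.entries.len() > self.cap {
            self.entries.pop_back()
        } else {
            None
        }
    }

    /// Fetch a value and mark it as most recently used.
    fn get(&mut self, key: &K) -> Option<&V> {
        let pos = self.entries.iter().position(|(k, _)| k == key)?;
        let entry = self.entries.remove(pos).unwrap();
        self.entries.push_front(entry);
        self.entries.front().map(|(_, v)| v)
    }
}

fn main() {
    let mut open_indexes = TinyLru::new(2);
    open_indexes.insert("movies", "env for movies");
    open_indexes.insert("books", "env for books");
    open_indexes.get(&"movies"); // "movies" becomes the most recently used index
    // Opening a third index evicts the least recently used one: "books".
    let evicted = open_indexes.insert("songs", "env for songs");
    assert_eq!(evicted.map(|(name, _)| name), Some("books"));
}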
@@ -103,7 +104,7 @@ impl ReopenableIndex { return Ok(()); } map.unavailable.remove(&self.uuid); - map.create(&self.uuid, path, None, self.enable_mdb_writemap, self.map_size)?; + map.create(&self.uuid, path, None, self.enable_mdb_writemap, self.map_size, false)?; } Ok(()) } @@ -172,11 +173,12 @@ impl IndexMap { date: Option<(OffsetDateTime, OffsetDateTime)>, enable_mdb_writemap: bool, map_size: usize, + creation: bool, ) -> Result { if !matches!(self.get_unavailable(uuid), Missing) { panic!("Attempt to open an index that was unavailable"); } - let index = create_or_open_index(path, date, enable_mdb_writemap, map_size)?; + let index = create_or_open_index(path, date, enable_mdb_writemap, map_size, creation)?; match self.available.insert(*uuid, index.clone()) { InsertionOutcome::InsertedNew => (), InsertionOutcome::Evicted(evicted_uuid, evicted_index) => { @@ -300,18 +302,31 @@ fn create_or_open_index( date: Option<(OffsetDateTime, OffsetDateTime)>, enable_mdb_writemap: bool, map_size: usize, + creation: bool, ) -> Result { - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(clamp_to_page_size(map_size)); - options.max_readers(1024); + + // You can find more details about this experimental + // environment variable on the following GitHub discussion: + // + let max_readers = match std::env::var("MEILI_EXPERIMENTAL_INDEX_MAX_READERS") { + Ok(value) => u32::from_str(&value).unwrap(), + Err(VarError::NotPresent) => 1024, + Err(VarError::NotUnicode(value)) => panic!( + "Invalid unicode for the `MEILI_EXPERIMENTAL_INDEX_MAX_READERS` env var: {value:?}" + ), + }; + options.max_readers(max_readers); if enable_mdb_writemap { unsafe { options.flags(EnvFlags::WRITE_MAP) }; } if let Some((created, updated)) = date { - Ok(Index::new_with_creation_dates(options, path, created, updated)?) + Ok(Index::new_with_creation_dates(options, path, created, updated, creation)?) } else { - Ok(Index::new(options, path)?) + Ok(Index::new(options, path, creation)?) 
} } @@ -319,17 +334,17 @@ fn create_or_open_index( #[cfg(test)] mod tests { - use meilisearch_types::heed::Env; + use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::Index; use uuid::Uuid; use super::super::IndexMapper; - use crate::tests::IndexSchedulerHandle; + use crate::test_utils::IndexSchedulerHandle; use crate::utils::clamp_to_page_size; use crate::IndexScheduler; impl IndexMapper { - fn test() -> (Self, Env, IndexSchedulerHandle) { + fn test() -> (Self, Env, IndexSchedulerHandle) { let (index_scheduler, handle) = IndexScheduler::test(true, vec![]); (index_scheduler.index_mapper, index_scheduler.env, handle) } diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 3cccb5a69..c1f6ff472 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -4,7 +4,9 @@ use std::time::Duration; use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; -use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; +use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; +use meilisearch_types::milli; +use meilisearch_types::milli::database_stats::DatabaseStats; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -15,12 +17,17 @@ use uuid::Uuid; use self::index_map::IndexMap; use self::IndexStatus::{Available, BeingDeleted, Closing, Missing}; use crate::uuid_codec::UuidCodec; -use crate::{Error, Result}; +use crate::{Error, IndexBudget, IndexSchedulerOptions, Result}; mod index_map; -const INDEX_MAPPING: &str = "index-mapping"; -const INDEX_STATS: &str = "index-stats"; +/// The number of database used by index mapper +const NUMBER_OF_DATABASES: u32 = 2; +/// Database const names for the `IndexMapper`. +mod db_name { + pub const INDEX_MAPPING: &str = "index-mapping"; + pub const INDEX_STATS: &str = "index-stats"; +} /// Structure managing meilisearch's indexes. /// @@ -92,19 +99,32 @@ pub enum IndexStatus { /// The statistics that can be computed from an `Index` object. #[derive(Serialize, Deserialize, Debug)] pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, + /// Stats of the documents database. + #[serde(default)] + pub documents_database_stats: DatabaseStats, + + #[serde(default, skip_serializing)] + pub number_of_documents: Option, + /// Size taken up by the index' DB, in bytes. /// /// This includes the size taken by both the used and free pages of the DB, and as the free pages /// are not returned to the disk after a deletion, this number is typically larger than /// `used_database_size` that only includes the size of the used pages. pub database_size: u64, + /// Number of embeddings in the index. + /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embeddings: Option, + /// Number of embedded documents in the index. + /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embedded_documents: Option, /// Size taken by the used pages of the index' DB, in bytes. /// /// As the DB backend does not return to the disk the pages that are not currently used by the DB, /// this value is typically smaller than `database_size`. 
pub used_database_size: u64, + /// The primary key of the index + pub primary_key: Option, /// Association of every field name with the number of times it occurs in the documents. pub field_distribution: FieldDistribution, /// Creation date of the index. @@ -121,11 +141,16 @@ impl IndexStats { /// # Parameters /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. - pub fn new(index: &Index, rtxn: &RoTxn) -> Result { + pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { + let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { - number_of_documents: index.number_of_documents(rtxn)?, + number_of_embeddings: Some(arroy_stats.number_of_embeddings), + number_of_embedded_documents: Some(arroy_stats.documents.len()), + documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(), + number_of_documents: None, database_size: index.on_disk_size()?, used_database_size: index.used_size()?, + primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), field_distribution: index.field_distribution(rtxn)?, created_at: index.created_at(rtxn)?, updated_at: index.updated_at(rtxn)?, @@ -134,29 +159,25 @@ impl IndexStats { } impl IndexMapper { - pub fn new( - env: &Env, - base_path: PathBuf, - index_base_map_size: usize, - index_growth_amount: usize, - index_count: usize, - enable_mdb_writemap: bool, - indexer_config: IndexerConfig, - ) -> Result { - let mut wtxn = env.write_txn()?; - let index_mapping = env.create_database(&mut wtxn, Some(INDEX_MAPPING))?; - let index_stats = env.create_database(&mut wtxn, Some(INDEX_STATS))?; - wtxn.commit()?; + pub(crate) const fn nb_db() -> u32 { + NUMBER_OF_DATABASES + } + pub fn new( + env: &Env, + wtxn: &mut RwTxn, + options: &IndexSchedulerOptions, + budget: IndexBudget, + ) -> Result { Ok(Self { - index_map: Arc::new(RwLock::new(IndexMap::new(index_count))), - index_mapping, - index_stats, - base_path, - index_base_map_size, - index_growth_amount, - enable_mdb_writemap, - indexer_config: Arc::new(indexer_config), + index_map: Arc::new(RwLock::new(IndexMap::new(budget.index_count))), + index_mapping: env.create_database(wtxn, Some(db_name::INDEX_MAPPING))?, + index_stats: env.create_database(wtxn, Some(db_name::INDEX_STATS))?, + base_path: options.indexes_path.clone(), + index_base_map_size: budget.map_size, + index_growth_amount: options.index_growth_amount, + enable_mdb_writemap: options.enable_mdb_writemap, + indexer_config: options.indexer_config.clone(), currently_updating_index: Default::default(), }) } @@ -183,13 +204,24 @@ impl IndexMapper { // Error if the UUIDv4 somehow already exists in the map, since it should be fresh. // This is very unlikely to happen in practice. // TODO: it would be better to lazily create the index. But we need an Index::open function for milli. 
- let index = self.index_map.write().unwrap().create( - &uuid, - &index_path, - date, - self.enable_mdb_writemap, - self.index_base_map_size, - )?; + let index = self + .index_map + .write() + .unwrap() + .create( + &uuid, + &index_path, + date, + self.enable_mdb_writemap, + self.index_base_map_size, + true, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; + let index_rtxn = index.read_txn()?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(name.to_string())))?; + self.store_stats_of(&mut wtxn, name, &stats)?; + drop(index_rtxn); wtxn.commit()?; @@ -357,7 +389,9 @@ impl IndexMapper { }; let index_path = self.base_path.join(uuid.to_string()); // take the lock to reopen the environment. - reopen.reopen(&mut self.index_map.write().unwrap(), &index_path)?; + reopen + .reopen(&mut self.index_map.write().unwrap(), &index_path) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; continue; } BeingDeleted => return Err(Error::IndexNotFound(name.to_string())), @@ -372,13 +406,16 @@ impl IndexMapper { Missing => { let index_path = self.base_path.join(uuid.to_string()); - break index_map.create( - &uuid, - &index_path, - None, - self.enable_mdb_writemap, - self.index_base_map_size, - )?; + break index_map + .create( + &uuid, + &index_path, + None, + self.enable_mdb_writemap, + self.index_base_map_size, + false, + ) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?; } Available(index) => break index, Closing(_) => { @@ -460,6 +497,7 @@ impl IndexMapper { let index = self.index(rtxn, index_uid)?; let index_rtxn = index.read_txn()?; IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(uuid.to_string()))) } } } diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bcd5966b5..bcc295afd 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -1,15 +1,16 @@ use std::collections::BTreeSet; use std::fmt::Write; -use meilisearch_types::batches::Batch; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats}; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str}; use meilisearch_types::heed::{Database, RoTxn}; use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; -use meilisearch_types::tasks::{Details, Task}; +use meilisearch_types::tasks::{Details, Kind, Status, Task}; +use meilisearch_types::versioning; use roaring::RoaringBitmap; use crate::index_mapper::IndexMapper; -use crate::{IndexScheduler, Kind, Status, BEI128}; +use crate::{IndexScheduler, BEI128}; pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { // Since we'll snapshot the index right afterward, we don't need to ensure it's internally consistent for every run. 
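// A minimal standalone sketch (illustrative only, not part of this patch) of the
// error-wrapping pattern used throughout the `IndexMapper` changes above: milli
// errors are converted with `Error::from_milli(err, Some(index_uid))` so the index
// name is prepended to the message whenever it is known. `FakeMilliError` is a
// stand-in for `meilisearch_types::milli::Error`.
use std::fmt;

#[derive(Debug)]
struct FakeMilliError(String);

impl fmt::Display for FakeMilliError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

#[derive(Debug)]
enum Error {
    Milli { error: FakeMilliError, index_uid: Option<String> },
}

impl Error {
    fn from_milli(error: FakeMilliError, index_uid: Option<String>) -> Self {
        Error::Milli { error, index_uid }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Error::Milli { error, index_uid } => match index_uid {
                Some(uid) if !uid.is_empty() => write!(f, "Index `{uid}`: {error}"),
                _ => write!(f, "{error}"),
            },
        }
    }
}

fn main() {
    let err =
        Error::from_milli(FakeMilliError("invalid filter expression".into()), Some("movies".into()));
    assert_eq!(err.to_string(), "Index `movies`: invalid filter expression");
    println!("{err}");
}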
@@ -18,41 +19,15 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { scheduler.assert_internally_consistent(); let IndexScheduler { - autobatching_enabled, cleanup_enabled: _, - must_stop_processing: _, processing_tasks, - file_store, env, - all_tasks, - all_batches, - batch_to_tasks_mapping, - // task reverse index - status, - kind, - index_tasks, - canceled_by, - enqueued_at, - started_at, - finished_at, - - // batch reverse index - batch_status, - batch_kind, - batch_index_tasks, - batch_enqueued_at, - batch_started_at, - batch_finished_at, + version, + queue, + scheduler, index_mapper, features: _, - max_number_of_tasks: _, - max_number_of_batched_tasks: _, - wake_up: _, - dumps_path: _, - snapshots_path: _, - auth_path: _, - version_file_path: _, webhook_url: _, webhook_authorization_header: _, test_breakpoint_sdr: _, @@ -65,8 +40,18 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let mut snap = String::new(); + let indx_sched_version = version.get_version(&rtxn).unwrap(); + let latest_version = ( + versioning::VERSION_MAJOR.parse().unwrap(), + versioning::VERSION_MINOR.parse().unwrap(), + versioning::VERSION_PATCH.parse().unwrap(), + ); + if indx_sched_version != Some(latest_version) { + snap.push_str(&format!("index scheduler running on version {indx_sched_version:?}\n")); + } + let processing = processing_tasks.read().unwrap().clone(); - snap.push_str(&format!("### Autobatching Enabled = {autobatching_enabled}\n")); + snap.push_str(&format!("### Autobatching Enabled = {}\n", scheduler.autobatching_enabled)); snap.push_str(&format!( "### Processing batch {:?}:\n", processing.batch.as_ref().map(|batch| batch.uid) @@ -79,19 +64,19 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { snap.push_str("\n----------------------------------------------------------------------\n"); snap.push_str("### All Tasks:\n"); - snap.push_str(&snapshot_all_tasks(&rtxn, *all_tasks)); + snap.push_str(&snapshot_all_tasks(&rtxn, queue.tasks.all_tasks)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Status:\n"); - snap.push_str(&snapshot_status(&rtxn, *status)); + snap.push_str(&snapshot_status(&rtxn, queue.tasks.status)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Kind:\n"); - snap.push_str(&snapshot_kind(&rtxn, *kind)); + snap.push_str(&snapshot_kind(&rtxn, queue.tasks.kind)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Index Tasks:\n"); - snap.push_str(&snapshot_index_tasks(&rtxn, *index_tasks)); + snap.push_str(&snapshot_index_tasks(&rtxn, queue.tasks.index_tasks)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Index Mapper:\n"); @@ -99,55 +84,55 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { snap.push_str("\n----------------------------------------------------------------------\n"); snap.push_str("### Canceled By:\n"); - snap.push_str(&snapshot_canceled_by(&rtxn, *canceled_by)); + snap.push_str(&snapshot_canceled_by(&rtxn, queue.tasks.canceled_by)); snap.push_str("\n----------------------------------------------------------------------\n"); snap.push_str("### Enqueued At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *enqueued_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.tasks.enqueued_at)); 
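// A minimal standalone sketch (illustrative only, not part of this patch) of the
// version line added at the top of the snapshot above: the version stored by the
// scheduler is compared with the version compiled into the binary (parsed from the
// `versioning` string constants), and a line is only emitted when they differ.
// The literal versions below are made up for the example.
fn main() {
    // Stand-ins for versioning::VERSION_MAJOR / VERSION_MINOR / VERSION_PATCH.
    let (major, minor, patch) = ("1", "13", "0");
    let latest_version: (u32, u32, u32) =
        (major.parse().unwrap(), minor.parse().unwrap(), patch.parse().unwrap());

    // Stand-in for the value read from the scheduler's `version` database.
    let indx_sched_version: Option<(u32, u32, u32)> = Some((1, 12, 0));

    let mut snap = String::new();
    if indx_sched_version != Some(latest_version) {
        snap.push_str(&format!("index scheduler running on version {indx_sched_version:?}\n"));
    }
    print!("{snap}");
}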
snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Started At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *started_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.tasks.started_at)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Finished At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *finished_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.tasks.finished_at)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### All Batches:\n"); - snap.push_str(&snapshot_all_batches(&rtxn, *all_batches)); + snap.push_str(&snapshot_all_batches(&rtxn, queue.batches.all_batches)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batch to tasks mapping:\n"); - snap.push_str(&snapshot_batches_to_tasks_mappings(&rtxn, *batch_to_tasks_mapping)); + snap.push_str(&snapshot_batches_to_tasks_mappings(&rtxn, queue.batch_to_tasks_mapping)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Status:\n"); - snap.push_str(&snapshot_status(&rtxn, *batch_status)); + snap.push_str(&snapshot_status(&rtxn, queue.batches.status)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Kind:\n"); - snap.push_str(&snapshot_kind(&rtxn, *batch_kind)); + snap.push_str(&snapshot_kind(&rtxn, queue.batches.kind)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Index Tasks:\n"); - snap.push_str(&snapshot_index_tasks(&rtxn, *batch_index_tasks)); + snap.push_str(&snapshot_index_tasks(&rtxn, queue.batches.index_tasks)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Enqueued At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *batch_enqueued_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.batches.enqueued_at)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Started At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *batch_started_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.batches.started_at)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### Batches Finished At:\n"); - snap.push_str(&snapshot_date_db(&rtxn, *batch_finished_at)); + snap.push_str(&snapshot_date_db(&rtxn, queue.batches.finished_at)); snap.push_str("----------------------------------------------------------------------\n"); snap.push_str("### File Store:\n"); - snap.push_str(&snapshot_file_store(file_store)); + snap.push_str(&snapshot_file_store(&queue.file_store)); snap.push_str("\n----------------------------------------------------------------------\n"); snap @@ -306,6 +291,9 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } + Details::UpgradeDatabase { from, to } => { + format!("{{ from: {from:?}, to: {to:?} }}") + } } } @@ -353,14 +341,23 @@ pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database String { let mut snap = String::new(); - let Batch { uid, details, stats, started_at, finished_at } = batch; + let Batch { uid, details, stats, started_at, finished_at, progress: _, enqueued_at } = batch; + let stats = BatchStats { + progress_trace: 
Default::default(), + write_channel_congestion: None, + ..stats.clone() + }; if let Some(finished_at) = finished_at { assert!(finished_at > started_at); } + let BatchEnqueuedAt { earliest, oldest } = enqueued_at.unwrap(); + assert!(*started_at > earliest); + assert!(earliest >= oldest); + snap.push('{'); snap.push_str(&format!("uid: {uid}, ")); snap.push_str(&format!("details: {}, ", serde_json::to_string(details).unwrap())); - snap.push_str(&format!("stats: {}, ", serde_json::to_string(stats).unwrap())); + snap.push_str(&format!("stats: {}, ", serde_json::to_string(&stats).unwrap())); snap.push('}'); snap } @@ -373,7 +370,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { let stats = mapper.stats_of(rtxn, &name).unwrap(); s.push_str(&format!( "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", - stats.number_of_documents, stats.field_distribution + stats.documents_database_stats.number_of_entries(), + stats.field_distribution )); } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index cef24c1ea..feb08316c 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -1,7 +1,7 @@ /*! This crate defines the index scheduler, which is responsible for: 1. Keeping references to meilisearch's indexes and mapping them to their -user-defined names. + user-defined names. 2. Scheduling tasks given by the user and executing them, in batch if possible. When an `IndexScheduler` is created, a new thread containing a reference to the @@ -18,247 +18,62 @@ called asynchronously from any thread. These methods can either query the content of the scheduler or enqueue new tasks. */ -mod autobatcher; -mod batch; +mod dump; pub mod error; mod features; mod index_mapper; #[cfg(test)] mod insta_snapshot; mod lru; +mod processing; +mod queue; +mod scheduler; +#[cfg(test)] +mod test_utils; +pub mod upgrade; mod utils; pub mod uuid_codec; +pub mod versioning; -pub type Result = std::result::Result; +pub type Result = std::result::Result; pub type TaskId = u32; use std::collections::{BTreeMap, HashMap}; use std::io::{self, BufReader, Read}; -use std::ops::{Bound, RangeBounds}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::path::{Path, PathBuf}; -use std::sync::atomic::Ordering::{self, Relaxed}; -use std::sync::atomic::{AtomicBool, AtomicU32}; use std::sync::{Arc, RwLock}; use std::time::Duration; -use dump::{KindDump, TaskDump, UpdateFile}; +use dump::Dump; pub use error::Error; pub use features::RoFeatures; -use file_store::FileStore; use flate2::bufread::GzEncoder; use flate2::Compression; -use meilisearch_types::batches::{Batch, BatchId}; -use meilisearch_types::error::ResponseError; -use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures}; +use meilisearch_types::batches::Batch; +use meilisearch_types::features::{InstanceTogglableFeatures, Network, RuntimeTogglableFeatures}; use meilisearch_types::heed::byteorder::BE; -use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; -use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; -use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::heed::types::I128; +use meilisearch_types::heed::{self, Env, RoTxn, WithoutTls}; use meilisearch_types::milli::index::IndexEmbeddingConfig; -use meilisearch_types::milli::update::new::indexer::document_changes::Progress; use meilisearch_types::milli::update::IndexerConfig; use 
meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; -use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; +use meilisearch_types::milli::{self, Index}; use meilisearch_types::task_view::TaskView; -use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task, TaskProgress}; -use rayon::current_num_threads; -use rayon::prelude::{IntoParallelIterator, ParallelIterator}; +use meilisearch_types::tasks::{KindWithContent, Task}; +use processing::ProcessingTasks; +pub use queue::Query; +use queue::Queue; use roaring::RoaringBitmap; -use synchronoise::SignalEvent; -use time::format_description::well_known::Rfc3339; +use scheduler::Scheduler; use time::OffsetDateTime; -use utils::{filter_out_references_to_newer_tasks, keep_ids_within_datetimes, map_bound}; -use uuid::Uuid; +use versioning::Versioning; use crate::index_mapper::IndexMapper; -use crate::utils::{check_index_swap_validity, clamp_to_page_size, ProcessingBatch}; +use crate::utils::clamp_to_page_size; pub(crate) type BEI128 = I128; -/// Defines a subset of tasks to be retrieved from the [`IndexScheduler`]. -/// -/// An empty/default query (where each field is set to `None`) matches all tasks. -/// Each non-null field restricts the set of tasks further. -#[derive(Default, Debug, Clone, PartialEq, Eq)] -pub struct Query { - /// The maximum number of tasks to be matched - pub limit: Option, - /// The minimum [task id](`meilisearch_types::tasks::Task::uid`) to be matched - pub from: Option, - /// The order used to return the tasks. By default the newest tasks are returned first and the boolean is `false`. - pub reverse: Option, - /// The [task ids](`meilisearch_types::tasks::Task::uid`) to be matched - pub uids: Option>, - /// The [batch ids](`meilisearch_types::batches::Batch::uid`) to be matched - pub batch_uids: Option>, - /// The allowed [statuses](`meilisearch_types::tasks::Task::status`) of the matched tasls - pub statuses: Option>, - /// The allowed [kinds](meilisearch_types::tasks::Kind) of the matched tasks. - /// - /// The kind of a task is given by: - /// ``` - /// # use meilisearch_types::tasks::{Task, Kind}; - /// # fn doc_func(task: Task) -> Kind { - /// task.kind.as_kind() - /// # } - /// ``` - pub types: Option>, - /// The allowed [index ids](meilisearch_types::tasks::Task::index_uid) of the matched tasks - pub index_uids: Option>, - /// The [task ids](`meilisearch_types::tasks::Task::uid`) of the [`TaskCancelation`](meilisearch_types::tasks::Task::Kind::TaskCancelation) tasks - /// that canceled the matched tasks. - pub canceled_by: Option>, - /// Exclusive upper bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field. - pub before_enqueued_at: Option, - /// Exclusive lower bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field. - pub after_enqueued_at: Option, - /// Exclusive upper bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field. - pub before_started_at: Option, - /// Exclusive lower bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field. - pub after_started_at: Option, - /// Exclusive upper bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field. - pub before_finished_at: Option, - /// Exclusive lower bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field. 
- pub after_finished_at: Option, -} - -impl Query { - /// Return `true` if every field of the query is set to `None`, such that the query - /// matches all tasks. - pub fn is_empty(&self) -> bool { - matches!( - self, - Query { - limit: None, - from: None, - reverse: None, - uids: None, - batch_uids: None, - statuses: None, - types: None, - index_uids: None, - canceled_by: None, - before_enqueued_at: None, - after_enqueued_at: None, - before_started_at: None, - after_started_at: None, - before_finished_at: None, - after_finished_at: None, - } - ) - } - - /// Add an [index id](meilisearch_types::tasks::Task::index_uid) to the list of permitted indexes. - pub fn with_index(self, index_uid: String) -> Self { - let mut index_vec = self.index_uids.unwrap_or_default(); - index_vec.push(index_uid); - Self { index_uids: Some(index_vec), ..self } - } - - // Removes the `from` and `limit` restrictions from the query. - // Useful to get the total number of tasks matching a filter. - pub fn without_limits(self) -> Self { - Query { limit: None, from: None, ..self } - } -} - -#[derive(Debug, Clone)] -pub struct ProcessingTasks { - batch: Option, - /// The list of tasks ids that are currently running. - processing: RoaringBitmap, - /// The progress on processing tasks - progress: Option, -} - -impl ProcessingTasks { - /// Creates an empty `ProcessingAt` struct. - fn new() -> ProcessingTasks { - ProcessingTasks { batch: None, processing: RoaringBitmap::new(), progress: None } - } - - /// Stores the currently processing tasks, and the date time at which it started. - fn start_processing(&mut self, processing_batch: ProcessingBatch, processing: RoaringBitmap) { - self.batch = Some(processing_batch); - self.processing = processing; - } - - fn update_progress(&mut self, progress: Progress) -> TaskProgress { - self.progress.get_or_insert_with(TaskProgress::default).update(progress) - } - - /// Set the processing tasks to an empty list - fn stop_processing(&mut self) -> Self { - self.progress = None; - - Self { - batch: std::mem::take(&mut self.batch), - processing: std::mem::take(&mut self.processing), - progress: None, - } - } - - /// Returns `true` if there, at least, is one task that is currently processing that we must stop. - fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { - !self.processing.is_disjoint(canceled_tasks) - } -} - -#[derive(Default, Clone, Debug)] -struct MustStopProcessing(Arc); - -impl MustStopProcessing { - fn get(&self) -> bool { - self.0.load(Relaxed) - } - - fn must_stop(&self) { - self.0.store(true, Relaxed); - } - - fn reset(&self) { - self.0.store(false, Relaxed); - } -} - -/// Database const names for the `IndexScheduler`. 
-mod db_name { - pub const ALL_TASKS: &str = "all-tasks"; - pub const ALL_BATCHES: &str = "all-batches"; - pub const BATCH_TO_TASKS_MAPPING: &str = "batch-to-tasks-mapping"; - pub const STATUS: &str = "status"; - pub const KIND: &str = "kind"; - pub const INDEX_TASKS: &str = "index-tasks"; - pub const CANCELED_BY: &str = "canceled_by"; - pub const ENQUEUED_AT: &str = "enqueued-at"; - pub const STARTED_AT: &str = "started-at"; - pub const FINISHED_AT: &str = "finished-at"; - - pub const BATCH_STATUS: &str = "batch-status"; - pub const BATCH_KIND: &str = "batch-kind"; - pub const BATCH_INDEX_TASKS: &str = "batch-index-tasks"; - pub const BATCH_ENQUEUED_AT: &str = "batch-enqueued-at"; - pub const BATCH_STARTED_AT: &str = "batch-started-at"; - pub const BATCH_FINISHED_AT: &str = "batch-finished-at"; -} - -#[cfg(test)] -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Breakpoint { - // this state is only encountered while creating the scheduler in the test suite. - Init, - - Start, - BatchCreated, - BeforeProcessing, - AfterProcessing, - AbortedIndexation, - ProcessBatchSucceeded, - ProcessBatchFailed, - InsideProcessBatch, -} - #[derive(Debug)] pub struct IndexSchedulerOptions { /// The path to the version file of Meilisearch. @@ -290,7 +105,7 @@ pub struct IndexSchedulerOptions { /// The number of indexes that can be concurrently opened in memory. pub index_count: usize, /// Configuration used during indexing for each meilisearch index. - pub indexer_config: IndexerConfig, + pub indexer_config: Arc, /// Set to `true` iff the index scheduler is allowed to automatically /// batch tasks together, to process multiple tasks at once. pub autobatching_enabled: bool, @@ -303,102 +118,53 @@ pub struct IndexSchedulerOptions { /// If the autobatcher is allowed to automatically batch tasks /// it will only batch this defined number of tasks at once. pub max_number_of_batched_tasks: usize, + /// If the autobatcher is allowed to automatically batch tasks + /// it will only batch this defined maximum size (in bytes) of tasks at once. + pub batched_tasks_size_limit: u64, /// The experimental features enabled for this instance. pub instance_features: InstanceTogglableFeatures, + /// The experimental features enabled for this instance. + pub auto_upgrade: bool, + /// The maximal number of entries in the search query cache of an embedder. + /// + /// 0 disables the cache. + pub embedding_cache_cap: usize, } /// Structure which holds meilisearch's indexes and schedules the tasks /// to be performed on them. pub struct IndexScheduler { /// The LMDB environment which the DBs are associated with. - pub(crate) env: Env, - - /// A boolean that can be set to true to stop the currently processing tasks. - pub(crate) must_stop_processing: MustStopProcessing, + pub(crate) env: Env, /// The list of tasks currently processing pub(crate) processing_tasks: Arc>, - /// The list of files referenced by the tasks - pub(crate) file_store: FileStore, - - /// The main database, it contains all the tasks accessible by their Id. - pub(crate) all_tasks: Database>, - - /// Contains all the batches accessible by their Id. - pub(crate) all_batches: Database>, - - /// Matches a batch id with the associated task ids. - pub(crate) batch_to_tasks_mapping: Database, - - /// All the tasks ids grouped by their status. - // TODO we should not be able to serialize a `Status::Processing` in this database. - pub(crate) status: Database, RoaringBitmapCodec>, - /// All the tasks ids grouped by their kind. 
- pub(crate) kind: Database, RoaringBitmapCodec>, - /// Store the tasks associated to an index. - pub(crate) index_tasks: Database, - /// Store the tasks that were canceled by a task uid - pub(crate) canceled_by: Database, - /// Store the task ids of tasks which were enqueued at a specific date - pub(crate) enqueued_at: Database, - /// Store the task ids of finished tasks which started being processed at a specific date - pub(crate) started_at: Database, - /// Store the task ids of tasks which finished at a specific date - pub(crate) finished_at: Database, - - /// All the batches containing a task matching the selected status. - pub(crate) batch_status: Database, RoaringBitmapCodec>, - /// All the batches ids grouped by the kind of their task. - pub(crate) batch_kind: Database, RoaringBitmapCodec>, - /// Store the batches associated to an index. - pub(crate) batch_index_tasks: Database, - /// Store the batches containing tasks which were enqueued at a specific date - pub(crate) batch_enqueued_at: Database, - /// Store the batches containing finished tasks started at a specific date - pub(crate) batch_started_at: Database, - /// Store the batches containing tasks finished at a specific date - pub(crate) batch_finished_at: Database, - + /// A database containing only the version of the index-scheduler + pub version: versioning::Versioning, + /// The queue containing both the tasks and the batches. + pub queue: queue::Queue, /// In charge of creating, opening, storing and returning indexes. pub(crate) index_mapper: IndexMapper, - /// In charge of fetching and setting the status of experimental features. features: features::FeatureData, - /// Get a signal when a batch needs to be processed. - pub(crate) wake_up: Arc, - - /// Whether auto-batching is enabled or not. - pub(crate) autobatching_enabled: bool, + /// Everything related to the processing of the tasks + pub scheduler: scheduler::Scheduler, /// Whether we should automatically cleanup the task queue or not. pub(crate) cleanup_enabled: bool, - /// The max number of tasks allowed before the scheduler starts to delete - /// the finished tasks automatically. - pub(crate) max_number_of_tasks: usize, - - /// The maximum number of tasks that will be batched together. - pub(crate) max_number_of_batched_tasks: usize, - /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, /// The Authorization header to send to the webhook URL. pub(crate) webhook_authorization_header: Option, - /// The path used to create the dumps. - pub(crate) dumps_path: PathBuf, - - /// The path used to create the snapshots. - pub(crate) snapshots_path: PathBuf, - - /// The path to the folder containing the auth LMDB env. - pub(crate) auth_path: PathBuf, - - /// The path to the version file of Meilisearch. - pub(crate) version_file_path: PathBuf, - + /// A map to retrieve the runtime representation of an embedder depending on its configuration. + /// + /// This map may return the same embedder object for two different indexes or embedder settings, + /// but it will only do this if the embedder configuration options are the same, leading + /// to the same embeddings for the same input text. embedders: Arc>>>, // ================= test @@ -407,13 +173,13 @@ pub struct IndexScheduler { /// /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation. 
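// A minimal standalone sketch (illustrative only, not part of this patch) of the
// embedder cache documented above: runtime embedders are cached behind an `Arc` and
// keyed by their configuration, so two indexes with identical embedder settings
// share the same embedder and therefore produce the same embeddings for the same
// input text. `FakeOptions` and `FakeEmbedder` are stand-ins for the real milli types.
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

#[derive(Clone, PartialEq, Eq, Hash)]
struct FakeOptions {
    model: String,
    dimensions: usize,
}

struct FakeEmbedder {
    options: FakeOptions,
}

#[derive(Default)]
struct EmbedderCache {
    embedders: Arc<RwLock<HashMap<FakeOptions, Arc<FakeEmbedder>>>>,
}

impl EmbedderCache {
    /// Return the cached embedder for these options, building it on first use.
    fn get_or_create(&self, options: FakeOptions) -> Arc<FakeEmbedder> {
        if let Some(embedder) = self.embedders.read().unwrap().get(&options) {
            return embedder.clone();
        }
        let embedder = Arc::new(FakeEmbedder { options: options.clone() });
        self.embedders.write().unwrap().insert(options, embedder.clone());
        embedder
    }
}

fn main() {
    let cache = EmbedderCache::default();
    let options = FakeOptions { model: "small-embedding-model".into(), dimensions: 384 };
    let a = cache.get_or_create(options.clone());
    let b = cache.get_or_create(options);
    // Identical options -> the exact same embedder instance is shared.
    assert!(Arc::ptr_eq(&a, &b));
    println!("shared embedder with {} dimensions", a.options.dimensions);
}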
#[cfg(test)] - test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + test_breakpoint_sdr: crossbeam_channel::Sender<(test_utils::Breakpoint, bool)>, /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler. /// /// The first field is the iteration index and the second field identifies a location in the code. #[cfg(test)] - planned_failures: Vec<(usize, tests::FailureLocation)>, + planned_failures: Vec<(usize, test_utils::FailureLocation)>, /// A counter that is incremented before every call to [`tick`](IndexScheduler::tick) #[cfg(test)] @@ -424,40 +190,13 @@ impl IndexScheduler { fn private_clone(&self) -> IndexScheduler { IndexScheduler { env: self.env.clone(), - must_stop_processing: self.must_stop_processing.clone(), processing_tasks: self.processing_tasks.clone(), - file_store: self.file_store.clone(), - all_tasks: self.all_tasks, - all_batches: self.all_batches, - batch_to_tasks_mapping: self.batch_to_tasks_mapping, - - // Tasks reverse index - status: self.status, - kind: self.kind, - index_tasks: self.index_tasks, - canceled_by: self.canceled_by, - enqueued_at: self.enqueued_at, - started_at: self.started_at, - finished_at: self.finished_at, - - // Batches reverse index - batch_status: self.batch_status, - batch_kind: self.batch_kind, - batch_index_tasks: self.batch_index_tasks, - batch_enqueued_at: self.batch_enqueued_at, - batch_started_at: self.batch_started_at, - batch_finished_at: self.batch_finished_at, + version: self.version.clone(), + queue: self.queue.private_clone(), + scheduler: self.scheduler.private_clone(), index_mapper: self.index_mapper.clone(), - wake_up: self.wake_up.clone(), - autobatching_enabled: self.autobatching_enabled, cleanup_enabled: self.cleanup_enabled, - max_number_of_tasks: self.max_number_of_tasks, - max_number_of_batched_tasks: self.max_number_of_batched_tasks, - snapshots_path: self.snapshots_path.clone(), - dumps_path: self.dumps_path.clone(), - auth_path: self.auth_path.clone(), - version_file_path: self.version_file_path.clone(), webhook_url: self.webhook_url.clone(), webhook_authorization_header: self.webhook_authorization_header.clone(), embedders: self.embedders.clone(), @@ -470,14 +209,19 @@ impl IndexScheduler { features: self.features.clone(), } } -} -impl IndexScheduler { + pub(crate) const fn nb_db() -> u32 { + Versioning::nb_db() + Queue::nb_db() + IndexMapper::nb_db() + features::FeatureData::nb_db() + } + /// Create an index scheduler and start its run loop. 
+ #[allow(private_interfaces)] // because test_utils is private pub fn new( options: IndexSchedulerOptions, - #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, - #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, + auth_env: Env, + from_db_version: (u32, u32, u32), + #[cfg(test)] test_breakpoint_sdr: crossbeam_channel::Sender<(test_utils::Breakpoint, bool)>, + #[cfg(test)] planned_failures: Vec<(usize, test_utils::FailureLocation)>, ) -> Result { std::fs::create_dir_all(&options.tasks_path)?; std::fs::create_dir_all(&options.update_file_path)?; @@ -506,83 +250,33 @@ impl IndexScheduler { }; let env = unsafe { - heed::EnvOpenOptions::new() - .max_dbs(19) + let env_options = heed::EnvOpenOptions::new(); + let mut env_options = env_options.read_txn_without_tls(); + env_options + .max_dbs(Self::nb_db()) .map_size(budget.task_db_size) - .open(options.tasks_path) + .open(&options.tasks_path) }?; - let features = features::FeatureData::new(&env, options.instance_features)?; - - let file_store = FileStore::new(&options.update_file_path)?; + // We **must** starts by upgrading the version because it'll also upgrade the required database before we can open them + let version = versioning::Versioning::new(&env, from_db_version)?; let mut wtxn = env.write_txn()?; - let all_tasks = env.create_database(&mut wtxn, Some(db_name::ALL_TASKS))?; - let all_batches = env.create_database(&mut wtxn, Some(db_name::ALL_BATCHES))?; - let batch_to_tasks_mapping = - env.create_database(&mut wtxn, Some(db_name::BATCH_TO_TASKS_MAPPING))?; - - let status = env.create_database(&mut wtxn, Some(db_name::STATUS))?; - let kind = env.create_database(&mut wtxn, Some(db_name::KIND))?; - let index_tasks = env.create_database(&mut wtxn, Some(db_name::INDEX_TASKS))?; - let canceled_by = env.create_database(&mut wtxn, Some(db_name::CANCELED_BY))?; - let enqueued_at = env.create_database(&mut wtxn, Some(db_name::ENQUEUED_AT))?; - let started_at = env.create_database(&mut wtxn, Some(db_name::STARTED_AT))?; - let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?; - - let batch_status = env.create_database(&mut wtxn, Some(db_name::BATCH_STATUS))?; - let batch_kind = env.create_database(&mut wtxn, Some(db_name::BATCH_KIND))?; - let batch_index_tasks = env.create_database(&mut wtxn, Some(db_name::BATCH_INDEX_TASKS))?; - let batch_enqueued_at = env.create_database(&mut wtxn, Some(db_name::BATCH_ENQUEUED_AT))?; - let batch_started_at = env.create_database(&mut wtxn, Some(db_name::BATCH_STARTED_AT))?; - let batch_finished_at = env.create_database(&mut wtxn, Some(db_name::BATCH_FINISHED_AT))?; + let features = features::FeatureData::new(&env, &mut wtxn, options.instance_features)?; + let queue = Queue::new(&env, &mut wtxn, &options)?; + let index_mapper = IndexMapper::new(&env, &mut wtxn, &options, budget)?; wtxn.commit()?; // allow unreachable_code to get rids of the warning in the case of a test build. 
let this = Self { - must_stop_processing: MustStopProcessing::default(), processing_tasks: Arc::new(RwLock::new(ProcessingTasks::new())), - file_store, - all_tasks, - all_batches, - batch_to_tasks_mapping, - // Task reverse indexes - status, - kind, - index_tasks, - canceled_by, - enqueued_at, - started_at, - finished_at, + version, + queue, + scheduler: Scheduler::new(&options, auth_env), - // Batch reverse indexes - batch_status, - batch_kind, - batch_index_tasks, - batch_enqueued_at, - batch_started_at, - batch_finished_at, - - index_mapper: IndexMapper::new( - &env, - options.indexes_path, - budget.map_size, - options.index_growth_amount, - budget.index_count, - options.enable_mdb_writemap, - options.indexer_config, - )?, + index_mapper, env, - // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things - wake_up: Arc::new(SignalEvent::auto(true)), - autobatching_enabled: options.autobatching_enabled, cleanup_enabled: options.cleanup_enabled, - max_number_of_tasks: options.max_number_of_tasks, - max_number_of_batched_tasks: options.max_number_of_batched_tasks, - dumps_path: options.dumps_path, - snapshots_path: options.snapshots_path, - auth_path: options.auth_path, - version_file_path: options.version_file_path, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), @@ -603,7 +297,7 @@ impl IndexScheduler { /// Return `Ok(())` if the index scheduler is able to access one of its database. pub fn health(&self) -> Result<()> { let rtxn = self.env.read_txn()?; - self.all_tasks.first(&rtxn)?; + self.queue.batch_to_tasks_mapping.first(&rtxn)?; Ok(()) } @@ -676,7 +370,7 @@ impl IndexScheduler { } } - pub fn read_txn(&self) -> Result { + pub fn read_txn(&self) -> Result> { self.env.read_txn().map_err(|e| e.into()) } @@ -690,15 +384,16 @@ impl IndexScheduler { .name(String::from("scheduler")) .spawn(move || { #[cfg(test)] - run.breakpoint(Breakpoint::Init); + run.breakpoint(test_utils::Breakpoint::Init); - run.wake_up.wait_timeout(std::time::Duration::from_secs(60)); + run.scheduler.wake_up.wait_timeout(std::time::Duration::from_secs(60)); loop { let ret = catch_unwind(AssertUnwindSafe(|| run.tick())); match ret { Ok(Ok(TickOutcome::TickAgain(_))) => (), - Ok(Ok(TickOutcome::WaitForSignal)) => run.wake_up.wait(), + Ok(Ok(TickOutcome::WaitForSignal)) => run.scheduler.wake_up.wait(), + Ok(Ok(TickOutcome::StopProcessingForever)) => break, Ok(Err(e)) => { tracing::error!("{e}"); // Wait one second when an irrecoverable error occurs. @@ -747,11 +442,13 @@ impl IndexScheduler { let rtxn = self.env.read_txn()?; self.index_mapper.index(&rtxn, name) } + /// Return the boolean referring if index exists. pub fn index_exists(&self, name: &str) -> Result { let rtxn = self.env.read_txn()?; self.index_mapper.index_exists(&rtxn, name) } + /// Return the name of all indexes without opening them. pub fn index_names(&self) -> Result> { let rtxn = self.env.read_txn()?; @@ -776,389 +473,47 @@ impl IndexScheduler { self.index_mapper.try_for_each_index(&rtxn, f) } - /// Return the task ids matched by the given query from the index scheduler's point of view. 
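The run loop now reaches its synchronization primitives through `self.scheduler` (`wake_up`, `must_stop_processing`, ...) and gains a third outcome, `StopProcessingForever`, which exits the loop for good instead of parking on the wake-up signal. A simplified sketch of the control flow, with the `catch_unwind` wrapper trimmed; this is the shape of the loop, not the literal code:

    loop {
        match index_scheduler.tick() {
            // Work was done: immediately try to build the next batch.
            Ok(TickOutcome::TickAgain(_)) => (),
            // The queue is drained: park until a new task is registered.
            Ok(TickOutcome::WaitForSignal) => index_scheduler.scheduler.wake_up.wait(),
            // The scheduler was asked to shut down: leave the run loop forever.
            Ok(TickOutcome::StopProcessingForever) => break,
            // On an irrecoverable error, wait a second before retrying.
            Err(e) => {
                tracing::error!("{e}");
                std::thread::sleep(std::time::Duration::from_secs(1));
            }
        }
    }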
- pub(crate) fn get_task_ids(&self, rtxn: &RoTxn, query: &Query) -> Result { - let ProcessingTasks { batch: processing_batch, processing: processing_tasks, progress: _ } = - self.processing_tasks.read().unwrap().clone(); - let Query { - limit, - from, - reverse, - uids, - batch_uids, - statuses, - types, - index_uids, - canceled_by, - before_enqueued_at, - after_enqueued_at, - before_started_at, - after_started_at, - before_finished_at, - after_finished_at, - } = query; - - let mut tasks = self.all_task_ids(rtxn)?; - - if let Some(from) = from { - let range = if reverse.unwrap_or_default() { - u32::MIN..*from - } else { - from.saturating_add(1)..u32::MAX - }; - tasks.remove_range(range); - } - - if let Some(batch_uids) = batch_uids { - let mut batch_tasks = RoaringBitmap::new(); - for batch_uid in batch_uids { - if processing_batch.as_ref().map_or(false, |batch| batch.uid == *batch_uid) { - batch_tasks |= &processing_tasks; - } else { - batch_tasks |= self.tasks_in_batch(rtxn, *batch_uid)?; - } - } - tasks &= batch_tasks; - } - - if let Some(status) = statuses { - let mut status_tasks = RoaringBitmap::new(); - for status in status { - match status { - // special case for Processing tasks - Status::Processing => { - status_tasks |= &processing_tasks; - } - status => status_tasks |= &self.get_status(rtxn, *status)?, - }; - } - if !status.contains(&Status::Processing) { - tasks -= &processing_tasks; - } - tasks &= status_tasks; - } - - if let Some(uids) = uids { - let uids = RoaringBitmap::from_iter(uids); - tasks &= &uids; - } - - if let Some(canceled_by) = canceled_by { - let mut all_canceled_tasks = RoaringBitmap::new(); - for cancel_task_uid in canceled_by { - if let Some(canceled_by_uid) = self.canceled_by.get(rtxn, cancel_task_uid)? { - all_canceled_tasks |= canceled_by_uid; - } - } - - // if the canceled_by has been specified but no task - // matches then we prefer matching zero than all tasks. - if all_canceled_tasks.is_empty() { - return Ok(RoaringBitmap::new()); - } else { - tasks &= all_canceled_tasks; - } - } - - if let Some(kind) = types { - let mut kind_tasks = RoaringBitmap::new(); - for kind in kind { - kind_tasks |= self.get_kind(rtxn, *kind)?; - } - tasks &= &kind_tasks; - } - - if let Some(index) = index_uids { - let mut index_tasks = RoaringBitmap::new(); - for index in index { - index_tasks |= self.index_tasks(rtxn, index)?; - } - tasks &= &index_tasks; - } - - // For the started_at filter, we need to treat the part of the tasks that are processing from the part of the - // tasks that are not processing. The non-processing ones are filtered normally while the processing ones - // are entirely removed unless the in-memory startedAt variable falls within the date filter. - // Once we have filtered the two subsets, we put them back together and assign it back to `tasks`. 
- tasks = { - let (mut filtered_non_processing_tasks, mut filtered_processing_tasks) = - (&tasks - &processing_tasks, &tasks & &processing_tasks); - - // special case for Processing tasks - // A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds - let clear_filtered_processing_tasks = - |start: Bound, end: Bound| { - let start = map_bound(start, |b| b.unix_timestamp_nanos()); - let end = map_bound(end, |b| b.unix_timestamp_nanos()); - let is_within_dates = RangeBounds::contains( - &(start, end), - &processing_batch - .map_or_else(OffsetDateTime::now_utc, |batch| batch.started_at) - .unix_timestamp_nanos(), - ); - if !is_within_dates { - filtered_processing_tasks.clear(); - } - }; - match (after_started_at, before_started_at) { - (None, None) => (), - (None, Some(before)) => { - clear_filtered_processing_tasks(Bound::Unbounded, Bound::Excluded(*before)) - } - (Some(after), None) => { - clear_filtered_processing_tasks(Bound::Excluded(*after), Bound::Unbounded) - } - (Some(after), Some(before)) => clear_filtered_processing_tasks( - Bound::Excluded(*after), - Bound::Excluded(*before), - ), - }; - - keep_ids_within_datetimes( - rtxn, - &mut filtered_non_processing_tasks, - self.started_at, - *after_started_at, - *before_started_at, - )?; - filtered_non_processing_tasks | filtered_processing_tasks - }; - - keep_ids_within_datetimes( - rtxn, - &mut tasks, - self.enqueued_at, - *after_enqueued_at, - *before_enqueued_at, - )?; - - keep_ids_within_datetimes( - rtxn, - &mut tasks, - self.finished_at, - *after_finished_at, - *before_finished_at, - )?; - - if let Some(limit) = limit { - tasks = if query.reverse.unwrap_or_default() { - tasks.into_iter().take(*limit as usize).collect() - } else { - tasks.into_iter().rev().take(*limit as usize).collect() - }; - } - - Ok(tasks) - } - - /// Return the batch ids matched by the given query from the index scheduler's point of view. - pub(crate) fn get_batch_ids( + /// Returns the total number of indexes available for the specified filter. + /// And a `Vec` of the index_uid + its stats + pub fn get_paginated_indexes_stats( &self, - rtxn: &RoTxn, - processing: &ProcessingTasks, - query: &Query, - ) -> Result { - let Query { - limit, - from, - reverse, - uids, - batch_uids, - statuses, - types, - index_uids, - canceled_by, - before_enqueued_at, - after_enqueued_at, - before_started_at, - after_started_at, - before_finished_at, - after_finished_at, - } = query; + filters: &meilisearch_auth::AuthFilter, + from: usize, + limit: usize, + ) -> Result<(usize, Vec<(String, index_mapper::IndexStats)>)> { + let rtxn = self.read_txn()?; - let mut batches = self.all_batch_ids(rtxn)?; - if let Some(batch_id) = processing.batch.as_ref().map(|batch| batch.uid) { - batches.insert(batch_id); - } + let mut total = 0; + let mut iter = self + .index_mapper + .index_mapping + .iter(&rtxn)? 
+ // in case of an error we want to keep the value to return it + .filter(|ret| { + ret.as_ref().map_or(true, |(name, _uuid)| filters.is_index_authorized(name)) + }) + .inspect(|_| total += 1) + .skip(from); + let ret = iter + .by_ref() + .take(limit) + .map(|ret| ret.map_err(Error::from)) + .map(|ret| { + ret.and_then(|(name, uuid)| { + self.index_mapper.index_stats.get(&rtxn, &uuid).map_err(Error::from).and_then( + |stat| { + stat.map(|stat| (name.to_string(), stat)) + .ok_or(Error::CorruptedTaskQueue) + }, + ) + }) + }) + .collect::>>(); - if let Some(from) = from { - let range = if reverse.unwrap_or_default() { - u32::MIN..*from - } else { - from.saturating_add(1)..u32::MAX - }; - batches.remove_range(range); - } + // We must iterate on the rest of the indexes to compute the total + iter.for_each(drop); - if let Some(batch_uids) = &batch_uids { - let batches_uids = RoaringBitmap::from_iter(batch_uids); - batches &= batches_uids; - } - - if let Some(status) = &statuses { - let mut status_batches = RoaringBitmap::new(); - for status in status { - match status { - // special case for Processing batches - Status::Processing => { - if let Some(batch_id) = processing.batch.as_ref().map(|batch| batch.uid) { - status_batches.insert(batch_id); - } - } - // Enqueued tasks are not stored in batches - Status::Enqueued => (), - status => status_batches |= &self.get_batch_status(rtxn, *status)?, - }; - } - if !status.contains(&Status::Processing) { - if let Some(ref batch) = processing.batch { - batches.remove(batch.uid); - } - } - batches &= status_batches; - } - - if let Some(task_uids) = &uids { - let mut batches_by_task_uids = RoaringBitmap::new(); - for task_uid in task_uids { - if let Some(task) = self.get_task(rtxn, *task_uid)? { - if let Some(batch_uid) = task.batch_uid { - batches_by_task_uids.insert(batch_uid); - } - } - } - batches &= batches_by_task_uids; - } - - // There is no database for this query, we must retrieve the task queried by the client and ensure it's valid - if let Some(canceled_by) = &canceled_by { - let mut all_canceled_batches = RoaringBitmap::new(); - for cancel_uid in canceled_by { - if let Some(task) = self.get_task(rtxn, *cancel_uid)? { - if task.kind.as_kind() == Kind::TaskCancelation - && task.status == Status::Succeeded - { - if let Some(batch_uid) = task.batch_uid { - all_canceled_batches.insert(batch_uid); - } - } - } - } - - // if the canceled_by has been specified but no batch - // matches then we prefer matching zero than all batches. - if all_canceled_batches.is_empty() { - return Ok(RoaringBitmap::new()); - } else { - batches &= all_canceled_batches; - } - } - - if let Some(kind) = &types { - let mut kind_batches = RoaringBitmap::new(); - for kind in kind { - kind_batches |= self.get_batch_kind(rtxn, *kind)?; - if let Some(uid) = processing - .batch - .as_ref() - .and_then(|batch| batch.kinds.contains(kind).then_some(batch.uid)) - { - kind_batches.insert(uid); - } - } - batches &= &kind_batches; - } - - if let Some(index) = &index_uids { - let mut index_batches = RoaringBitmap::new(); - for index in index { - index_batches |= self.index_batches(rtxn, index)?; - if let Some(uid) = processing - .batch - .as_ref() - .and_then(|batch| batch.indexes.contains(index).then_some(batch.uid)) - { - index_batches.insert(uid); - } - } - batches &= &index_batches; - } - - // For the started_at filter, we need to treat the part of the batches that are processing from the part of the - // batches that are not processing. 
The non-processing ones are filtered normally while the processing ones - // are entirely removed unless the in-memory startedAt variable falls within the date filter. - // Once we have filtered the two subsets, we put them back together and assign it back to `batches`. - batches = { - let (mut filtered_non_processing_batches, mut filtered_processing_batches) = - (&batches - &processing.processing, &batches & &processing.processing); - - // special case for Processing batches - // A closure that clears the filtered_processing_batches if their started_at date falls outside the given bounds - let mut clear_filtered_processing_batches = - |start: Bound, end: Bound| { - let start = map_bound(start, |b| b.unix_timestamp_nanos()); - let end = map_bound(end, |b| b.unix_timestamp_nanos()); - let is_within_dates = RangeBounds::contains( - &(start, end), - &processing - .batch - .as_ref() - .map_or_else(OffsetDateTime::now_utc, |batch| batch.started_at) - .unix_timestamp_nanos(), - ); - if !is_within_dates { - filtered_processing_batches.clear(); - } - }; - match (after_started_at, before_started_at) { - (None, None) => (), - (None, Some(before)) => { - clear_filtered_processing_batches(Bound::Unbounded, Bound::Excluded(*before)) - } - (Some(after), None) => { - clear_filtered_processing_batches(Bound::Excluded(*after), Bound::Unbounded) - } - (Some(after), Some(before)) => clear_filtered_processing_batches( - Bound::Excluded(*after), - Bound::Excluded(*before), - ), - }; - - keep_ids_within_datetimes( - rtxn, - &mut filtered_non_processing_batches, - self.batch_started_at, - *after_started_at, - *before_started_at, - )?; - filtered_non_processing_batches | filtered_processing_batches - }; - - keep_ids_within_datetimes( - rtxn, - &mut batches, - self.batch_enqueued_at, - *after_enqueued_at, - *before_enqueued_at, - )?; - - keep_ids_within_datetimes( - rtxn, - &mut batches, - self.batch_finished_at, - *after_finished_at, - *before_finished_at, - )?; - - if let Some(limit) = limit { - batches = if query.reverse.unwrap_or_default() { - batches.into_iter().take(*limit as usize).collect() - } else { - batches.into_iter().rev().take(*limit as usize).collect() - }; - } - - Ok(batches) + ret.map(|ret| (total, ret)) } /// The returned structure contains: @@ -1167,39 +522,7 @@ impl IndexScheduler { /// 3. The number of times the properties appeared. pub fn get_stats(&self) -> Result>> { let rtxn = self.read_txn()?; - - let mut res = BTreeMap::new(); - - let processing_tasks = { self.processing_tasks.read().unwrap().processing.len() }; - - res.insert( - "statuses".to_string(), - enum_iterator::all::() - .map(|s| { - let tasks = self.get_status(&rtxn, s)?.len(); - match s { - Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)), - Status::Processing => Ok((s.to_string(), processing_tasks)), - s => Ok((s.to_string(), tasks)), - } - }) - .collect::>>()?, - ); - res.insert( - "types".to_string(), - enum_iterator::all::() - .map(|s| Ok((s.to_string(), self.get_kind(&rtxn, s)?.len()))) - .collect::>>()?, - ); - res.insert( - "indexes".to_string(), - self.index_tasks - .iter(&rtxn)? - .map(|res| Ok(res.map(|(name, bitmap)| (name.to_string(), bitmap.len()))?)) - .collect::>>()?, - ); - - Ok(res) + self.queue.get_stats(&rtxn, &self.processing_tasks.read().unwrap()) } // Return true if there is at least one task that is processing. 
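`get_paginated_indexes_stats` folds authorization and pagination into a single pass over the index mapping: `inspect` counts every authorized index, `skip(from)`/`take(limit)` carve out the requested page, and the leftover iterator is drained with `for_each(drop)` so `total` also covers the indexes that come after the page. A hypothetical call site, assuming an `index_scheduler: &IndexScheduler` in scope and the default `AuthFilter` (which authorizes every index); the `Debug` print is only illustrative:

    // Second page of 20 indexes visible to this API key.
    let filters = meilisearch_auth::AuthFilter::default();
    let (total, page) = index_scheduler.get_paginated_indexes_stats(&filters, 20, 20)?;
    println!("indexes {}..{} of {total}", 20, 20 + page.len());
    for (uid, stats) in page {
        println!("{uid}: {stats:?}");
    }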
@@ -1212,131 +535,11 @@ impl IndexScheduler { pub fn is_index_processing(&self, index: &str) -> Result { let rtxn = self.env.read_txn()?; let processing_tasks = self.processing_tasks.read().unwrap().processing.clone(); - let index_tasks = self.index_tasks(&rtxn, index)?; + let index_tasks = self.queue.tasks.index_tasks(&rtxn, index)?; let nbr_index_processing_tasks = processing_tasks.intersection_len(&index_tasks); Ok(nbr_index_processing_tasks > 0) } - /// Return the task ids matching the query along with the total number of tasks - /// by ignoring the from and limit parameters from the user's point of view. - /// - /// There are two differences between an internal query and a query executed by - /// the user. - /// - /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated - /// with many indexes internally. - /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. - pub fn get_task_ids_from_authorized_indexes( - &self, - rtxn: &RoTxn, - query: &Query, - filters: &meilisearch_auth::AuthFilter, - ) -> Result<(RoaringBitmap, u64)> { - // compute all tasks matching the filter by ignoring the limits, to find the number of tasks matching - // the filter. - // As this causes us to compute the filter twice it is slightly inefficient, but doing it this way spares - // us from modifying the underlying implementation, and the performance remains sufficient. - // Should this change, we would modify `get_task_ids` to directly return the number of matching tasks. - let total_tasks = self.get_task_ids(rtxn, &query.clone().without_limits())?; - let mut tasks = self.get_task_ids(rtxn, query)?; - - // If the query contains a list of index uid or there is a finite list of authorized indexes, - // then we must exclude all the kinds that aren't associated to one and only one index. - if query.index_uids.is_some() || !filters.all_indexes_authorized() { - for kind in enum_iterator::all::().filter(|kind| !kind.related_to_one_index()) { - tasks -= self.get_kind(rtxn, kind)?; - } - } - - // Any task that is internally associated with a non-authorized index - // must be discarded. - if !filters.all_indexes_authorized() { - let all_indexes_iter = self.index_tasks.iter(rtxn)?; - for result in all_indexes_iter { - let (index, index_tasks) = result?; - if !filters.is_index_authorized(index) { - tasks -= index_tasks; - } - } - } - - Ok((tasks, total_tasks.len())) - } - - /// Return the batch ids matching the query along with the total number of batches - /// by ignoring the from and limit parameters from the user's point of view. - /// - /// There are two differences between an internal query and a query executed by - /// the user. - /// - /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated - /// with many indexes internally. - /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. - fn get_batch_ids_from_authorized_indexes( - &self, - rtxn: &RoTxn, - processing: &ProcessingTasks, - query: &Query, - filters: &meilisearch_auth::AuthFilter, - ) -> Result<(RoaringBitmap, u64)> { - // compute all batches matching the filter by ignoring the limits, to find the number of batches matching - // the filter. - // As this causes us to compute the filter twice it is slightly inefficient, but doing it this way spares - // us from modifying the underlying implementation, and the performance remains sufficient. 
- // Should this change, we would modify `get_batch_ids` to directly return the number of matching batches. - let total_batches = - self.get_batch_ids(rtxn, processing, &query.clone().without_limits())?; - let mut batches = self.get_batch_ids(rtxn, processing, query)?; - - // If the query contains a list of index uid or there is a finite list of authorized indexes, - // then we must exclude all the batches that only contains tasks associated to multiple indexes. - // This works because we don't autobatch tasks associated to multiple indexes with tasks associated - // to a single index. e.g: IndexSwap cannot be batched with IndexCreation. - if query.index_uids.is_some() || !filters.all_indexes_authorized() { - for kind in enum_iterator::all::().filter(|kind| !kind.related_to_one_index()) { - batches -= self.get_kind(rtxn, kind)?; - if let Some(batch) = processing.batch.as_ref() { - if batch.kinds.contains(&kind) { - batches.remove(batch.uid); - } - } - } - } - - // Any batch that is internally associated with at least one authorized index - // must be returned. - if !filters.all_indexes_authorized() { - let mut valid_indexes = RoaringBitmap::new(); - let mut forbidden_indexes = RoaringBitmap::new(); - - let all_indexes_iter = self.batch_index_tasks.iter(rtxn)?; - for result in all_indexes_iter { - let (index, index_tasks) = result?; - if filters.is_index_authorized(index) { - valid_indexes |= index_tasks; - } else { - forbidden_indexes |= index_tasks; - } - } - if let Some(batch) = processing.batch.as_ref() { - for index in &batch.indexes { - if filters.is_index_authorized(index) { - valid_indexes.insert(batch.uid); - } else { - forbidden_indexes.insert(batch.uid); - } - } - } - - // If a batch had ONE valid task then it should be returned - let invalid_batches = forbidden_indexes - valid_indexes; - - batches -= invalid_batches; - } - - Ok((batches, total_batches.len())) - } - /// Return the tasks matching the query from the user's point of view along /// with the total number of tasks matching the query, ignoring from and limit. /// @@ -1344,53 +547,35 @@ impl IndexScheduler { /// the user. /// /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated - /// with many indexes internally. + /// with many indexes internally. /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. 
pub fn get_tasks_from_authorized_indexes( &self, - query: Query, + query: &Query, filters: &meilisearch_auth::AuthFilter, ) -> Result<(Vec, u64)> { - let rtxn = self.env.read_txn()?; + let rtxn = self.read_txn()?; + let processing = self.processing_tasks.read().unwrap(); + self.queue.get_tasks_from_authorized_indexes(&rtxn, query, filters, &processing) + } - let (tasks, total) = self.get_task_ids_from_authorized_indexes(&rtxn, &query, filters)?; - let tasks = if query.reverse.unwrap_or_default() { - Box::new(tasks.into_iter()) as Box> - } else { - Box::new(tasks.into_iter().rev()) as Box> - }; - let tasks = - self.get_existing_tasks(&rtxn, tasks.take(query.limit.unwrap_or(u32::MAX) as usize))?; - - let ProcessingTasks { batch, processing, progress } = - self.processing_tasks.read().map_err(|_| Error::CorruptedTaskQueue)?.clone(); - - // ignored for now, might be added to batch details later - let _ = progress; - - let ret = tasks.into_iter(); - if processing.is_empty() || batch.is_none() { - Ok((ret.collect(), total)) - } else { - // Safe because we ensured there was a batch in the previous branch - let batch = batch.unwrap(); - Ok(( - ret.map(|task| { - if processing.contains(task.uid) { - Task { - status: Status::Processing, - batch_uid: Some(batch.uid), - started_at: Some(batch.started_at), - ..task - } - } else { - task - } - }) - .collect(), - total, - )) - } + /// Return the task ids matching the query along with the total number of tasks + /// by ignoring the from and limit parameters from the user's point of view. + /// + /// There are two differences between an internal query and a query executed by + /// the user. + /// + /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated + /// with many indexes internally. + /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. + pub fn get_task_ids_from_authorized_indexes( + &self, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + ) -> Result<(RoaringBitmap, u64)> { + let rtxn = self.read_txn()?; + let processing = self.processing_tasks.read().unwrap(); + self.queue.get_task_ids_from_authorized_indexes(&rtxn, query, filters, &processing) } /// Return the batches matching the query from the user's point of view along @@ -1400,31 +585,35 @@ impl IndexScheduler { /// the user. /// /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated - /// with many indexes internally. + /// with many indexes internally. /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. 
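The four public task/batch getters now borrow the `Query` and forward to `self.queue` under one read transaction plus a snapshot of the in-memory processing tasks, so a single query value can serve both the full-object and the id-only variants without cloning. A hypothetical call site, assuming `Query` implements `Default` and an `index_scheduler: &IndexScheduler` is in scope; the field values are illustrative:

    let query = Query {
        statuses: Some(vec![Status::Enqueued, Status::Processing]),
        limit: Some(20),
        ..Default::default()
    };
    let filters = meilisearch_auth::AuthFilter::default();
    // The same borrowed query drives both calls.
    let (tasks, _total) = index_scheduler.get_tasks_from_authorized_indexes(&query, &filters)?;
    let (task_ids, _) = index_scheduler.get_task_ids_from_authorized_indexes(&query, &filters)?;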
pub fn get_batches_from_authorized_indexes( &self, - query: Query, + query: &Query, filters: &meilisearch_auth::AuthFilter, ) -> Result<(Vec, u64)> { - let rtxn = self.env.read_txn()?; - let processing = self.processing_tasks.read().unwrap().clone(); + let rtxn = self.read_txn()?; + let processing = self.processing_tasks.read().unwrap(); + self.queue.get_batches_from_authorized_indexes(&rtxn, query, filters, &processing) + } - let (batches, total) = - self.get_batch_ids_from_authorized_indexes(&rtxn, &processing, &query, filters)?; - let batches = if query.reverse.unwrap_or_default() { - Box::new(batches.into_iter()) as Box> - } else { - Box::new(batches.into_iter().rev()) as Box> - }; - - let batches = self.get_existing_batches( - &rtxn, - &processing, - batches.take(query.limit.unwrap_or(u32::MAX) as usize), - )?; - - Ok((batches, total)) + /// Return the batch ids matching the query along with the total number of batches + /// by ignoring the from and limit parameters from the user's point of view. + /// + /// There are two differences between an internal query and a query executed by + /// the user. + /// + /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated + /// with many indexes internally. + /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. + pub fn get_batch_ids_from_authorized_indexes( + &self, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + ) -> Result<(RoaringBitmap, u64)> { + let rtxn = self.read_txn()?; + let processing = self.processing_tasks.read().unwrap(); + self.queue.get_batch_ids_from_authorized_indexes(&rtxn, query, filters, &processing) } /// Register a new task in the scheduler. @@ -1436,73 +625,15 @@ impl IndexScheduler { task_id: Option, dry_run: bool, ) -> Result { - let mut wtxn = self.env.write_txn()?; - // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty()) - && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50 + && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40 { return Err(Error::NoSpaceLeftInTaskQueue); } - let next_task_id = self.next_task_id(&wtxn)?; - - if let Some(uid) = task_id { - if uid < next_task_id { - return Err(Error::BadTaskId { received: uid, expected: next_task_id }); - } - } - - let mut task = Task { - uid: task_id.unwrap_or(next_task_id), - // The batch is defined once we starts processing the task - batch_uid: None, - enqueued_at: OffsetDateTime::now_utc(), - started_at: None, - finished_at: None, - error: None, - canceled_by: None, - details: kind.default_details(), - status: Status::Enqueued, - kind: kind.clone(), - }; - // For deletion and cancelation tasks, we want to make extra sure that they - // don't attempt to delete/cancel tasks that are newer than themselves. - filter_out_references_to_newer_tasks(&mut task); - // If the register task is an index swap task, verify that it is well-formed - // (that it does not contain duplicate indexes). - check_index_swap_validity(&task)?; - - // At this point the task is going to be registered and no further checks will be done - if dry_run { - return Ok(task); - } - - // Get rid of the mutability. 
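The queue-fullness guard in `register` is tightened from 50% to 40% of the LMDB map: any task that is not a non-empty `TaskDeletion` is refused once the used pages cross that ratio. With illustrative numbers:

    fn main() {
        let map_size: u64 = 10 * 1024 * 1024 * 1024; // task DB opened with a 10 GiB map
        let non_free_pages_size: u64 = 4_831_838_208; // ~4.5 GiB of pages in use
        let used_ratio = (non_free_pages_size * 100) / map_size;
        assert_eq!(used_ratio, 45);
        // 45 > 40: a plain document addition now gets `NoSpaceLeftInTaskQueue`,
        // while a non-empty `TaskDeletion` is still accepted so the queue can shrink.
        assert!(used_ratio > 40);
    }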
- let task = task; - - self.all_tasks.put_with_flags(&mut wtxn, PutFlags::APPEND, &task.uid, &task)?; - - for index in task.indexes() { - self.update_index(&mut wtxn, index, |bitmap| { - bitmap.insert(task.uid); - })?; - } - - self.update_status(&mut wtxn, Status::Enqueued, |bitmap| { - bitmap.insert(task.uid); - })?; - - self.update_kind(&mut wtxn, task.kind.as_kind(), |bitmap| { - bitmap.insert(task.uid); - })?; - - utils::insert_task_datetime(&mut wtxn, self.enqueued_at, task.enqueued_at, task.uid)?; - - if let Err(e) = wtxn.commit() { - self.delete_persisted_task_data(&task)?; - return Err(e.into()); - } + let mut wtxn = self.env.write_txn()?; + let task = self.queue.register(&mut wtxn, &kind, task_id, dry_run)?; // If the registered task is a task cancelation // we inform the processing tasks to stop (if necessary). @@ -1510,13 +641,17 @@ impl IndexScheduler { let tasks_to_cancel = RoaringBitmap::from_iter(tasks); if self.processing_tasks.read().unwrap().must_cancel_processing_tasks(&tasks_to_cancel) { - self.must_stop_processing.must_stop(); + self.scheduler.must_stop_processing.must_stop(); } } - // notify the scheduler loop to execute a new tick - self.wake_up.signal(); + if let Err(e) = wtxn.commit() { + self.queue.delete_persisted_task_data(&task)?; + return Err(e.into()); + } + // notify the scheduler loop to execute a new tick + self.scheduler.wake_up.signal(); Ok(task) } @@ -1537,248 +672,17 @@ impl IndexScheduler { Ok(index) } - /// Create a file and register it in the index scheduler. - /// - /// The returned file and uuid can be used to associate - /// some data to a task. The file will be kept until - /// the task has been fully processed. - pub fn create_update_file(&self, dry_run: bool) -> Result<(Uuid, file_store::File)> { - if dry_run { - Ok((Uuid::nil(), file_store::File::dry_file()?)) - } else { - Ok(self.file_store.new_update()?) - } - } + pub fn refresh_index_stats(&self, name: &str) -> Result<()> { + let mut mapper_wtxn = self.env.write_txn()?; + let index = self.index_mapper.index(&mapper_wtxn, name)?; + let index_rtxn = index.read_txn()?; - #[cfg(test)] - pub fn create_update_file_with_uuid(&self, uuid: u128) -> Result<(Uuid, file_store::File)> { - Ok(self.file_store.new_update_with_uuid(uuid)?) - } + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(name.to_string())))?; - /// The size on disk taken by all the updates files contained in the `IndexScheduler`, in bytes. - pub fn compute_update_file_size(&self) -> Result { - Ok(self.file_store.compute_total_size()?) - } - - /// Delete a file from the index scheduler. - /// - /// Counterpart to the [`create_update_file`](IndexScheduler::create_update_file) method. - pub fn delete_update_file(&self, uuid: Uuid) -> Result<()> { - Ok(self.file_store.delete(uuid)?) - } - - /// Perform one iteration of the run loop. - /// - /// 1. See if we need to cleanup the task queue - /// 2. Find the next batch of tasks to be processed. - /// 3. Update the information of these tasks following the start of their processing. - /// 4. Update the in-memory list of processed tasks accordingly. - /// 5. Process the batch: - /// - perform the actions of each batched task - /// - update the information of each batched task following the end - /// of their processing. - /// 6. Reset the in-memory list of processed tasks. - /// - /// Returns the number of processed tasks. 
- fn tick(&self) -> Result { - #[cfg(test)] - { - *self.run_loop_iteration.write().unwrap() += 1; - self.breakpoint(Breakpoint::Start); - } - - if self.cleanup_enabled { - self.cleanup_task_queue()?; - } - - let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; - let (batch, mut processing_batch) = - match self.create_next_batch(&rtxn).map_err(|e| Error::CreateBatch(Box::new(e)))? { - Some(batch) => batch, - None => return Ok(TickOutcome::WaitForSignal), - }; - let index_uid = batch.index_uid().map(ToOwned::to_owned); - drop(rtxn); - - // 1. store the starting date with the bitmap of processing tasks. - let mut ids = batch.ids(); - let processed_tasks = ids.len(); - - // We reset the must_stop flag to be sure that we don't stop processing tasks - self.must_stop_processing.reset(); - self.processing_tasks - .write() - .unwrap() - // We can clone the processing batch here because we don't want its modification to affect the view of the processing batches - .start_processing(processing_batch.clone(), ids.clone()); - - #[cfg(test)] - self.breakpoint(Breakpoint::BatchCreated); - - // 2. Process the tasks - let res = { - let cloned_index_scheduler = self.private_clone(); - let processing_batch = &mut processing_batch; - std::thread::scope(|s| { - let handle = std::thread::Builder::new() - .name(String::from("batch-operation")) - .spawn_scoped(s, move || { - cloned_index_scheduler.process_batch(batch, processing_batch) - }) - .unwrap(); - handle.join().unwrap_or(Err(Error::ProcessBatchPanicked)) - }) - }; - - // Reset the currently updating index to relinquish the index handle - self.index_mapper.set_currently_updating_index(None); - - #[cfg(test)] - self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?; - - processing_batch.finished(); - let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; - let mut canceled = RoaringBitmap::new(); - - match res { - Ok(tasks) => { - #[cfg(test)] - self.breakpoint(Breakpoint::ProcessBatchSucceeded); - - let mut success = 0; - let mut failure = 0; - let mut canceled_by = None; - - #[allow(unused_variables)] - for (i, mut task) in tasks.into_iter().enumerate() { - processing_batch.update(&mut task); - if task.status == Status::Canceled { - canceled.insert(task.uid); - canceled_by = task.canceled_by; - } - - #[cfg(test)] - self.maybe_fail( - tests::FailureLocation::UpdatingTaskAfterProcessBatchSuccess { - task_uid: i as u32, - }, - )?; - - match task.error { - Some(_) => failure += 1, - None => success += 1, - } - - self.update_task(&mut wtxn, &task) - .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; - } - if let Some(canceled_by) = canceled_by { - self.canceled_by.put(&mut wtxn, &canceled_by, &canceled)?; - } - tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); - } - // If we have an abortion error we must stop the tick here and re-schedule tasks. - Err(Error::Milli(milli::Error::InternalError( - milli::InternalError::AbortedIndexation, - ))) - | Err(Error::AbortedTask) => { - #[cfg(test)] - self.breakpoint(Breakpoint::AbortedIndexation); - wtxn.abort(); - - tracing::info!("A batch of tasks was aborted."); - // We make sure that we don't call `stop_processing` on the `processing_tasks`, - // this is because we want to let the next tick call `create_next_batch` and keep - // the `started_at` date times and `processings` of the current processing tasks. 
- // This date time is used by the task cancelation to store the right `started_at` - // date in the task on disk. - return Ok(TickOutcome::TickAgain(0)); - } - // If an index said it was full, we need to: - // 1. identify which index is full - // 2. close the associated environment - // 3. resize it - // 4. re-schedule tasks - Err(Error::Milli(milli::Error::UserError( - milli::UserError::MaxDatabaseSizeReached, - ))) if index_uid.is_some() => { - // fixme: add index_uid to match to avoid the unwrap - let index_uid = index_uid.unwrap(); - // fixme: handle error more gracefully? not sure when this could happen - self.index_mapper.resize_index(&wtxn, &index_uid)?; - wtxn.abort(); - - tracing::info!("The max database size was reached. Resizing the index."); - - return Ok(TickOutcome::TickAgain(0)); - } - // In case of a failure we must get back and patch all the tasks with the error. - Err(err) => { - #[cfg(test)] - self.breakpoint(Breakpoint::ProcessBatchFailed); - let error: ResponseError = err.into(); - for id in ids.iter() { - let mut task = self - .get_task(&wtxn, id) - .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? - .ok_or(Error::CorruptedTaskQueue)?; - task.status = Status::Failed; - task.error = Some(error.clone()); - task.details = task.details.map(|d| d.to_failed()); - processing_batch.update(&mut task); - - #[cfg(test)] - self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - - tracing::error!("Batch failed {}", error); - - self.update_task(&mut wtxn, &task) - .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; - } - } - } - - self.processing_tasks.write().unwrap().stop_processing(); - // We must re-add the canceled task so they're part of the same batch. - // processed.processing |= canceled; - ids |= canceled; - - self.write_batch(&mut wtxn, processing_batch, &ids)?; - - #[cfg(test)] - self.maybe_fail(tests::FailureLocation::CommittingWtxn)?; - - wtxn.commit().map_err(Error::HeedTransaction)?; - - // Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart - tracing::debug!("Deleting the update files"); - - //We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap - let idx = AtomicU32::new(0); - (0..current_num_threads()).into_par_iter().try_for_each(|_| -> Result<()> { - let rtxn = self.read_txn()?; - while let Some(id) = ids.select(idx.fetch_add(1, Ordering::Relaxed)) { - let task = self - .get_task(&rtxn, id) - .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))? - .ok_or(Error::CorruptedTaskQueue)?; - if let Err(e) = self.delete_persisted_task_data(&task) { - tracing::error!( - "Failure to delete the content files associated with task {}. Error: {e}", - task.uid - ); - } - } - Ok(()) - })?; - - // We shouldn't crash the tick function if we can't send data to the webhook. - let _ = self.notify_webhook(&ids); - - #[cfg(test)] - self.breakpoint(Breakpoint::AfterProcessing); - - Ok(TickOutcome::TickAgain(processed_tasks)) + self.index_mapper.store_stats_of(&mut mapper_wtxn, name, &stats)?; + mapper_wtxn.commit()?; + Ok(()) } /// Once the tasks changes have been committed we must send all the tasks that were updated to our webhook if there is one. 
@@ -1792,7 +696,7 @@ impl IndexScheduler { written: usize, } - impl<'a, 'b> Read for TaskReader<'a, 'b> { + impl Read for TaskReader<'_, '_> { fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { if self.buffer.is_empty() { match self.tasks.next() { @@ -1800,6 +704,8 @@ impl IndexScheduler { Some(task_id) => { let task = self .index_scheduler + .queue + .tasks .get_task(self.rtxn, task_id) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))? .ok_or_else(|| { @@ -1861,59 +767,6 @@ impl IndexScheduler { Ok(()) } - /// Register a task to cleanup the task queue if needed - fn cleanup_task_queue(&self) -> Result<()> { - let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; - - let nb_tasks = self.all_task_ids(&rtxn)?.len(); - // if we have less than 1M tasks everything is fine - if nb_tasks < self.max_number_of_tasks as u64 { - return Ok(()); - } - - let finished = self.status.get(&rtxn, &Status::Succeeded)?.unwrap_or_default() - | self.status.get(&rtxn, &Status::Failed)?.unwrap_or_default() - | self.status.get(&rtxn, &Status::Canceled)?.unwrap_or_default(); - - let to_delete = RoaringBitmap::from_iter(finished.into_iter().rev().take(100_000)); - - // /!\ the len must be at least 2 or else we might enter an infinite loop where we only delete - // the deletion tasks we enqueued ourselves. - if to_delete.len() < 2 { - tracing::warn!("The task queue is almost full, but no task can be deleted yet."); - // the only thing we can do is hope that the user tasks are going to finish - return Ok(()); - } - - tracing::info!( - "The task queue is almost full. Deleting the oldest {} finished tasks.", - to_delete.len() - ); - - // it's safe to unwrap here because we checked the len above - let newest_task_id = to_delete.iter().last().unwrap(); - let last_task_to_delete = - self.get_task(&rtxn, newest_task_id)?.ok_or(Error::CorruptedTaskQueue)?; - drop(rtxn); - - // increase time by one nanosecond so that the enqueuedAt of the last task to delete is also lower than that date. - let delete_before = last_task_to_delete.enqueued_at + Duration::from_nanos(1); - - self.register( - KindWithContent::TaskDeletion { - query: format!( - "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", - delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, - ), - tasks: to_delete, - }, - None, - false, - )?; - - Ok(()) - } - pub fn index_stats(&self, index_uid: &str) -> Result { let is_indexing = self.is_index_processing(index_uid)?; let rtxn = self.read_txn()?; @@ -1932,16 +785,19 @@ impl IndexScheduler { Ok(()) } - pub(crate) fn delete_persisted_task_data(&self, task: &Task) -> Result<()> { - match task.content_uuid() { - Some(content_file) => self.delete_update_file(content_file), - None => Ok(()), - } + pub fn put_network(&self, network: Network) -> Result<()> { + let wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; + self.features.put_network(wtxn, network)?; + Ok(()) + } + + pub fn network(&self) -> Network { + self.features.network() } - // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, + index_uid: String, embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs @@ -1952,8 +808,12 @@ impl IndexScheduler { config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, .. 
}| { - let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + let prompt = Arc::new( + prompt + .try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + ); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); @@ -1967,9 +827,11 @@ impl IndexScheduler { // add missing embedder let embedder = Arc::new( - Embedder::new(embedder_options.clone()) + Embedder::new(embedder_options.clone(), self.scheduler.embedding_cache_cap) .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, + .map_err(|err| { + Error::from_milli(err.into(), Some(index_uid.clone())) + })?, ); { let mut embedders = self.embedders.write().unwrap(); @@ -1981,223 +843,6 @@ impl IndexScheduler { .collect(); res.map(EmbeddingConfigs::new) } - - /// Blocks the thread until the test handle asks to progress to/through this breakpoint. - /// - /// Two messages are sent through the channel for each breakpoint. - /// The first message is `(b, false)` and the second message is `(b, true)`. - /// - /// Since the channel has a capacity of zero, the `send` and `recv` calls wait for each other. - /// So when the index scheduler calls `test_breakpoint_sdr.send(b, false)`, it blocks - /// the thread until the test catches up by calling `test_breakpoint_rcv.recv()` enough. - /// From the test side, we call `recv()` repeatedly until we find the message `(breakpoint, false)`. - /// As soon as we find it, the index scheduler is unblocked but then wait again on the call to - /// `test_breakpoint_sdr.send(b, true)`. This message will only be able to send once the - /// test asks to progress to the next `(b2, false)`. - #[cfg(test)] - fn breakpoint(&self, b: Breakpoint) { - // We send two messages. The first one will sync with the call - // to `handle.wait_until(b)`. The second one will block until the - // the next call to `handle.wait_until(..)`. - self.test_breakpoint_sdr.send((b, false)).unwrap(); - // This one will only be able to be sent if the test handle stays alive. - // If it fails, then it means that we have exited the test. - // By crashing with `unwrap`, we kill the run loop. - self.test_breakpoint_sdr.send((b, true)).unwrap(); - } -} - -pub struct Dump<'a> { - index_scheduler: &'a IndexScheduler, - wtxn: RwTxn<'a>, - - indexes: HashMap, - statuses: HashMap, - kinds: HashMap, -} - -impl<'a> Dump<'a> { - pub(crate) fn new(index_scheduler: &'a mut IndexScheduler) -> Result { - // While loading a dump no one should be able to access the scheduler thus I can block everything. - let wtxn = index_scheduler.env.write_txn()?; - - Ok(Dump { - index_scheduler, - wtxn, - indexes: HashMap::new(), - statuses: HashMap::new(), - kinds: HashMap::new(), - }) - } - - /// Register a new task coming from a dump in the scheduler. - /// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running. 
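The embedder cache keeps its two-step shape, now with the index uid attached to every error and the scheduler's `embedding_cache_cap` handed to `Embedder::new`: a cheap shared-lock probe first, and the write lock only when an embedder actually has to be built. The generic pattern, with placeholder types standing in for the real embedder ones:

    use std::collections::HashMap;
    use std::sync::{Arc, RwLock};

    // Stand-in for an expensive-to-build resource such as an embedder.
    struct Resource(u32);

    fn get_or_build(cache: &RwLock<HashMap<String, Arc<Resource>>>, key: &str) -> Arc<Resource> {
        // Optimistic path: most lookups succeed under the read lock.
        if let Some(found) = cache.read().unwrap().get(key) {
            return Arc::clone(found);
        }
        // Slow path: build outside the map, then publish under the write lock.
        // `or_insert` keeps whatever another thread may have inserted in between.
        let built = Arc::new(Resource(42));
        cache.write().unwrap().entry(key.to_string()).or_insert(built).clone()
    }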
- pub fn register_dumped_task( - &mut self, - task: TaskDump, - content_file: Option>, - ) -> Result { - let content_uuid = match content_file { - Some(content_file) if task.status == Status::Enqueued => { - let (uuid, mut file) = self.index_scheduler.create_update_file(false)?; - let mut builder = DocumentsBatchBuilder::new(&mut file); - for doc in content_file { - builder.append_json_object(&doc?)?; - } - builder.into_inner()?; - file.persist()?; - - Some(uuid) - } - // If the task isn't `Enqueued` then just generate a recognisable `Uuid` - // in case we try to open it later. - _ if task.status != Status::Enqueued => Some(Uuid::nil()), - _ => None, - }; - - let task = Task { - uid: task.uid, - batch_uid: task.batch_uid, - enqueued_at: task.enqueued_at, - started_at: task.started_at, - finished_at: task.finished_at, - error: task.error, - canceled_by: task.canceled_by, - details: task.details, - status: task.status, - kind: match task.kind { - KindDump::DocumentImport { - primary_key, - method, - documents_count, - allow_index_creation, - } => KindWithContent::DocumentAdditionOrUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - method, - content_file: content_uuid.ok_or(Error::CorruptedDump)?, - documents_count, - allow_index_creation, - }, - KindDump::DocumentDeletion { documents_ids } => KindWithContent::DocumentDeletion { - documents_ids, - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::DocumentDeletionByFilter { filter } => { - KindWithContent::DocumentDeletionByFilter { - filter_expr: filter, - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - } - } - KindDump::DocumentEdition { filter, context, function } => { - KindWithContent::DocumentEdition { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - filter_expr: filter, - context, - function, - } - } - KindDump::DocumentClear => KindWithContent::DocumentClear { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::Settings { settings, is_deletion, allow_index_creation } => { - KindWithContent::SettingsUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - new_settings: settings, - is_deletion, - allow_index_creation, - } - } - KindDump::IndexDeletion => KindWithContent::IndexDeletion { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - }, - KindDump::IndexCreation { primary_key } => KindWithContent::IndexCreation { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - }, - KindDump::IndexUpdate { primary_key } => KindWithContent::IndexUpdate { - index_uid: task.index_uid.ok_or(Error::CorruptedDump)?, - primary_key, - }, - KindDump::IndexSwap { swaps } => KindWithContent::IndexSwap { swaps }, - KindDump::TaskCancelation { query, tasks } => { - KindWithContent::TaskCancelation { query, tasks } - } - KindDump::TasksDeletion { query, tasks } => { - KindWithContent::TaskDeletion { query, tasks } - } - KindDump::DumpCreation { keys, instance_uid } => { - KindWithContent::DumpCreation { keys, instance_uid } - } - KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - }, - }; - - self.index_scheduler.all_tasks.put(&mut self.wtxn, &task.uid, &task)?; - - for index in task.indexes() { - match self.indexes.get_mut(index) { - Some(bitmap) => { - bitmap.insert(task.uid); - } - None => { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(task.uid); - self.indexes.insert(index.to_string(), bitmap); - } - }; - } - - utils::insert_task_datetime( - &mut self.wtxn, - 
self.index_scheduler.enqueued_at, - task.enqueued_at, - task.uid, - )?; - - // we can't override the started_at & finished_at, so we must only set it if the tasks is finished and won't change - if matches!(task.status, Status::Succeeded | Status::Failed | Status::Canceled) { - if let Some(started_at) = task.started_at { - utils::insert_task_datetime( - &mut self.wtxn, - self.index_scheduler.started_at, - started_at, - task.uid, - )?; - } - if let Some(finished_at) = task.finished_at { - utils::insert_task_datetime( - &mut self.wtxn, - self.index_scheduler.finished_at, - finished_at, - task.uid, - )?; - } - } - - self.statuses.entry(task.status).or_default().insert(task.uid); - self.kinds.entry(task.kind.as_kind()).or_default().insert(task.uid); - - Ok(task) - } - - /// Commit all the changes and exit the importing dump state - pub fn finish(mut self) -> Result<()> { - for (index, bitmap) in self.indexes { - self.index_scheduler.index_tasks.put(&mut self.wtxn, &index, &bitmap)?; - } - for (status, bitmap) in self.statuses { - self.index_scheduler.put_status(&mut self.wtxn, status, &bitmap)?; - } - for (kind, bitmap) in self.kinds { - self.index_scheduler.put_kind(&mut self.wtxn, kind, &bitmap)?; - } - - self.wtxn.commit()?; - self.index_scheduler.wake_up.signal(); - - Ok(()) - } } /// The outcome of calling the [`IndexScheduler::tick`] function. @@ -2208,6 +853,8 @@ pub enum TickOutcome { TickAgain(u64), /// The scheduler should wait for an external signal before attempting another `tick`. WaitForSignal, + /// The scheduler exits the run-loop and will never process tasks again + StopProcessingForever, } /// How many indexes we can afford to have open simultaneously. @@ -2230,4660 +877,3 @@ pub struct IndexStats { /// Internal stats computed from the index. 
pub inner_stats: index_mapper::IndexStats, } - -#[cfg(test)] -mod tests { - use std::io::{BufWriter, Write}; - use std::time::Instant; - - use big_s::S; - use crossbeam::channel::RecvTimeoutError; - use file_store::File; - use insta::assert_json_snapshot; - use maplit::btreeset; - use meili_snap::{json_string, snapshot}; - use meilisearch_auth::AuthFilter; - use meilisearch_types::document_formats::DocumentFormatError; - use meilisearch_types::error::ErrorCode; - use meilisearch_types::index_uid_pattern::IndexUidPattern; - use meilisearch_types::milli::obkv_to_json; - use meilisearch_types::milli::update::IndexDocumentsMethod::{ - ReplaceDocuments, UpdateDocuments, - }; - use meilisearch_types::milli::update::Setting; - use meilisearch_types::milli::vector::settings::EmbeddingSettings; - use meilisearch_types::settings::Unchecked; - use meilisearch_types::tasks::IndexSwap; - use meilisearch_types::VERSION_FILE_NAME; - use tempfile::{NamedTempFile, TempDir}; - use time::Duration; - use uuid::Uuid; - use Breakpoint::*; - - use super::*; - use crate::insta_snapshot::{snapshot_bitmap, snapshot_index_scheduler}; - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub enum FailureLocation { - InsideCreateBatch, - InsideProcessBatch, - PanicInsideProcessBatch, - AcquiringWtxn, - UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 }, - UpdatingTaskAfterProcessBatchFailure, - CommittingWtxn, - } - - impl IndexScheduler { - pub fn test( - autobatching_enabled: bool, - planned_failures: Vec<(usize, FailureLocation)>, - ) -> (Self, IndexSchedulerHandle) { - Self::test_with_custom_config(planned_failures, |config| { - config.autobatching_enabled = autobatching_enabled; - }) - } - - pub fn test_with_custom_config( - planned_failures: Vec<(usize, FailureLocation)>, - configuration: impl Fn(&mut IndexSchedulerOptions), - ) -> (Self, IndexSchedulerHandle) { - let tempdir = TempDir::new().unwrap(); - let (sender, receiver) = crossbeam::channel::bounded(0); - - let indexer_config = IndexerConfig { skip_index_budget: true, ..Default::default() }; - - let mut options = IndexSchedulerOptions { - version_file_path: tempdir.path().join(VERSION_FILE_NAME), - auth_path: tempdir.path().join("auth"), - tasks_path: tempdir.path().join("db_path"), - update_file_path: tempdir.path().join("file_store"), - indexes_path: tempdir.path().join("indexes"), - snapshots_path: tempdir.path().join("snapshots"), - dumps_path: tempdir.path().join("dumps"), - webhook_url: None, - webhook_authorization_header: None, - task_db_size: 1000 * 1000 * 10, // 10 MB, we don't use MiB on purpose. - index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. - enable_mdb_writemap: false, - index_growth_amount: 1000 * 1000 * 1000 * 1000, // 1 TB - index_count: 5, - indexer_config, - autobatching_enabled: true, - cleanup_enabled: true, - max_number_of_tasks: 1_000_000, - max_number_of_batched_tasks: usize::MAX, - instance_features: Default::default(), - }; - configuration(&mut options); - - let index_scheduler = Self::new(options, sender, planned_failures).unwrap(); - - // To be 100% consistent between all test we're going to start the scheduler right now - // and ensure it's in the expected starting state. 
- let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(10)) { - Ok(b) => b, - Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") - } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), - }; - assert_eq!(breakpoint, (Init, false)); - let index_scheduler_handle = IndexSchedulerHandle { - _tempdir: tempdir, - index_scheduler: index_scheduler.private_clone(), - test_breakpoint_rcv: receiver, - last_breakpoint: breakpoint.0, - }; - - (index_scheduler, index_scheduler_handle) - } - - /// Return a [`PlannedFailure`](Error::PlannedFailure) error if a failure is planned - /// for the given location and current run loop iteration. - pub fn maybe_fail(&self, location: FailureLocation) -> Result<()> { - if self.planned_failures.contains(&(*self.run_loop_iteration.read().unwrap(), location)) - { - match location { - FailureLocation::PanicInsideProcessBatch => { - panic!("simulated panic") - } - _ => Err(Error::PlannedFailure), - } - } else { - Ok(()) - } - } - } - - /// Return a `KindWithContent::IndexCreation` task - fn index_creation_task(index: &'static str, primary_key: &'static str) -> KindWithContent { - KindWithContent::IndexCreation { index_uid: S(index), primary_key: Some(S(primary_key)) } - } - /// Create a `KindWithContent::DocumentImport` task that imports documents. - /// - /// - `index_uid` is given as parameter - /// - `primary_key` is given as parameter - /// - `method` is set to `ReplaceDocuments` - /// - `content_file` is given as parameter - /// - `documents_count` is given as parameter - /// - `allow_index_creation` is set to `true` - fn replace_document_import_task( - index: &'static str, - primary_key: Option<&'static str>, - content_file_uuid: u128, - documents_count: u64, - ) -> KindWithContent { - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S(index), - primary_key: primary_key.map(ToOwned::to_owned), - method: ReplaceDocuments, - content_file: Uuid::from_u128(content_file_uuid), - documents_count, - allow_index_creation: true, - } - } - - /// Adapting to the new json reading interface - pub fn read_json( - bytes: &[u8], - write: impl Write, - ) -> std::result::Result { - let temp_file = NamedTempFile::new().unwrap(); - let mut buffer = BufWriter::new(temp_file.reopen().unwrap()); - buffer.write_all(bytes).unwrap(); - buffer.flush().unwrap(); - meilisearch_types::document_formats::read_json(temp_file.as_file(), write) - } - - /// Create an update file with the given file uuid. - /// - /// The update file contains just one simple document whose id is given by `document_id`. - /// - /// The uuid of the file and its documents count is returned. - fn sample_documents( - index_scheduler: &IndexScheduler, - file_uuid: u128, - document_id: usize, - ) -> (File, u64) { - let content = format!( - r#" - {{ - "id" : "{document_id}" - }}"# - ); - - let (_uuid, mut file) = index_scheduler.create_update_file_with_uuid(file_uuid).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - (file, documents_count) - } - - pub struct IndexSchedulerHandle { - _tempdir: TempDir, - index_scheduler: IndexScheduler, - test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, - last_breakpoint: Breakpoint, - } - - impl IndexSchedulerHandle { - /// Advance the scheduler to the next tick. - /// Panic - /// * If the scheduler is waiting for a task to be registered. - /// * If the breakpoint queue is in a bad state. 
- #[track_caller] - fn advance(&mut self) -> Breakpoint { - let (breakpoint_1, b) = match self - .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(50)) - { - Ok(b) => b, - Err(RecvTimeoutError::Timeout) => { - let state = snapshot_index_scheduler(&self.index_scheduler); - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") - } - Err(RecvTimeoutError::Disconnected) => { - let state = snapshot_index_scheduler(&self.index_scheduler); - panic!("The scheduler crashed.\n{state}") - } - }; - // if we've already encountered a breakpoint we're supposed to be stuck on the false - // and we expect the same variant with the true to come now. - assert_eq!( - (breakpoint_1, b), - (self.last_breakpoint, true), - "Internal error in the test suite. In the previous iteration I got `({:?}, false)` and now I got `({:?}, {:?})`.", - self.last_breakpoint, - breakpoint_1, - b, - ); - - let (breakpoint_2, b) = match self - .test_breakpoint_rcv - .recv_timeout(std::time::Duration::from_secs(50)) - { - Ok(b) => b, - Err(RecvTimeoutError::Timeout) => { - let state = snapshot_index_scheduler(&self.index_scheduler); - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") - } - Err(RecvTimeoutError::Disconnected) => { - let state = snapshot_index_scheduler(&self.index_scheduler); - panic!("The scheduler crashed.\n{state}") - } - }; - assert!(!b, "Found the breakpoint handle in a bad state. Check your test suite"); - - self.last_breakpoint = breakpoint_2; - - breakpoint_2 - } - - /// Advance the scheduler until all the provided breakpoints are reached in order. - #[track_caller] - fn advance_till(&mut self, breakpoints: impl IntoIterator) { - for breakpoint in breakpoints { - let b = self.advance(); - assert_eq!( - b, - breakpoint, - "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{}", - breakpoint, - b, - snapshot_index_scheduler(&self.index_scheduler) - ); - } - } - - /// Wait for `n` successful batches. - #[track_caller] - fn advance_n_successful_batches(&mut self, n: usize) { - for _ in 0..n { - self.advance_one_successful_batch(); - } - } - - /// Wait for `n` failed batches. - #[track_caller] - fn advance_n_failed_batches(&mut self, n: usize) { - for _ in 0..n { - self.advance_one_failed_batch(); - } - } - - // Wait for one successful batch. - #[track_caller] - fn advance_one_successful_batch(&mut self) { - self.advance_till([Start, BatchCreated]); - loop { - match self.advance() { - // the process_batch function can call itself recursively, thus we need to - // accept as may InsideProcessBatch as possible before moving to the next state. - InsideProcessBatch => (), - // the batch went successfully, we can stop the loop and go on with the next states. - ProcessBatchSucceeded => break, - AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), - ProcessBatchFailed => { - while self.advance() != Start {} - panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) - }, - breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), - } - } - - self.advance_till([AfterProcessing]); - } - - // Wait for one failed batch. 
- #[track_caller] - fn advance_one_failed_batch(&mut self) { - self.advance_till([Start, BatchCreated]); - loop { - match self.advance() { - // the process_batch function can call itself recursively, thus we need to - // accept as may InsideProcessBatch as possible before moving to the next state. - InsideProcessBatch => (), - // the batch went failed, we can stop the loop and go on with the next states. - ProcessBatchFailed => break, - ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)), - AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), - breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), - } - } - self.advance_till([AfterProcessing]); - } - } - - #[test] - fn register() { - // In this test, the handle doesn't make any progress, we only check that the tasks are registered - let (index_scheduler, mut _handle) = IndexScheduler::test(true, vec![]); - - let kinds = [ - index_creation_task("catto", "mouse"), - replace_document_import_task("catto", None, 0, 12), - replace_document_import_task("catto", None, 1, 50), - replace_document_import_task("doggo", Some("bone"), 2, 5000), - ]; - let (_, file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - file.persist().unwrap(); - let (_, file) = index_scheduler.create_update_file_with_uuid(1).unwrap(); - file.persist().unwrap(); - let (_, file) = index_scheduler.create_update_file_with_uuid(2).unwrap(); - file.persist().unwrap(); - - for (idx, kind) in kinds.into_iter().enumerate() { - let k = kind.as_kind(); - let task = index_scheduler.register(kind, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - - assert_eq!(task.uid, idx as u32); - assert_eq!(task.status, Status::Enqueued); - assert_eq!(task.kind.as_kind(), k); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_successfully_registered"); - } - - #[test] - fn insert_task_while_another_task_is_processing() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - handle.advance_till([Start, BatchCreated]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); - - // while the task is processing can we register another task? - index_scheduler.register(index_creation_task("index_b", "id"), None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - } - - #[test] - fn test_task_is_processing() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); - - handle.advance_till([Start, BatchCreated]); - assert!(index_scheduler.is_task_processing().unwrap()); - } - - /// We send a lot of tasks but notify the tasks scheduler only once as - /// we send them very fast, we must make sure that they are all processed. 
- #[test] - fn process_tasks_inserted_without_new_signal() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_second_task"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_third_task"); - } - - #[test] - fn process_tasks_without_autobatching() { - let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - - index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fourth"); - } - - #[test] - fn task_deletion_undeleteable() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); - file0.persist().unwrap(); - file1.persist().unwrap(); - - let to_enqueue = [ - index_creation_task("catto", "mouse"), - replace_document_import_task("catto", None, 0, documents_count0), - replace_document_import_task("doggo", Some("bone"), 1, documents_count1), - ]; - - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - - // here we have registered all the tasks, but the index scheduler - // has not progressed at all - snapshot!(snapshot_index_scheduler(&index_scheduler), name: 
"initial_tasks_enqueued"); - - index_scheduler - .register( - KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1]), - }, - None, - false, - ) - .unwrap(); - // again, no progress made at all, but one more task is registered - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_enqueued"); - - // now we create the first batch - handle.advance_till([Start, BatchCreated]); - - // the task deletion should now be "processing" - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processing"); - - handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); - // after the task deletion is processed, no task should actually have been deleted, - // because the tasks with ids 0 and 1 were still "enqueued", and thus undeleteable - // the "task deletion" task should be marked as "succeeded" and, in its details, the - // number of deleted tasks should be 0 - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_done"); - } - - #[test] - fn task_deletion_deleteable() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); - file0.persist().unwrap(); - file1.persist().unwrap(); - - let to_enqueue = [ - replace_document_import_task("catto", None, 0, documents_count0), - replace_document_import_task("doggo", Some("bone"), 1, documents_count1), - ]; - - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); - - handle.advance_one_successful_batch(); - // first addition of documents should be successful - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_processed"); - - // Now we delete the first task - index_scheduler - .register( - KindWithContent::TaskDeletion { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processed"); - } - - #[test] - fn task_deletion_delete_same_task_twice() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); - file0.persist().unwrap(); - file1.persist().unwrap(); - - let to_enqueue = [ - replace_document_import_task("catto", None, 0, documents_count0), - replace_document_import_task("doggo", Some("bone"), 1, documents_count1), - ]; - - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); - - handle.advance_one_successful_batch(); - // first addition of documents should be successful - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_processed"); - - // Now we delete the first task multiple times in a row - for _ in 0..2 { - index_scheduler - .register( - KindWithContent::TaskDeletion { - query: 
"test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - handle.advance_one_successful_batch(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processed"); - } - - #[test] - fn document_addition() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let content = r#" - { - "id": 1, - "doggo": "bob" - }"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); - - handle.advance_till([Start, BatchCreated]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_the_batch_creation"); - - handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "once_everything_is_processed"); - } - - #[test] - fn document_addition_and_index_deletion() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let content = r#" - { - "id": 1, - "doggo": "bob" - }"#; - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - - handle.advance_one_successful_batch(); // The index creation. - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "before_index_creation"); - handle.advance_one_successful_batch(); // // after the execution of the two tasks in a single batch. 
- snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded"); - } - - #[test] - fn document_addition_and_document_deletion() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let content = r#"[ - { "id": 1, "doggo": "jean bob" }, - { "id": 2, "catto": "jorts" }, - { "id": 3, "doggo": "bork" } - ]"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - index_scheduler - .register( - KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_batch"); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn document_deletion_and_document_addition() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - index_scheduler - .register( - KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1"), S("2")], - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - let content = r#"[ - { "id": 1, "doggo": "jean bob" }, - { "id": 2, "catto": "jorts" }, - { "id": 3, "doggo": "bork" } - ]"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - - // The deletion should have failed because it can't create an index - handle.advance_one_failed_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion"); - - // The addition should works - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition"); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - 
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn fail_in_process_batch_for_document_deletion() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - use meilisearch_types::settings::{Settings, Unchecked}; - let mut new_settings: Box> = Box::default(); - new_settings.filterable_attributes = Setting::Set(btreeset!(S("catto"))); - - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings, - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - - let content = r#"[ - { "id": 1, "doggo": "jean bob" }, - { "id": 2, "catto": "jorts" }, - { "id": 3, "doggo": "bork" } - ]"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_setting_and_document_addition"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_adding_the_settings"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_adding_the_documents"); - - index_scheduler - .register( - KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1")], - }, - None, - false, - ) - .unwrap(); - // This one should not be catched by Meilisearch but it's still nice to handle it because if one day we break the filters it could happens - index_scheduler - .register( - KindWithContent::DocumentDeletionByFilter { - index_uid: S("doggos"), - filter_expr: serde_json::json!(true), - }, - None, - false, - ) - .unwrap(); - // Should fail because the ids are not filterable - index_scheduler - .register( - KindWithContent::DocumentDeletionByFilter { - index_uid: S("doggos"), - filter_expr: serde_json::json!("id = 2"), - }, - None, - false, - ) - .unwrap(); - index_scheduler - .register( - KindWithContent::DocumentDeletionByFilter { - index_uid: S("doggos"), - filter_expr: serde_json::json!("catto EXISTS"), - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_document_deletions"); - - // Everything should be batched together - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_removing_the_documents"); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork"); - } - - #[test] - fn do_not_batch_task_of_different_indexes() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - let index_names = ["doggos", "cattos", "girafos"]; - - for 
name in index_names { - index_scheduler - .register( - KindWithContent::IndexCreation { - index_uid: name.to_string(), - primary_key: None, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - for name in index_names { - index_scheduler - .register( - KindWithContent::DocumentClear { index_uid: name.to_string() }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - for _ in 0..(index_names.len() * 2) { - handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - } - - #[test] - fn swap_indexes() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let to_enqueue = [ - index_creation_task("a", "id"), - index_creation_task("b", "id"), - index_creation_task("c", "id"), - index_creation_task("d", "id"), - ]; - - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_a"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_b"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_c"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_d"); - - index_scheduler - .register( - KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, - ], - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); - index_scheduler - .register( - KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_processed"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); - - index_scheduler - .register(KindWithContent::IndexSwap { swaps: vec![] }, None, false) - .unwrap(); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); - } - - #[test] - fn swap_indexes_errors() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let to_enqueue = [ - index_creation_task("a", "id"), - index_creation_task("b", "id"), - index_creation_task("c", "id"), - index_creation_task("d", "id"), - ]; - - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - handle.advance_n_successful_batches(4); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_the_index_creation"); - - let first_snap = snapshot_index_scheduler(&index_scheduler); - snapshot!(first_snap, name: "initial_tasks_processed"); - - let err = index_scheduler - .register( - KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, - ], - }, - 
None, - false, - ) - .unwrap_err(); - snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); - - let second_snap = snapshot_index_scheduler(&index_scheduler); - assert_eq!(first_snap, second_snap); - - // Index `e` does not exist, but we don't check its existence yet - index_scheduler - .register( - KindWithContent::IndexSwap { - swaps: vec![ - IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, - IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, - IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, - ], - }, - None, - false, - ) - .unwrap(); - handle.advance_one_failed_batch(); - // Now the first swap should have an error message saying `e` and `f` do not exist - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_failed"); - } - - #[test] - fn document_addition_and_index_deletion_on_unexisting_index() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let content = r#" - { - "id": 1, - "doggo": "bob" - }"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler - .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) - .unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler)); - - handle.advance_n_successful_batches(1); - - snapshot!(snapshot_index_scheduler(&index_scheduler)); - } - - #[test] - fn cancel_enqueued_task() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - file0.persist().unwrap(); - - let to_enqueue = [ - replace_document_import_task("catto", None, 0, documents_count0), - KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }, - ]; - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); - } - - #[test] - fn cancel_succeeded_task() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - file0.persist().unwrap(); - - let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processed"); - - index_scheduler - .register( - KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }, - None, - false, - ) - .unwrap(); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); - } - - #[test] - fn cancel_processing_task() { - let (index_scheduler, 
mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - file0.persist().unwrap(); - - let _ = index_scheduler - .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processing"); - - index_scheduler - .register( - KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }, - None, - false, - ) - .unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_task_registered"); - // Now we check that we can reach the AbortedIndexation error handling - handle.advance_till([AbortedIndexation]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "aborted_indexation"); - - // handle.advance_till([Start, BatchCreated, BeforeProcessing, AfterProcessing]); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); - } - - #[test] - fn cancel_mix_of_tasks() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); - file0.persist().unwrap(); - let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); - file1.persist().unwrap(); - let (file2, documents_count2) = sample_documents(&index_scheduler, 2, 2); - file2.persist().unwrap(); - - let to_enqueue = [ - replace_document_import_task("catto", None, 0, documents_count0), - replace_document_import_task("beavero", None, 1, documents_count1), - replace_document_import_task("wolfo", None, 2, documents_count2), - ]; - for task in to_enqueue { - let _ = index_scheduler.register(task, None, false).unwrap(); - index_scheduler.assert_internally_consistent(); - } - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_processed"); - - handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - index_scheduler - .register( - KindWithContent::TaskCancelation { - query: "test_query".to_owned(), - tasks: RoaringBitmap::from_iter([0, 1, 2]), - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); - - handle.advance_till([AbortedIndexation]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "aborted_indexation"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); - } - - #[test] - fn test_document_replace() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - 
snapshot!(snapshot_index_scheduler(&index_scheduler)); - - // everything should be batched together. - handle.advance_n_successful_batches(1); - snapshot!(snapshot_index_scheduler(&index_scheduler)); - - // has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_update() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler)); - - // everything should be batched together. - handle.advance_n_successful_batches(1); - snapshot!(snapshot_index_scheduler(&index_scheduler)); - - // has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_mixed_document_addition() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for i in 0..10 { - let method = if i % 2 == 0 { UpdateDocuments } else { ReplaceDocuments }; - - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Only half of the task should've been processed since we can't autobatch replace and update together. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); - - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // has everything being pushed successfully in milli? 
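The verification block that follows, like the identical blocks in the earlier document tests, always does the same dance: open the index, collect the field ids, and convert every stored obkv document back to JSON before snapshotting it. A small helper could factor that repetition out; the sketch below only reuses the calls already visible in these tests and is not something the patch introduces:

    // Sketch of a shared helper for the repeated "dump all documents" check,
    // assuming the same imports as the tests above.
    fn all_documents_pretty(index_scheduler: &IndexScheduler, uid: &str) -> String {
        let index = index_scheduler.index(uid).unwrap();
        let rtxn = index.read_txn().unwrap();
        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
        let documents = index
            .all_documents(&rtxn)
            .unwrap()
            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
            .collect::<Vec<_>>();
        serde_json::to_string_pretty(&documents).unwrap()
    }

Each test could then end with `snapshot!(all_documents_pretty(&index_scheduler, "doggos"), name: "documents");`.
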
- let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_settings_update() { - use meilisearch_types::settings::{Settings, Unchecked}; - use milli::update::Setting; - - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let mut new_settings: Box> = Box::default(); - let mut embedders = BTreeMap::default(); - let embedding_settings = milli::vector::settings::EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), - api_key: Setting::Set(S("My super secret")), - url: Setting::Set(S("http://localhost:7777")), - dimensions: Setting::Set(4), - request: Setting::Set(serde_json::json!("{{text}}")), - response: Setting::Set(serde_json::json!("{{embedding}}")), - ..Default::default() - }; - embedders.insert(S("default"), Setting::Set(embedding_settings)); - new_settings.embedders = Setting::Set(embedders); - - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings, - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task"); - - { - let rtxn = index_scheduler.read_txn().unwrap(); - let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); - let task = meilisearch_types::task_view::TaskView::from_task(&task); - insta::assert_json_snapshot!(task.details); - } - - handle.advance_n_successful_batches(1); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed"); - - { - let rtxn = index_scheduler.read_txn().unwrap(); - let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); - let task = meilisearch_types::task_view::TaskView::from_task(&task); - insta::assert_json_snapshot!(task.details); - } - - // has everything being pushed successfully in milli? 
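For context on the embedder registered in `test_settings_update` above: the `SettingsUpdate` task it enqueues carries the same configuration a user would send through the settings route. An approximate client-side equivalent of that payload is sketched below (field names as documented for the `rest` embedder source; illustrative only, not taken from the test):

    // Roughly the JSON a client would send to the settings route to obtain
    // the embedder registered programmatically above.
    let _payload = serde_json::json!({
        "embedders": {
            "default": {
                "source": "rest",
                "url": "http://localhost:7777",
                "apiKey": "My super secret",
                "dimensions": 4,
                "request": "{{text}}",
                "response": "{{embedding}}"
            }
        }
    });
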
- let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - - let configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); - insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(config.embedder_options); - } - - #[test] - fn test_document_replace_without_autobatching() { - let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Nothing should be batched thus half of the tasks are processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); - - // Everything is processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_update_without_autobatching() { - let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Nothing should be batched thus half of the tasks are processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); - - // Everything is processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // has everything being pushed successfully in milli? 
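As in `test_document_replace_without_autobatching` just above, this test passes `false` to `IndexScheduler::test`, which disables autobatching: every registered task is processed in its own batch, hence the two `advance_n_successful_batches(5)` calls needed to drain ten tasks. A minimal sketch of the contrast with the autobatched variants (hypothetical setup only, not part of the patch):

    // Autobatching on: the ten additions registered above collapse into one batch,
    // so a single advance_n_successful_batches(1) drains the queue.
    let (_scheduler_auto, _handle_auto) = IndexScheduler::test(true, vec![]);
    // Autobatching off: one batch per task, so draining ten tasks takes ten batches.
    let (_scheduler_manual, _handle_manual) = IndexScheduler::test(false, vec![]);
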
- let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[macro_export] - macro_rules! debug_snapshot { - ($value:expr, @$snapshot:literal) => {{ - let value = format!("{:?}", $value); - meili_snap::snapshot!(value, @$snapshot); - }}; - } - - #[test] - fn simple_new() { - crate::IndexScheduler::test(true, vec![]); - } - - #[test] - fn query_tasks_from_and_limit() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - - handle.advance_n_successful_batches(3); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_all_tasks"); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let query = Query { limit: Some(0), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query { limit: Some(1), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - - let query = Query { limit: Some(2), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); - - let query = Query { from: Some(1), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); - - let query = Query { from: Some(2), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,]"); - - let query = Query { from: Some(1), limit: Some(1), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[1,]"); - - let query = Query { from: Some(1), limit: Some(2), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); - } - - #[test] - fn query_tasks_simple() { - let start_time = OffsetDateTime::now_utc(); - - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, 
FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - handle.advance_till([Start, BatchCreated]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - - let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[0,]"); // only the processing tasks in the first tick - - let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); // only the enqueued tasks in the first tick - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,]"); // both enqueued and processing tasks in the first tick - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - after_started_at: Some(start_time), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test, which should excludes the enqueued tasks - snapshot!(snapshot_bitmap(&tasks), @"[0,]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - before_started_at: Some(start_time), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes before the start of the test, which should excludes all of them - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - after_started_at: Some(start_time), - before_started_at: Some(start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test and before one minute after the start of the test, - // which should exclude the enqueued tasks and include the only processing task - snapshot!(snapshot_bitmap(&tasks), @"[0,]"); - - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - - let second_start_time = OffsetDateTime::now_utc(); - - let query = Query { - statuses: Some(vec![Status::Succeeded, 
Status::Processing]), - after_started_at: Some(start_time), - before_started_at: Some(start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test and before one minute after the start of the test, - // which should include all tasks - snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); - - let query = Query { - statuses: Some(vec![Status::Succeeded, Status::Processing]), - before_started_at: Some(start_time), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes before the start of the test, which should exclude all tasks - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the second part of the test and before one minute after the - // second start of the test, which should exclude all tasks - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // we run the same query to verify that, and indeed find that the last task is matched - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // enqueued, succeeded, or processing tasks started after the second part of the test, should - // again only return the last task - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - - handle.advance_till([ProcessBatchFailed, AfterProcessing]); - let rtxn = index_scheduler.read_txn().unwrap(); - - // now the last task should have failed - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "end"); - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // so running the last query should return nothing - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - 
.get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // but the same query on failed tasks should return the last task - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // but the same query on failed tasks should return the last task - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - uids: Some(vec![1]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // same query but with an invalid uid - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - uids: Some(vec![2]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // same query but with a valid uid - snapshot!(snapshot_bitmap(&tasks), @"[2,]"); - } - - #[test] - fn query_tasks_special_rules() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - handle.advance_till([Start, BatchCreated]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - - let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // only the first task associated with catto is returned, the indexSwap tasks are excluded! 
- snapshot!(snapshot_bitmap(&tasks), @"[0,]"); - - let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes( - &rtxn, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - .unwrap(); - // we have asked for only the tasks associated with catto, but are only authorized to retrieve the tasks - // associated with doggo -> empty result - snapshot!(snapshot_bitmap(&tasks), @"[]"); - - let query = Query::default(); - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes( - &rtxn, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - .unwrap(); - // we asked for all the tasks, but we are only authorized to retrieve the doggo tasks - // -> only the index creation of doggo should be returned - snapshot!(snapshot_bitmap(&tasks), @"[1,]"); - - let query = Query::default(); - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes( - &rtxn, - &query, - &AuthFilter::with_allowed_indexes( - vec![ - IndexUidPattern::new_unchecked("catto"), - IndexUidPattern::new_unchecked("doggo"), - ] - .into_iter() - .collect(), - ), - ) - .unwrap(); - // we asked for all the tasks, but we are only authorized to retrieve the doggo and catto tasks - // -> all tasks except the swap of catto with whalo are returned - snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); - - let query = Query::default(); - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // we asked for all the tasks with all index authorized -> all tasks returned - snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,3,]"); - } - - #[test] - fn query_tasks_canceled_by() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _ = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - - handle.advance_n_successful_batches(1); - let kind = KindWithContent::TaskCancelation { - query: "test_query".to_string(), - tasks: [0, 1, 2, 3].into_iter().collect(), - }; - let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); - handle.advance_n_successful_batches(1); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - let rtxn = index_scheduler.read_txn().unwrap(); - let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default()) - .unwrap(); - // 0 is not returned because it was not canceled, 3 is not returned because it is the uid of the - // taskCancelation itself - snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); - - let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; - let (tasks, _) = index_scheduler - .get_task_ids_from_authorized_indexes( - &rtxn, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - 
.unwrap(); - // Return only 1 because the user is not authorized to see task 2 - snapshot!(snapshot_bitmap(&tasks), @"[1,]"); - } - - #[test] - fn query_batches_from_and_limit() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let kind = index_creation_task("doggo", "bone"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - let kind = index_creation_task("whalo", "plankton"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); - let kind = index_creation_task("catto", "his_own_vomit"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); - - handle.advance_n_successful_batches(3); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_all_tasks"); - - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - let rtxn = index_scheduler.env.read_txn().unwrap(); - let query = Query { limit: Some(0), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[]"); - - let query = Query { limit: Some(1), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - - let query = Query { limit: Some(2), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[1,2,]"); - - let query = Query { from: Some(1), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); - - let query = Query { from: Some(2), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,1,2,]"); - - let query = Query { from: Some(1), limit: Some(1), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[1,]"); - - let query = Query { from: Some(1), limit: Some(2), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); - } - - #[test] - fn query_batches_simple() { - let start_time = OffsetDateTime::now_utc(); - - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - - 
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - handle.advance_till([Start, BatchCreated]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - - let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,]"); // only the processing batch in the first tick - - let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[]"); // The batches don't contains any enqueued tasks - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - snapshot!(snapshot_bitmap(&batches), @"[0,]"); // both enqueued and processing tasks in the first tick - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - after_started_at: Some(start_time), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test, which should excludes the enqueued tasks - snapshot!(snapshot_bitmap(&batches), @"[0,]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - before_started_at: Some(start_time), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes before the start of the test, which should excludes all of them - snapshot!(snapshot_bitmap(&batches), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Processing]), - after_started_at: Some(start_time), - before_started_at: Some(start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both enqueued and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test and before one minute after the start of the test, - // which should exclude the enqueued tasks and include the only processing task - snapshot!(snapshot_bitmap(&batches), @"[0,]"); - - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after-advancing-a-bit"); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - - let second_start_time = OffsetDateTime::now_utc(); - - let query = Query { - statuses: Some(vec![Status::Succeeded, Status::Processing]), - after_started_at: Some(start_time), - before_started_at: Some(start_time + 
Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the test and before one minute after the start of the test, - // which should include all tasks - snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); - - let query = Query { - statuses: Some(vec![Status::Succeeded, Status::Processing]), - before_started_at: Some(start_time), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes before the start of the test, which should exclude all tasks - snapshot!(snapshot_bitmap(&batches), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // both succeeded and processing tasks in the first tick, but limited to those with a started_at - // that comes after the start of the second part of the test and before one minute after the - // second start of the test, which should exclude all tasks - snapshot!(snapshot_bitmap(&batches), @"[]"); - - // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // we run the same query to verify that, and indeed find that the last task is matched - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // enqueued, succeeded, or processing tasks started after the second part of the test, should - // again only return the last task - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - - handle.advance_till([ProcessBatchFailed, AfterProcessing]); - let rtxn = index_scheduler.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - - // now the last task should have failed - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "end"); - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // so running the last query should return nothing - snapshot!(snapshot_bitmap(&batches), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - after_started_at: Some(second_start_time), - before_started_at: 
Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // but the same query on failed tasks should return the last task - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // but the same query on failed tasks should return the last task - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - uids: Some(vec![1]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // same query but with an invalid uid - snapshot!(snapshot_bitmap(&batches), @"[]"); - - let query = Query { - statuses: Some(vec![Status::Failed]), - uids: Some(vec![2]), - after_started_at: Some(second_start_time), - before_started_at: Some(second_start_time + Duration::minutes(1)), - ..Default::default() - }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // same query but with a valid uid - snapshot!(snapshot_bitmap(&batches), @"[2,]"); - } - - #[test] - fn query_batches_special_rules() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - handle.advance_till([Start, BatchCreated]); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - - let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // only the first task associated with catto is returned, the indexSwap tasks are excluded! 
- snapshot!(snapshot_bitmap(&batches), @"[0,]"); - - let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes( - &rtxn, - &proc, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - .unwrap(); - // we have asked for only the tasks associated with catto, but are only authorized to retrieve the tasks - // associated with doggo -> empty result - snapshot!(snapshot_bitmap(&batches), @"[]"); - - drop(rtxn); - // We're going to advance and process all the batches for the next query to actually hit the db - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - ]); - handle.advance_one_successful_batch(); - handle.advance_n_failed_batches(2); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after-processing-everything"); - let rtxn = index_scheduler.env.read_txn().unwrap(); - - let query = Query::default(); - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes( - &rtxn, - &proc, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - .unwrap(); - // we asked for all the tasks, but we are only authorized to retrieve the doggo tasks - // -> only the index creation of doggo should be returned - snapshot!(snapshot_bitmap(&batches), @"[1,]"); - - let query = Query::default(); - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes( - &rtxn, - &proc, - &query, - &AuthFilter::with_allowed_indexes( - vec![ - IndexUidPattern::new_unchecked("catto"), - IndexUidPattern::new_unchecked("doggo"), - ] - .into_iter() - .collect(), - ), - ) - .unwrap(); - // we asked for all the tasks, but we are only authorized to retrieve the doggo and catto tasks - // -> all tasks except the swap of catto with whalo are returned - snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); - - let query = Query::default(); - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // we asked for all the tasks with all index authorized -> all tasks returned - snapshot!(snapshot_bitmap(&batches), @"[0,1,2,3,]"); - } - - #[test] - fn query_batches_canceled_by() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - let _ = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _ = index_scheduler.register(kind, None, false).unwrap(); - let kind = KindWithContent::IndexSwap { - swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], - }; - let _task = index_scheduler.register(kind, None, false).unwrap(); - - handle.advance_n_successful_batches(1); - let kind = KindWithContent::TaskCancelation { - query: "test_query".to_string(), - tasks: [0, 1, 2, 3].into_iter().collect(), - }; - let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); - handle.advance_n_successful_batches(1); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); - - let rtxn = index_scheduler.read_txn().unwrap(); - let proc = index_scheduler.processing_tasks.read().unwrap().clone(); - let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; - let (batches, _) = 
index_scheduler - .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default()) - .unwrap(); - // The batch zero was the index creation task, the 1 is the task cancellation - snapshot!(snapshot_bitmap(&batches), @"[1,]"); - - let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; - let (batches, _) = index_scheduler - .get_batch_ids_from_authorized_indexes( - &rtxn, - &proc, - &query, - &AuthFilter::with_allowed_indexes( - vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), - ), - ) - .unwrap(); - // Return only 1 because the user is not authorized to see task 2 - snapshot!(snapshot_bitmap(&batches), @"[1,]"); - } - - #[test] - fn fail_in_process_batch_for_index_creation() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(1, FailureLocation::InsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); - - handle.advance_one_failed_batch(); - - // Still in the first iteration - assert_eq!(*index_scheduler.run_loop_iteration.read().unwrap(), 1); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "index_creation_failed"); - } - - #[test] - fn fail_in_process_batch_for_document_addition() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(1, FailureLocation::InsideProcessBatch)]); - - let content = r#" - { - "id": 1, - "doggo": "bob" - }"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - handle.advance_till([Start, BatchCreated]); - - snapshot!( - snapshot_index_scheduler(&index_scheduler), - name: "document_addition_batch_created" - ); - - handle.advance_till([ProcessBatchFailed, AfterProcessing]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "document_addition_failed"); - } - - #[test] - fn fail_in_update_task_after_process_batch_success_for_document_addition() { - let (index_scheduler, mut handle) = IndexScheduler::test( - true, - vec![(1, FailureLocation::UpdatingTaskAfterProcessBatchSuccess { task_uid: 0 })], - ); - - let content = r#" - { - "id": 1, - "doggo": "bob" - }"#; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - handle.advance_till([Start]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "document_addition_succeeded_but_index_scheduler_not_updated"); - - handle.advance_till([BatchCreated, InsideProcessBatch, ProcessBatchSucceeded]); - 
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_succeeded"); - - // At this point the next time the scheduler will try to progress it should encounter - // a critical failure and have to wait for 1s before retrying anything. - - let before_failure = Instant::now(); - handle.advance_till([Start]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_to_commit"); - let failure_duration = before_failure.elapsed(); - assert!(failure_duration.as_millis() >= 1000); - - handle.advance_till([ - BatchCreated, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - ]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_successfully_processed"); - } - - #[test] - fn test_document_addition_cant_create_index_without_index() { - // We're going to autobatch multiple document addition that don't have - // the right to create an index while there is no index currently. - // Thus, everything should be batched together and a IndexDoesNotExists - // error should be throwed. - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Everything should be batched together. - handle.advance_till([ - Start, - BatchCreated, - InsideProcessBatch, - ProcessBatchFailed, - AfterProcessing, - ]); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_10_tasks"); - - // The index should not exist. - snapshot!(matches!(index_scheduler.index_exists("doggos"), Ok(true)), @"false"); - } - - #[test] - fn test_document_addition_cant_create_index_without_index_without_autobatching() { - // We're going to execute multiple document addition that don't have - // the right to create an index while there is no index currently. - // Since the auto-batching is disabled, every task should be processed - // sequentially and throw an IndexDoesNotExists. - let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Nothing should be batched thus half of the tasks are processed. 
- handle.advance_n_failed_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); - - // Everything is processed. - handle.advance_n_failed_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // The index should not exist. - snapshot!(matches!(index_scheduler.index_exists("doggos"), Ok(true)), @"false"); - } - - #[test] - fn test_document_addition_cant_create_index_with_index() { - // We're going to autobatch multiple document addition that don't have - // the right to create an index while there is already an index. - // Thus, everything should be batched together and no error should be - // throwed. - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - // Create the index. - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Everything should be batched together. - handle.advance_n_successful_batches(1); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_10_tasks"); - - // Has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_cant_create_index_with_index_without_autobatching() { - // We're going to execute multiple document addition that don't have - // the right to create an index while there is no index currently. - // Since the autobatching is disabled, every tasks should be processed - // sequentially and throw an IndexDoesNotExists. - let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); - - // Create the index. 
- index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Nothing should be batched thus half of the tasks are processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); - - // Everything is processed. - handle.advance_n_successful_batches(5); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // Has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_mixed_rights_with_index() { - // We're going to autobatch multiple document addition. - // - The index already exists - // - The first document addition don't have the right to create an index - // can it batch with the other one? - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - // Create the index. - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, - None, - false, - ) - .unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - let allow_index_creation = i % 2 != 0; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // Everything should be batched together. 
- handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // Has everything being pushed successfully in milli? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_mixed_right_without_index_starts_with_cant_create() { - // We're going to autobatch multiple document addition. - // - The index does not exists - // - The first document addition don't have the right to create an index - // - The second do. They should not batch together. - // - The second should batch with everything else as it's going to create an index. - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for i in 0..10 { - let content = format!( - r#"{{ - "id": {}, - "doggo": "bob {}" - }}"#, - i, i - ); - let allow_index_creation = i % 2 != 0; - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(i).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - file.persist().unwrap(); - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); - - // A first batch should be processed with only the first documentAddition that's going to fail. - handle.advance_one_failed_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_failed"); - - // Everything else should be batched together. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); - - // Has everything being pushed successfully in milli? 
- let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_with_multiple_primary_key() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for (id, primary_key) in ["id", "bork", "bloup"].iter().enumerate() { - let content = format!( - r#"{{ - "id": {id}, - "doggo": "jean bob" - }}"#, - ); - let (uuid, mut file) = - index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_3_tasks"); - - // A first batch should be processed with only the first documentAddition. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_fails"); - - // Is the primary key still what we expect? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"id"); - - // Is the document still the one we expect?. 
- let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_with_multiple_primary_key_batch_wrong_key() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for (id, primary_key) in ["id", "bork", "bork"].iter().enumerate() { - let content = format!( - r#"{{ - "id": {id}, - "doggo": "jean bob" - }}"#, - ); - let (uuid, mut file) = - index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_3_tasks"); - - // A first batch should be processed with only the first documentAddition. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_and_third_tasks_fails"); - - // Is the primary key still what we expect? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"id"); - - // Is the document still the one we expect?. - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_with_bad_primary_key() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for (id, primary_key) in ["bork", "bork", "id", "bork", "id"].iter().enumerate() { - let content = format!( - r#"{{ - "id": {id}, - "doggo": "jean bob" - }}"#, - ); - let (uuid, mut file) = - index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S(primary_key)), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_5_tasks"); - - // A first batch should be processed with only the first two documentAddition. - // it should fails because the documents don't contains any `bork` field. 
- // NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_and_second_task_fails"); - - // The primary key should be set to none since we failed the batch. - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap(); - snapshot!(primary_key.is_none(), @"true"); - - // The second batch should succeed and only contain one task. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); - - // The primary key should be set to `id` since this batch succeeded. - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"id"); - - // We're trying to `bork` again, but now there is already a primary key set for this index. - // NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fourth_task_fails"); - - // Finally the last task should succeed since its primary key is the same as the valid one. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fifth_task_succeeds"); - - // Is the primary key still what we expect? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"id"); - - // Is the document still the one we expect? - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::<Vec<_>>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::<Vec<_>>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_with_set_and_null_primary_key() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for (id, primary_key) in - [None, Some("bork"), Some("paw"), None, None, Some("paw")].into_iter().enumerate() - { - let content = format!( - r#"{{ - "paw": {id}, - "doggo": "jean bob" - }}"#, - ); - let (uuid, mut file) = - index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_6_tasks"); - - // A first batch should contain only one task that fails because we can't infer the primary key. - // NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed. 
- handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_fails"); - - // The second batch should contain only one task that fails because `bork` is not a valid primary key. - // NOTE: it's marked as successful because the batch didn't fail, it's the individual tasks that failed. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); - - // No primary key should be set at this point. - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap(); - snapshot!(primary_key.is_none(), @"true"); - - // The third batch should succeed and only contain one task. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); - - // The primary key should be set to `paw` since this batch succeeded. - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"paw"); - - // We should be able to batch together the next two tasks that don't specify any primary key - // + the last task that matches the current primary-key. Everything should succeed. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_other_tasks_succeeds"); - - // Is the primary key still what we expect? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"paw"); - - // Is the document still the one we expect? - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::<Vec<_>>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::<Vec<_>>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn test_document_addition_with_set_and_null_primary_key_inference_works() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - for (id, primary_key) in [None, Some("bork"), Some("doggoid"), None, None, Some("doggoid")] - .into_iter() - .enumerate() - { - let content = format!( - r#"{{ - "doggoid": {id}, - "doggo": "jean bob" - }}"#, - ); - let (uuid, mut file) = - index_scheduler.create_update_file_with_uuid(id as u128).unwrap(); - let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: primary_key.map(|pk| pk.to_string()), - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - } - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_6_tasks"); - - // A first batch should contain only one task that succeeds and sets the primary key to `doggoid`. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_succeed"); - - // Checking the primary key. 
- let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap(); - snapshot!(primary_key.is_none(), @"false"); - - // The second batch should contains only one task that fails because it tries to update the primary key to `bork`. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); - - // The third batch should succeed and only contains one task. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); - - // We should be able to batch together the next two tasks that don't specify any primary key - // + the last task that matches the current primary-key. Everything should succeed. - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_other_tasks_succeeds"); - - // Is the primary key still what we expect? - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); - snapshot!(primary_key, @"doggoid"); - - // Is the document still the one we expect?. - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); - } - - #[test] - fn panic_in_process_batch_for_index_creation() { - let (index_scheduler, mut handle) = - IndexScheduler::test(true, vec![(1, FailureLocation::PanicInsideProcessBatch)]); - - let kind = index_creation_task("catto", "mouse"); - - let _task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); - - handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); - - // Still in the first iteration - assert_eq!(*index_scheduler.run_loop_iteration.read().unwrap(), 1); - // No matter what happens in process_batch, the index_scheduler should be internally consistent - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "index_creation_failed"); - } - - #[test] - fn test_task_queue_is_full() { - let (index_scheduler, mut handle) = - IndexScheduler::test_with_custom_config(vec![], |config| { - // that's the minimum map size possible - config.task_db_size = 1048576; - }); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - // on average this task takes ~600 bytes - loop { - let result = index_scheduler.register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ); - if result.is_err() { - break; - } - handle.advance_one_failed_batch(); - } - index_scheduler.assert_internally_consistent(); - - // at this point the task DB shoud have reached its limit and we should not be able to register new tasks - let result = index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap_err(); - snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. 
Please delete tasks to continue performing write operations."); - // we won't be able to test this error in an integration test, so as a best-effort test I still ensure the error returns the expected error code - snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - - // Even a task deletion that doesn't delete anything shouldn't be accepted - let result = index_scheduler - .register( - KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, - None, - false, - ) - .unwrap_err(); - snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); - // we won't be able to test this error in an integration test, so as a best-effort test I still ensure the error returns the expected error code - snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - - // But a task deletion that deletes something should work - index_scheduler - .register( - KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - // Now we should be able to enqueue a few tasks again - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_failed_batch(); - } - - #[test] - fn test_auto_deletion_of_tasks() { - let (index_scheduler, mut handle) = - IndexScheduler::test_with_custom_config(vec![], |config| { - config.max_number_of_tasks = 2; - }); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_failed_batch(); - - // at this point the max number of tasks is reached - // we can still enqueue multiple tasks - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); - drop(rtxn); - - // now we're above the max number of tasks - // and if we try to advance in the tick function a new task deletion should be enqueued - handle.advance_till([Start, BatchCreated]); - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_been_enqueued"); - drop(rtxn); - - handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); - let rtxn = index_scheduler.env.read_txn().unwrap(); - 
let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_been_processed"); - drop(rtxn); - - handle.advance_one_failed_batch(); - // a new task deletion has been enqueued - handle.advance_one_successful_batch(); - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "after_the_second_task_deletion"); - drop(rtxn); - - handle.advance_one_failed_batch(); - handle.advance_one_successful_batch(); - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed"); - drop(rtxn); - } - - #[test] - fn test_disable_auto_deletion_of_tasks() { - let (index_scheduler, mut handle) = - IndexScheduler::test_with_custom_config(vec![], |config| { - config.cleanup_enabled = false; - config.max_number_of_tasks = 2; - }); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - handle.advance_one_failed_batch(); - - // at this point the max number of tasks is reached - // we can still enqueue multiple tasks - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - index_scheduler - .register( - KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, - None, - false, - ) - .unwrap(); - - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); - drop(rtxn); - - // now we're above the max number of tasks - // and if we try to advance in the tick function no new task deletion should be enqueued - handle.advance_till([Start, BatchCreated]); - let rtxn = index_scheduler.env.read_txn().unwrap(); - let tasks = index_scheduler.get_task_ids(&rtxn, &Query { ..Default::default() }).unwrap(); - let tasks = index_scheduler.get_existing_tasks(&rtxn, tasks).unwrap(); - snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_not_been_enqueued"); - 
drop(rtxn); - } - - #[test] - fn basic_get_stats() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let kind = index_creation_task("catto", "mouse"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("doggo", "sheep"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - let kind = index_creation_task("whalo", "fish"); - let _task = index_scheduler.register(kind, None, false).unwrap(); - - snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" - { - "indexes": { - "catto": 1, - "doggo": 1, - "whalo": 1 - }, - "statuses": { - "canceled": 0, - "enqueued": 3, - "failed": 0, - "processing": 0, - "succeeded": 0 - }, - "types": { - "documentAdditionOrUpdate": 0, - "documentDeletion": 0, - "documentEdition": 0, - "dumpCreation": 0, - "indexCreation": 3, - "indexDeletion": 0, - "indexSwap": 0, - "indexUpdate": 0, - "settingsUpdate": 0, - "snapshotCreation": 0, - "taskCancelation": 0, - "taskDeletion": 0 - } - } - "###); - - handle.advance_till([Start, BatchCreated]); - snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" - { - "indexes": { - "catto": 1, - "doggo": 1, - "whalo": 1 - }, - "statuses": { - "canceled": 0, - "enqueued": 2, - "failed": 0, - "processing": 1, - "succeeded": 0 - }, - "types": { - "documentAdditionOrUpdate": 0, - "documentDeletion": 0, - "documentEdition": 0, - "dumpCreation": 0, - "indexCreation": 3, - "indexDeletion": 0, - "indexSwap": 0, - "indexUpdate": 0, - "settingsUpdate": 0, - "snapshotCreation": 0, - "taskCancelation": 0, - "taskDeletion": 0 - } - } - "###); - - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" - { - "indexes": { - "catto": 1, - "doggo": 1, - "whalo": 1 - }, - "statuses": { - "canceled": 0, - "enqueued": 1, - "failed": 0, - "processing": 1, - "succeeded": 1 - }, - "types": { - "documentAdditionOrUpdate": 0, - "documentDeletion": 0, - "documentEdition": 0, - "dumpCreation": 0, - "indexCreation": 3, - "indexDeletion": 0, - "indexSwap": 0, - "indexUpdate": 0, - "settingsUpdate": 0, - "snapshotCreation": 0, - "taskCancelation": 0, - "taskDeletion": 0 - } - } - "###); - - // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` - handle.advance_till([ - InsideProcessBatch, - InsideProcessBatch, - ProcessBatchSucceeded, - AfterProcessing, - Start, - BatchCreated, - ]); - snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###" - { - "indexes": { - "catto": 1, - "doggo": 1, - "whalo": 1 - }, - "statuses": { - "canceled": 0, - "enqueued": 0, - "failed": 0, - "processing": 1, - "succeeded": 2 - }, - "types": { - "documentAdditionOrUpdate": 0, - "documentDeletion": 0, - "documentEdition": 0, - "dumpCreation": 0, - "indexCreation": 3, - "indexDeletion": 0, - "indexSwap": 0, - "indexUpdate": 0, - "settingsUpdate": 0, - "snapshotCreation": 0, - "taskCancelation": 0, - "taskDeletion": 0 - } - } - "###); - } - - #[test] - fn cancel_processing_dump() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None }; - let dump_cancellation = KindWithContent::TaskCancelation { - query: "cancel dump".to_owned(), - tasks: RoaringBitmap::from_iter([0]), - }; - let _ = index_scheduler.register(dump_creation, None, 
false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); - handle.advance_till([Start, BatchCreated, InsideProcessBatch]); - - let _ = index_scheduler.register(dump_cancellation, None, false).unwrap(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); - - snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); - } - - #[test] - fn basic_set_taskid() { - let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); - - let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, None, false).unwrap(); - snapshot!(task.uid, @"0"); - - let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, Some(12), false).unwrap(); - snapshot!(task.uid, @"12"); - - let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let error = index_scheduler.register(kind, Some(5), false).unwrap_err(); - snapshot!(error, @"Received bad task id: 5 should be >= to 13."); - } - - #[test] - fn dry_run() { - let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); - - let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = index_scheduler.register(kind, None, true).unwrap(); - snapshot!(task.uid, @"0"); - snapshot!(snapshot_index_scheduler(&index_scheduler), @r" - ### Autobatching Enabled = true - ### Processing batch None: - [] - ---------------------------------------------------------------------- - ### All Tasks: - ---------------------------------------------------------------------- - ### Status: - ---------------------------------------------------------------------- - ### Kind: - ---------------------------------------------------------------------- - ### Index Tasks: - ---------------------------------------------------------------------- - ### Index Mapper: - - ---------------------------------------------------------------------- - ### Canceled By: - - ---------------------------------------------------------------------- - ### Enqueued At: - ---------------------------------------------------------------------- - ### Started At: - ---------------------------------------------------------------------- - ### Finished At: - ---------------------------------------------------------------------- - ### All Batches: - ---------------------------------------------------------------------- - ### Batch to tasks mapping: - ---------------------------------------------------------------------- - ### Batches Status: - ---------------------------------------------------------------------- - ### Batches Kind: - ---------------------------------------------------------------------- - ### Batches Index Tasks: - ---------------------------------------------------------------------- - ### Batches Enqueued At: - ---------------------------------------------------------------------- - ### Batches Started At: - ---------------------------------------------------------------------- - ### Batches Finished At: - ---------------------------------------------------------------------- - ### File Store: - - ---------------------------------------------------------------------- - "); - - let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; - let task = 
index_scheduler.register(kind, Some(12), true).unwrap(); - snapshot!(task.uid, @"12"); - snapshot!(snapshot_index_scheduler(&index_scheduler), @r" - ### Autobatching Enabled = true - ### Processing batch None: - [] - ---------------------------------------------------------------------- - ### All Tasks: - ---------------------------------------------------------------------- - ### Status: - ---------------------------------------------------------------------- - ### Kind: - ---------------------------------------------------------------------- - ### Index Tasks: - ---------------------------------------------------------------------- - ### Index Mapper: - - ---------------------------------------------------------------------- - ### Canceled By: - - ---------------------------------------------------------------------- - ### Enqueued At: - ---------------------------------------------------------------------- - ### Started At: - ---------------------------------------------------------------------- - ### Finished At: - ---------------------------------------------------------------------- - ### All Batches: - ---------------------------------------------------------------------- - ### Batch to tasks mapping: - ---------------------------------------------------------------------- - ### Batches Status: - ---------------------------------------------------------------------- - ### Batches Kind: - ---------------------------------------------------------------------- - ### Batches Index Tasks: - ---------------------------------------------------------------------- - ### Batches Enqueued At: - ---------------------------------------------------------------------- - ### Batches Started At: - ---------------------------------------------------------------------- - ### Batches Finished At: - ---------------------------------------------------------------------- - ### File Store: - - ---------------------------------------------------------------------- - "); - } - - #[test] - fn import_vectors() { - use meilisearch_types::settings::{Settings, Unchecked}; - use milli::update::Setting; - - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let mut new_settings: Box> = Box::default(); - let mut embedders = BTreeMap::default(); - let embedding_settings = milli::vector::settings::EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), - api_key: Setting::Set(S("My super secret")), - url: Setting::Set(S("http://localhost:7777")), - dimensions: Setting::Set(384), - request: Setting::Set(serde_json::json!("{{text}}")), - response: Setting::Set(serde_json::json!("{{embedding}}")), - ..Default::default() - }; - embedders.insert(S("A_fakerest"), Setting::Set(embedding_settings)); - - let embedding_settings = milli::vector::settings::EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), - model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), - revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), - document_template: Setting::Set(S("{{doc.doggo}} the {{doc.breed}} best doggo")), - ..Default::default() - }; - embedders.insert(S("B_small_hf"), Setting::Set(embedding_settings)); - - new_settings.embedders = Setting::Set(embedders); - - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings, - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - - 
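The settings updates registered throughout these tests (including the `Setting::Reset` used later to delete an embedder) rely on a three-state setting value: explicitly set, explicitly reset, or left untouched by a partial update. A minimal sketch of that pattern, assuming a simplified `Setting` enum rather than milli's actual type:

```rust
// Minimal sketch of the three-state setting pattern, assuming a simplified
// `Setting` enum (milli's real type carries more machinery around it).
#[derive(Clone, Debug, Default, PartialEq)]
enum Setting<T> {
    Set(T),
    Reset,
    #[default]
    NotSet,
}

impl<T: Clone> Setting<T> {
    /// Apply a partial update on top of the current value of a field.
    fn apply(&self, current: Option<T>) -> Option<T> {
        match self {
            Setting::Set(value) => Some(value.clone()),
            Setting::Reset => None,     // drop the field, e.g. delete an embedder
            Setting::NotSet => current, // leave the field untouched
        }
    }
}

fn main() {
    let current = Some("http://localhost:7777".to_string());
    assert_eq!(Setting::NotSet.apply(current.clone()), current);
    assert_eq!(Setting::<String>::Reset.apply(current.clone()), None);
    assert_eq!(
        Setting::Set("http://example.com".to_string()).apply(current),
        Some("http://example.com".to_string())
    );
}
```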
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); - - { - let rtxn = index_scheduler.read_txn().unwrap(); - let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); - let task = meilisearch_types::task_view::TaskView::from_task(&task); - insta::assert_json_snapshot!(task.details); - } - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); - - { - let rtxn = index_scheduler.read_txn().unwrap(); - let task = index_scheduler.get_task(&rtxn, 0).unwrap().unwrap(); - let task = meilisearch_types::task_view::TaskView::from_task(&task); - insta::assert_json_snapshot!(task.details); - } - - let (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) = { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - - let configs = index.embedding_configs(&rtxn).unwrap(); - // for consistency with the below - #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = - configs.get(0).unwrap(); - insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(fakerest_config.embedder_options); - let fakerest_name = name.clone(); - - let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = - configs.get(1).unwrap(); - insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(simple_hf_config.embedder_options); - let simple_hf_name = name.clone(); - - let configs = index_scheduler.embedders(configs).unwrap(); - let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); - let beagle_embed = - hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap(); - let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo"), None).unwrap(); - let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo"), None).unwrap(); - (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) - }; - - // add one doc, specifying vectors - - let doc = serde_json::json!( - { - "id": 0, - "doggo": "Intel", - "breed": "beagle", - "_vectors": { - &fakerest_name: { - // this will never trigger regeneration, which is good because we can't actually generate with - // this embedder - "regenerate": false, - "embeddings": beagle_embed, - }, - &simple_hf_name: { - // this will be regenerated on updates - "regenerate": true, - "embeddings": lab_embed, - }, - "noise": [0.1, 0.2, 0.3] - } - } - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); - let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: Some(S("id")), - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); - - handle.advance_one_successful_batch(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "adding Intel succeeds"); - - // check embeddings - { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - - // Ensure the document have been 
inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); - // for consistency with the below - #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); - insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); - insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - - let embeddings = index.embeddings(&rtxn, 0).unwrap(); - - assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); - - let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let doc = obkv_to_json( - &[ - fields_ids_map.id("doggo").unwrap(), - fields_ids_map.id("breed").unwrap(), - fields_ids_map.id("_vectors").unwrap(), - ], - &fields_ids_map, - doc, - ) - .unwrap(); - assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); - } - - // update the doc, specifying vectors - - let doc = serde_json::json!( - { - "id": 0, - "doggo": "kefir", - "breed": "patou", - } - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1u128).unwrap(); - let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); - assert_eq!(documents_count, 1); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: None, - method: UpdateDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); - - handle.advance_one_successful_batch(); - snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); - - { - // check embeddings - { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - - // Ensure the document have been inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); - // for consistency with the below - #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); - insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - - let IndexEmbeddingConfig { name, config: _, user_provided } = - configs.get(1).unwrap(); - insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - - let embeddings = index.embeddings(&rtxn, 0).unwrap(); - - // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); - // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); - - let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let doc = obkv_to_json( - &[ - fields_ids_map.id("doggo").unwrap(), - fields_ids_map.id("breed").unwrap(), - fields_ids_map.id("_vectors").unwrap(), - ], - &fields_ids_map, - doc, - ) - .unwrap(); - assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); - } 
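The `_vectors` payloads used above map each embedder name to either a bare array of floats or an object carrying `regenerate` and, optionally, user-provided `embeddings`; entries with `regenerate: true` are recomputed when the document changes, while `regenerate: false` entries are kept as provided. A hypothetical, simplified serde model of that shape (not milli's actual parser):

```rust
use serde::Deserialize;
use serde_json::json;

// Hypothetical, simplified model of a per-embedder `_vectors` entry as used in the
// documents above: either a bare array of floats, or an object with `regenerate`
// and optional user-provided `embeddings`.
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum VectorEntry {
    Explicit {
        // `regenerate: true` entries are recomputed when the document changes,
        // `regenerate: false` entries are stored verbatim and never regenerated.
        regenerate: bool,
        #[serde(default)]
        embeddings: Option<Vec<f32>>,
    },
    Implicit(Vec<f32>),
}

fn main() {
    let implicit: VectorEntry = serde_json::from_value(json!([0.1, 0.2, 0.3])).unwrap();
    let explicit: VectorEntry = serde_json::from_value(json!({
        "regenerate": false,
        "embeddings": [1.0, 1.0, 1.0],
    }))
    .unwrap();
    println!("{implicit:?}\n{explicit:?}");
}
```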
- } - } - - #[test] - fn import_vectors_first_and_embedder_later() { - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let content = serde_json::json!( - [ - { - "id": 0, - "doggo": "kefir", - }, - { - "id": 1, - "doggo": "intel", - "_vectors": { - "my_doggo_embedder": vec![1; 384], - "unknown embedder": vec![1, 2, 3], - } - }, - { - "id": 2, - "doggo": "max", - "_vectors": { - "my_doggo_embedder": { - "regenerate": false, - "embeddings": vec![2; 384], - }, - "unknown embedder": vec![4, 5], - }, - }, - { - "id": 3, - "doggo": "marcel", - "_vectors": { - "my_doggo_embedder": { - "regenerate": true, - "embeddings": vec![3; 384], - }, - }, - }, - { - "id": 4, - "doggo": "sora", - "_vectors": { - "my_doggo_embedder": { - "regenerate": true, - }, - }, - }, - ] - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); - let documents_count = - read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) - .unwrap(); - snapshot!(documents_count, @"5"); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: None, - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); - - let setting = meilisearch_types::settings::Settings:: { - embedders: Setting::Set(maplit::btreemap! 
{ - S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), - model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), - revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), - document_template: Setting::Set(S("{{doc.doggo}}")), - ..Default::default() - }) - }), - ..Default::default() - }; - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings: Box::new(setting), - is_deletion: false, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - index_scheduler.assert_internally_consistent(); - handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - // the all the vectors linked to the new specified embedder have been removed - // Only the unknown embedders stays in the document DB - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); - // even though we specified the vector for the ID 3, it shouldn't be marked - // as user provided since we explicitely marked it as NOT user provided. - snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "my_doggo_embedder", - config: EmbeddingConfig { - embedder_options: HuggingFace( - EmbedderOptions { - model: "sentence-transformers/all-MiniLM-L6-v2", - revision: Some( - "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - ), - distribution: None, - }, - ), - prompt: PromptData { - template: "{{doc.doggo}}", - max_bytes: Some( - 400, - ), - }, - quantized: None, - }, - user_provided: RoaringBitmap<[1, 2]>, - }, - ] - "###); - let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); - let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; - assert!(!embedding.is_empty(), "{embedding:?}"); - - // the document with the id 3 should keep its original embedding - let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); - let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embeddings = &embeddings["my_doggo_embedder"]; - - snapshot!(embeddings.len(), @"1"); - assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); - - // If we update marcel it should regenerate its embedding automatically - - let content = serde_json::json!( - [ - { - "id": 3, - "doggo": "marvel", - }, - { - "id": 4, - "doggo": "sorry", - }, - ] - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap(); - let documents_count = - read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) - .unwrap(); - snapshot!(documents_count, @"2"); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: None, - method: UpdateDocuments, - content_file: uuid, - documents_count, - 
allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - // the document with the id 3 should have its original embedding updated - let rtxn = index.read_txn().unwrap(); - let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); - let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; - let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); - snapshot!(json_string!(doc), @r###" - { - "id": 3, - "doggo": "marvel" - } - "###); - - let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; - - assert!(!embedding.is_empty()); - assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); - - // the document with the id 4 should generate an embedding - let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); - let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; - - assert!(!embedding.is_empty()); - } - - #[test] - fn delete_document_containing_vector() { - // 1. Add an embedder - // 2. Push two documents containing a simple vector - // 3. Delete the first document - // 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore - // 5. Clear the index - // 6. The user defined roaring bitmap shouldn't contains the id of the second document - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let setting = meilisearch_types::settings::Settings:: { - embedders: Setting::Set(maplit::btreemap! { - S("manual") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), - dimensions: Setting::Set(3), - ..Default::default() - }) - }), - ..Default::default() - }; - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings: Box::new(setting), - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - let content = serde_json::json!( - [ - { - "id": 0, - "doggo": "kefir", - "_vectors": { - "manual": vec![0, 0, 0], - } - }, - { - "id": 1, - "doggo": "intel", - "_vectors": { - "manual": vec![1, 1, 1], - } - }, - ] - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); - let documents_count = - read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) - .unwrap(); - snapshot!(documents_count, @"2"); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: None, - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - index_scheduler - .register( - KindWithContent::DocumentDeletion { - index_uid: S("doggos"), - documents_ids: vec![S("1")], - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); - let 
conf = index.embedding_configs(&rtxn).unwrap(); - snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), - }, - quantized: None, - }, - user_provided: RoaringBitmap<[0]>, - }, - ] - "###); - let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); - let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["manual"]; - assert!(!embedding.is_empty(), "{embedding:?}"); - - index_scheduler - .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) - .unwrap(); - handle.advance_one_successful_batch(); - - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); - let conf = index.embedding_configs(&rtxn).unwrap(); - snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), - }, - quantized: None, - }, - user_provided: RoaringBitmap<[]>, - }, - ] - "###); - } - - #[test] - fn delete_embedder_with_user_provided_vectors() { - // 1. Add two embedders - // 2. Push two documents containing a simple vector - // 3. The documents must not contain the vectors after the update as they are in the vectors db - // 3. Delete the embedders - // 4. The documents contain the vectors again - let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); - - let setting = meilisearch_types::settings::Settings:: { - embedders: Setting::Set(maplit::btreemap! 
{ - S("manual") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), - dimensions: Setting::Set(3), - ..Default::default() - }), - S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), - model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), - revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), - document_template: Setting::Set(S("{{doc.doggo}}")), - ..Default::default() - }), - }), - ..Default::default() - }; - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings: Box::new(setting), - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - let content = serde_json::json!( - [ - { - "id": 0, - "doggo": "kefir", - "_vectors": { - "manual": vec![0, 0, 0], - "my_doggo_embedder": vec![1; 384], - } - }, - { - "id": 1, - "doggo": "intel", - "_vectors": { - "manual": vec![1, 1, 1], - } - }, - ] - ); - - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); - let documents_count = - read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) - .unwrap(); - snapshot!(documents_count, @"2"); - file.persist().unwrap(); - - index_scheduler - .register( - KindWithContent::DocumentAdditionOrUpdate { - index_uid: S("doggos"), - primary_key: None, - method: ReplaceDocuments, - content_file: uuid, - documents_count, - allow_index_creation: false, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - - { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); - } - - { - let setting = meilisearch_types::settings::Settings:: { - embedders: Setting::Set(maplit::btreemap! 
{ - S("manual") => Setting::Reset, - }), - ..Default::default() - }; - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings: Box::new(setting), - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - } - - { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); - } - - { - let setting = meilisearch_types::settings::Settings:: { - embedders: Setting::Reset, - ..Default::default() - }; - index_scheduler - .register( - KindWithContent::SettingsUpdate { - index_uid: S("doggos"), - new_settings: Box::new(setting), - is_deletion: false, - allow_index_creation: true, - }, - None, - false, - ) - .unwrap(); - handle.advance_one_successful_batch(); - } - - { - let index = index_scheduler.index("doggos").unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); - let field_ids = field_ids_map.ids().collect::>(); - let documents = index - .all_documents(&rtxn) - .unwrap() - .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) - .collect::>(); - - // FIXME: redaction - snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); - } - } -} diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs new file mode 100644 index 000000000..fed26aeb7 --- /dev/null +++ b/crates/index-scheduler/src/processing.rs @@ -0,0 +1,289 @@ +use std::sync::Arc; + +use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView}; +use meilisearch_types::milli::{make_atomic_progress, make_enum_progress}; +use roaring::RoaringBitmap; + +use crate::utils::ProcessingBatch; + +#[derive(Clone, Default)] +pub struct ProcessingTasks { + pub batch: Option>, + /// The list of tasks ids that are currently running. + pub processing: Arc, + /// The progress on processing tasks + pub progress: Option, +} + +impl ProcessingTasks { + /// Creates an empty `ProcessingAt` struct. + pub fn new() -> ProcessingTasks { + ProcessingTasks::default() + } + + pub fn get_progress_view(&self) -> Option { + Some(self.progress.as_ref()?.as_progress_view()) + } + + /// Stores the currently processing tasks, and the date time at which it started. 
+ pub fn start_processing( + &mut self, + processing_batch: ProcessingBatch, + processing: RoaringBitmap, + ) -> Progress { + self.batch = Some(Arc::new(processing_batch)); + self.processing = Arc::new(processing); + let progress = Progress::default(); + progress.update_progress(BatchProgress::ProcessingTasks); + self.progress = Some(progress.clone()); + + progress + } + + /// Set the processing tasks to an empty list + pub fn stop_processing(&mut self) -> Self { + self.progress = None; + + Self { + batch: std::mem::take(&mut self.batch), + processing: std::mem::take(&mut self.processing), + progress: None, + } + } + + /// Returns `true` if there, at least, is one task that is currently processing that we must stop. + pub fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool { + !self.processing.is_disjoint(canceled_tasks) + } +} + +make_enum_progress! { + pub enum BatchProgress { + ProcessingTasks, + WritingTasksToDisk, + } +} + +make_enum_progress! { + pub enum TaskCancelationProgress { + RetrievingTasks, + UpdatingTasks, + } +} + +make_enum_progress! { + pub enum TaskDeletionProgress { + DeletingTasksDateTime, + DeletingTasksMetadata, + DeletingTasks, + DeletingBatches, + } +} + +make_enum_progress! { + pub enum SnapshotCreationProgress { + StartTheSnapshotCreation, + SnapshotTheIndexScheduler, + SnapshotTheUpdateFiles, + SnapshotTheIndexes, + SnapshotTheApiKeys, + CreateTheTarball, + } +} + +make_enum_progress! { + pub enum DumpCreationProgress { + StartTheDumpCreation, + DumpTheApiKeys, + DumpTheTasks, + DumpTheBatches, + DumpTheIndexes, + DumpTheExperimentalFeatures, + CompressTheDump, + } +} + +make_enum_progress! { + pub enum CreateIndexProgress { + CreatingTheIndex, + } +} + +make_enum_progress! { + pub enum UpdateIndexProgress { + UpdatingTheIndex, + } +} + +make_enum_progress! { + pub enum DeleteIndexProgress { + DeletingTheIndex, + } +} + +make_enum_progress! { + pub enum SwappingTheIndexes { + EnsuringCorrectnessOfTheSwap, + SwappingTheIndexes, + } +} + +make_enum_progress! { + pub enum InnerSwappingTwoIndexes { + RetrieveTheTasks, + UpdateTheTasks, + UpdateTheIndexesMetadata, + } +} + +make_enum_progress! { + pub enum DocumentOperationProgress { + RetrievingConfig, + ComputingDocumentChanges, + Indexing, + } +} + +make_enum_progress! { + pub enum DocumentEditionProgress { + RetrievingConfig, + ComputingDocumentChanges, + Indexing, + } +} + +make_enum_progress! { + pub enum DocumentDeletionProgress { + RetrievingConfig, + DeleteDocuments, + Indexing, + } +} + +make_enum_progress! 
{ + pub enum SettingsProgress { + RetrievingAndMergingTheSettings, + ApplyTheSettings, + } +} + +make_atomic_progress!(Task alias AtomicTaskStep => "task" ); +make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); +make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" ); + +#[cfg(test)] +mod test { + use std::sync::atomic::Ordering; + + use meili_snap::{json_string, snapshot}; + + use super::*; + + #[test] + fn one_level() { + let mut processing = ProcessingTasks::new(); + processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new()); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "processing tasks", + "finished": 0, + "total": 2 + } + ], + "percentage": 0.0 + } + "#); + processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "writing tasks to disk", + "finished": 1, + "total": 2 + } + ], + "percentage": 50.0 + } + "#); + } + + #[test] + fn task_progress() { + let mut processing = ProcessingTasks::new(); + processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new()); + let (atomic, tasks) = AtomicTaskStep::new(10); + processing.progress.as_ref().unwrap().update_progress(tasks); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "processing tasks", + "finished": 0, + "total": 2 + }, + { + "currentStep": "task", + "finished": 0, + "total": 10 + } + ], + "percentage": 0.0 + } + "#); + atomic.fetch_add(6, Ordering::Relaxed); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "processing tasks", + "finished": 0, + "total": 2 + }, + { + "currentStep": "task", + "finished": 6, + "total": 10 + } + ], + "percentage": 30.000002 + } + "#); + processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "writing tasks to disk", + "finished": 1, + "total": 2 + } + ], + "percentage": 50.0 + } + "#); + let (atomic, tasks) = AtomicTaskStep::new(5); + processing.progress.as_ref().unwrap().update_progress(tasks); + atomic.fetch_add(4, Ordering::Relaxed); + snapshot!(json_string!(processing.get_progress_view()), @r#" + { + "steps": [ + { + "currentStep": "writing tasks to disk", + "finished": 1, + "total": 2 + }, + { + "currentStep": "task", + "finished": 4, + "total": 5 + } + ], + "percentage": 90.0 + } + "#); + } +} diff --git a/crates/index-scheduler/src/queue/batches.rs b/crates/index-scheduler/src/queue/batches.rs new file mode 100644 index 000000000..785d24931 --- /dev/null +++ b/crates/index-scheduler/src/queue/batches.rs @@ -0,0 +1,603 @@ +use std::collections::HashSet; +use std::ops::{Bound, RangeBounds}; + +use meilisearch_types::batches::{Batch, BatchId}; +use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str}; +use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; +use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; +use meilisearch_types::tasks::{Kind, Status}; +use roaring::{MultiOps, RoaringBitmap}; +use time::OffsetDateTime; + +use super::{Query, Queue}; +use crate::processing::ProcessingTasks; +use crate::utils::{ + insert_task_datetime, keep_ids_within_datetimes, map_bound, + 
remove_n_tasks_datetime_earlier_than, remove_task_datetime, ProcessingBatch, +}; +use crate::{Error, Result, BEI128}; + +/// The number of databases used by the batch queue +const NUMBER_OF_DATABASES: u32 = 7; +/// Database const names for the `IndexScheduler`. +mod db_name { + pub const ALL_BATCHES: &str = "all-batches"; + + pub const BATCH_STATUS: &str = "batch-status"; + pub const BATCH_KIND: &str = "batch-kind"; + pub const BATCH_INDEX_TASKS: &str = "batch-index-tasks"; + pub const BATCH_ENQUEUED_AT: &str = "batch-enqueued-at"; + pub const BATCH_STARTED_AT: &str = "batch-started-at"; + pub const BATCH_FINISHED_AT: &str = "batch-finished-at"; +} + +pub struct BatchQueue { + /// Contains all the batches accessible by their Id. + pub(crate) all_batches: Database>, + + /// All the batches containing a task matching the selected status. + pub(crate) status: Database, RoaringBitmapCodec>, + /// All the batch ids grouped by the kind of their task. + pub(crate) kind: Database, RoaringBitmapCodec>, + /// Store the batches associated with an index. + pub(crate) index_tasks: Database, + /// Store the batches containing tasks which were enqueued at a specific date + pub(crate) enqueued_at: Database, + /// Store the batches containing finished tasks started at a specific date + pub(crate) started_at: Database, + /// Store the batches containing tasks finished at a specific date + pub(crate) finished_at: Database, +} + +impl BatchQueue { + pub(crate) fn private_clone(&self) -> BatchQueue { + BatchQueue { + all_batches: self.all_batches, + status: self.status, + kind: self.kind, + index_tasks: self.index_tasks, + enqueued_at: self.enqueued_at, + started_at: self.started_at, + finished_at: self.finished_at, + } + } + + pub(crate) const fn nb_db() -> u32 { + NUMBER_OF_DATABASES + } + + pub(super) fn new(env: &Env, wtxn: &mut RwTxn) -> Result { + Ok(Self { + all_batches: env.create_database(wtxn, Some(db_name::ALL_BATCHES))?, + status: env.create_database(wtxn, Some(db_name::BATCH_STATUS))?, + kind: env.create_database(wtxn, Some(db_name::BATCH_KIND))?, + index_tasks: env.create_database(wtxn, Some(db_name::BATCH_INDEX_TASKS))?, + enqueued_at: env.create_database(wtxn, Some(db_name::BATCH_ENQUEUED_AT))?, + started_at: env.create_database(wtxn, Some(db_name::BATCH_STARTED_AT))?, + finished_at: env.create_database(wtxn, Some(db_name::BATCH_FINISHED_AT))?, + }) + } + + pub(crate) fn all_batch_ids(&self, rtxn: &RoTxn) -> Result { + enum_iterator::all().map(|s| self.get_status(rtxn, s)).union() + } + + pub(crate) fn next_batch_id(&self, rtxn: &RoTxn) -> Result { + Ok(self + .all_batches + .remap_data_type::() + .last(rtxn)? + .map(|(k, _)| k + 1) + .unwrap_or_default()) + } + + pub(crate) fn get_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result> { + Ok(self.all_batches.get(rtxn, &batch_id)?) + } + + /// Returns the whole set of batches that belong to this index.
+ pub(crate) fn index_batches(&self, rtxn: &RoTxn, index: &str) -> Result { + Ok(self.index_tasks.get(rtxn, index)?.unwrap_or_default()) + } + + pub(crate) fn update_index( + &self, + wtxn: &mut RwTxn, + index: &str, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut batches = self.index_batches(wtxn, index)?; + f(&mut batches); + if batches.is_empty() { + self.index_tasks.delete(wtxn, index)?; + } else { + self.index_tasks.put(wtxn, index, &batches)?; + } + + Ok(()) + } + + pub(crate) fn get_status(&self, rtxn: &RoTxn, status: Status) -> Result { + Ok(self.status.get(rtxn, &status)?.unwrap_or_default()) + } + + pub(crate) fn put_status( + &self, + wtxn: &mut RwTxn, + status: Status, + bitmap: &RoaringBitmap, + ) -> Result<()> { + Ok(self.status.put(wtxn, &status, bitmap)?) + } + + pub(crate) fn update_status( + &self, + wtxn: &mut RwTxn, + status: Status, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut tasks = self.get_status(wtxn, status)?; + f(&mut tasks); + self.put_status(wtxn, status, &tasks)?; + + Ok(()) + } + + pub(crate) fn get_kind(&self, rtxn: &RoTxn, kind: Kind) -> Result { + Ok(self.kind.get(rtxn, &kind)?.unwrap_or_default()) + } + + pub(crate) fn put_kind( + &self, + wtxn: &mut RwTxn, + kind: Kind, + bitmap: &RoaringBitmap, + ) -> Result<()> { + Ok(self.kind.put(wtxn, &kind, bitmap)?) + } + + pub(crate) fn update_kind( + &self, + wtxn: &mut RwTxn, + kind: Kind, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut tasks = self.get_kind(wtxn, kind)?; + f(&mut tasks); + self.put_kind(wtxn, kind, &tasks)?; + Ok(()) + } + + pub(crate) fn write_batch(&self, wtxn: &mut RwTxn, batch: ProcessingBatch) -> Result<()> { + let old_batch = self.all_batches.get(wtxn, &batch.uid)?; + + self.all_batches.put( + wtxn, + &batch.uid, + &Batch { + uid: batch.uid, + progress: None, + details: batch.details, + stats: batch.stats, + started_at: batch.started_at, + finished_at: batch.finished_at, + enqueued_at: batch.enqueued_at, + }, + )?; + + // Update the statuses + if let Some(ref old_batch) = old_batch { + for status in old_batch.stats.status.keys() { + self.update_status(wtxn, *status, |bitmap| { + bitmap.remove(batch.uid); + })?; + } + } + for status in batch.statuses { + self.update_status(wtxn, status, |bitmap| { + bitmap.insert(batch.uid); + })?; + } + + // Update the kinds / types + if let Some(ref old_batch) = old_batch { + let kinds: HashSet<_> = old_batch.stats.types.keys().cloned().collect(); + for kind in kinds.difference(&batch.kinds) { + self.update_kind(wtxn, *kind, |bitmap| { + bitmap.remove(batch.uid); + })?; + } + } + for kind in batch.kinds { + self.update_kind(wtxn, kind, |bitmap| { + bitmap.insert(batch.uid); + })?; + } + + // Update the indexes + if let Some(ref old_batch) = old_batch { + let indexes: HashSet<_> = old_batch.stats.index_uids.keys().cloned().collect(); + for index in indexes.difference(&batch.indexes) { + self.update_index(wtxn, index, |bitmap| { + bitmap.remove(batch.uid); + })?; + } + } + for index in batch.indexes { + self.update_index(wtxn, &index, |bitmap| { + bitmap.insert(batch.uid); + })?; + } + + // Update the enqueued_at: we cannot retrieve the previous enqueued at from the previous batch, and + // must instead go through the db looking for it. We cannot look at the task contained in this batch either + // because they may have been removed. + // What we know, though, is that the task date is from before the enqueued_at, and max two timestamps have been written + // to the DB per batches. 
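The comment above explains why at most two `enqueued_at` timestamps are written per batch and why the previous batch's entries must be removed before re-writing it. A std-only sketch of such a timestamp-to-ids index, with hypothetical helpers standing in for `insert_task_datetime` and `remove_task_datetime` (this is not the heed-backed implementation):

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

// Illustrative sketch of the "datetime index" idea: batch ids are grouped under a
// timestamp key so that filters like `after_enqueued_at` become range scans.
type DatetimeIndex = BTreeMap<i128, Vec<u32>>;

fn insert_datetime(index: &mut DatetimeIndex, ts_nanos: i128, id: u32) {
    index.entry(ts_nanos).or_default().push(id);
}

fn remove_datetime(index: &mut DatetimeIndex, ts_nanos: i128, id: u32) {
    if let Some(ids) = index.get_mut(&ts_nanos) {
        ids.retain(|&i| i != id);
        if ids.is_empty() {
            index.remove(&ts_nanos);
        }
    }
}

fn ids_within(index: &DatetimeIndex, after: Bound<i128>, before: Bound<i128>) -> Vec<u32> {
    index.range((after, before)).flat_map(|(_, ids)| ids.iter().copied()).collect()
}

fn main() {
    let mut idx = DatetimeIndex::new();
    insert_datetime(&mut idx, 100, 0); // a batch's earliest task enqueue time
    insert_datetime(&mut idx, 250, 0); // and its latest one: at most two keys per batch
    insert_datetime(&mut idx, 300, 1);
    assert_eq!(ids_within(&idx, Bound::Excluded(150), Bound::Unbounded), vec![0, 1]);
    // Before re-writing batch 0, its previous timestamps must be removed.
    remove_datetime(&mut idx, 100, 0);
    remove_datetime(&mut idx, 250, 0);
    assert_eq!(ids_within(&idx, Bound::Unbounded, Bound::Unbounded), vec![1]);
}
```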
+ if let Some(ref old_batch) = old_batch { + if let Some(enqueued_at) = old_batch.enqueued_at { + remove_task_datetime(wtxn, self.enqueued_at, enqueued_at.earliest, old_batch.uid)?; + remove_task_datetime(wtxn, self.enqueued_at, enqueued_at.oldest, old_batch.uid)?; + } else { + // If we don't have the enqueued at in the batch it means the database comes from the v1.12 + // and we still need to find the date by scrolling the database + remove_n_tasks_datetime_earlier_than( + wtxn, + self.enqueued_at, + old_batch.started_at, + old_batch.stats.total_nb_tasks.clamp(1, 2) as usize, + old_batch.uid, + )?; + } + } + // A finished batch MUST contains at least one task and have an enqueued_at + let enqueued_at = batch.enqueued_at.as_ref().unwrap(); + insert_task_datetime(wtxn, self.enqueued_at, enqueued_at.earliest, batch.uid)?; + insert_task_datetime(wtxn, self.enqueued_at, enqueued_at.oldest, batch.uid)?; + + // Update the started at and finished at + if let Some(ref old_batch) = old_batch { + remove_task_datetime(wtxn, self.started_at, old_batch.started_at, old_batch.uid)?; + if let Some(finished_at) = old_batch.finished_at { + remove_task_datetime(wtxn, self.finished_at, finished_at, old_batch.uid)?; + } + } + insert_task_datetime(wtxn, self.started_at, batch.started_at, batch.uid)?; + insert_task_datetime(wtxn, self.finished_at, batch.finished_at.unwrap(), batch.uid)?; + + Ok(()) + } + + /// Convert an iterator to a `Vec` of batches. The batches MUST exist or a + /// `CorruptedTaskQueue` error will be thrown. + pub(crate) fn get_existing_batches( + &self, + rtxn: &RoTxn, + tasks: impl IntoIterator, + processing: &ProcessingTasks, + ) -> Result> { + tasks + .into_iter() + .map(|batch_id| { + if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) { + let mut batch = processing.batch.as_ref().unwrap().to_batch(); + batch.progress = processing.get_progress_view(); + Ok(batch) + } else { + self.get_batch(rtxn, batch_id) + .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) + } + }) + .collect::>() + } +} + +impl Queue { + /// Return the batch ids matched by the given query from the index scheduler's point of view. 
+ pub(crate) fn get_batch_ids( + &self, + rtxn: &RoTxn, + query: &Query, + processing: &ProcessingTasks, + ) -> Result { + let Query { + limit, + from, + reverse, + uids, + batch_uids, + statuses, + types, + index_uids, + canceled_by, + before_enqueued_at, + after_enqueued_at, + before_started_at, + after_started_at, + before_finished_at, + after_finished_at, + } = query; + + let mut batches = self.batches.all_batch_ids(rtxn)?; + if let Some(batch_id) = processing.batch.as_ref().map(|batch| batch.uid) { + batches.insert(batch_id); + } + + if let Some(from) = from { + let range = if reverse.unwrap_or_default() { + u32::MIN..*from + } else { + from.saturating_add(1)..u32::MAX + }; + batches.remove_range(range); + } + + if let Some(batch_uids) = &batch_uids { + let batches_uids = RoaringBitmap::from_iter(batch_uids); + batches &= batches_uids; + } + + if let Some(status) = &statuses { + let mut status_batches = RoaringBitmap::new(); + for status in status { + match status { + // special case for Processing batches + Status::Processing => { + if let Some(batch_id) = processing.batch.as_ref().map(|batch| batch.uid) { + status_batches.insert(batch_id); + } + } + // Enqueued tasks are not stored in batches + Status::Enqueued => (), + status => status_batches |= &self.batches.get_status(rtxn, *status)?, + }; + } + if !status.contains(&Status::Processing) { + if let Some(ref batch) = processing.batch { + batches.remove(batch.uid); + } + } + batches &= status_batches; + } + + if let Some(task_uids) = &uids { + let mut batches_by_task_uids = RoaringBitmap::new(); + for task_uid in task_uids { + if let Some(task) = self.tasks.get_task(rtxn, *task_uid)? { + if let Some(batch_uid) = task.batch_uid { + batches_by_task_uids.insert(batch_uid); + } + } + } + batches &= batches_by_task_uids; + } + + // There is no database for this query, we must retrieve the task queried by the client and ensure it's valid + if let Some(canceled_by) = &canceled_by { + let mut all_canceled_batches = RoaringBitmap::new(); + for cancel_uid in canceled_by { + if let Some(task) = self.tasks.get_task(rtxn, *cancel_uid)? { + if task.kind.as_kind() == Kind::TaskCancelation + && task.status == Status::Succeeded + { + if let Some(batch_uid) = task.batch_uid { + all_canceled_batches.insert(batch_uid); + } + } + } + } + + // if the canceled_by has been specified but no batch + // matches then we prefer matching zero than all batches. + if all_canceled_batches.is_empty() { + return Ok(RoaringBitmap::new()); + } else { + batches &= all_canceled_batches; + } + } + + if let Some(kind) = &types { + let mut kind_batches = RoaringBitmap::new(); + for kind in kind { + kind_batches |= self.batches.get_kind(rtxn, *kind)?; + if let Some(uid) = processing + .batch + .as_ref() + .and_then(|batch| batch.kinds.contains(kind).then_some(batch.uid)) + { + kind_batches.insert(uid); + } + } + batches &= &kind_batches; + } + + if let Some(index) = &index_uids { + let mut index_batches = RoaringBitmap::new(); + for index in index { + index_batches |= self.batches.index_batches(rtxn, index)?; + if let Some(uid) = processing + .batch + .as_ref() + .and_then(|batch| batch.indexes.contains(index).then_some(batch.uid)) + { + index_batches.insert(uid); + } + } + batches &= &index_batches; + } + + // For the started_at filter, we need to treat the part of the batches that are processing from the part of the + // batches that are not processing. 
The non-processing ones are filtered normally while the processing ones + // are entirely removed unless the in-memory startedAt variable falls within the date filter. + // Once we have filtered the two subsets, we put them back together and assign it back to `batches`. + batches = { + let (mut filtered_non_processing_batches, mut filtered_processing_batches) = + (&batches - &*processing.processing, &batches & &*processing.processing); + + // special case for Processing batches + // A closure that clears the filtered_processing_batches if their started_at date falls outside the given bounds + let mut clear_filtered_processing_batches = + |start: Bound, end: Bound| { + let start = map_bound(start, |b| b.unix_timestamp_nanos()); + let end = map_bound(end, |b| b.unix_timestamp_nanos()); + let is_within_dates = RangeBounds::contains( + &(start, end), + &processing + .batch + .as_ref() + .map_or_else(OffsetDateTime::now_utc, |batch| batch.started_at) + .unix_timestamp_nanos(), + ); + if !is_within_dates { + filtered_processing_batches.clear(); + } + }; + match (after_started_at, before_started_at) { + (None, None) => (), + (None, Some(before)) => { + clear_filtered_processing_batches(Bound::Unbounded, Bound::Excluded(*before)) + } + (Some(after), None) => { + clear_filtered_processing_batches(Bound::Excluded(*after), Bound::Unbounded) + } + (Some(after), Some(before)) => clear_filtered_processing_batches( + Bound::Excluded(*after), + Bound::Excluded(*before), + ), + }; + + keep_ids_within_datetimes( + rtxn, + &mut filtered_non_processing_batches, + self.batches.started_at, + *after_started_at, + *before_started_at, + )?; + filtered_non_processing_batches | filtered_processing_batches + }; + + keep_ids_within_datetimes( + rtxn, + &mut batches, + self.batches.enqueued_at, + *after_enqueued_at, + *before_enqueued_at, + )?; + + keep_ids_within_datetimes( + rtxn, + &mut batches, + self.batches.finished_at, + *after_finished_at, + *before_finished_at, + )?; + + if let Some(limit) = limit { + batches = if query.reverse.unwrap_or_default() { + batches.into_iter().take(*limit as usize).collect() + } else { + batches.into_iter().rev().take(*limit as usize).collect() + }; + } + + Ok(batches) + } + + /// Return the batch ids matching the query along with the total number of batches + /// by ignoring the from and limit parameters from the user's point of view. + /// + /// There are two differences between an internal query and a query executed by + /// the user. + /// + /// 1. IndexSwap tasks are not publicly associated with any index, but they are associated + /// with many indexes internally. + /// 2. The user may not have the rights to access the tasks (internally) associated with all indexes. + pub(crate) fn get_batch_ids_from_authorized_indexes( + &self, + rtxn: &RoTxn, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + processing: &ProcessingTasks, + ) -> Result<(RoaringBitmap, u64)> { + // compute all batches matching the filter by ignoring the limits, to find the number of batches matching + // the filter. + // As this causes us to compute the filter twice it is slightly inefficient, but doing it this way spares + // us from modifying the underlying implementation, and the performance remains sufficient. + // Should this change, we would modify `get_batch_ids` to directly return the number of matching batches. 
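As the comment above describes, the total is computed by running the same filter once without `from`/`limit` and counting the result, then running it again with pagination applied. A std-only sketch of that pattern, with illustrative names rather than the scheduler's API:

```rust
// Std-only sketch of the "filter twice" approach: the same predicate is evaluated
// once without pagination (to get the total) and once with `from`/`limit` applied.
fn filter_ids(
    ids: &[u32],
    pred: impl Fn(u32) -> bool,
    from: Option<u32>,
    limit: Option<usize>,
) -> Vec<u32> {
    let mut out: Vec<u32> = ids.iter().copied().filter(|&id| pred(id)).collect();
    if let Some(from) = from {
        out.retain(|&id| id <= from); // `from` starts the page at this id, going down
    }
    out.sort_unstable_by(|a, b| b.cmp(a)); // newest first, like the default order
    if let Some(limit) = limit {
        out.truncate(limit);
    }
    out
}

fn main() {
    let ids: Vec<u32> = (0..10).collect();
    let even = |id: u32| id % 2 == 0;
    let total = filter_ids(&ids, even, None, None).len(); // same filter, no pagination
    let page = filter_ids(&ids, even, Some(6), Some(2)); // paginated view
    assert_eq!(total, 5);
    assert_eq!(page, vec![6, 4]);
}
```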
+ let total_batches = + self.get_batch_ids(rtxn, &query.clone().without_limits(), processing)?; + let mut batches = self.get_batch_ids(rtxn, query, processing)?; + + // If the query contains a list of index uid or there is a finite list of authorized indexes, + // then we must exclude all the batches that only contains tasks associated to multiple indexes. + // This works because we don't autobatch tasks associated to multiple indexes with tasks associated + // to a single index. e.g: IndexSwap cannot be batched with IndexCreation. + if query.index_uids.is_some() || !filters.all_indexes_authorized() { + for kind in enum_iterator::all::().filter(|kind| !kind.related_to_one_index()) { + batches -= self.tasks.get_kind(rtxn, kind)?; + if let Some(batch) = processing.batch.as_ref() { + if batch.kinds.contains(&kind) { + batches.remove(batch.uid); + } + } + } + } + + // Any batch that is internally associated with at least one authorized index + // must be returned. + if !filters.all_indexes_authorized() { + let mut valid_indexes = RoaringBitmap::new(); + let mut forbidden_indexes = RoaringBitmap::new(); + + let all_indexes_iter = self.batches.index_tasks.iter(rtxn)?; + for result in all_indexes_iter { + let (index, index_tasks) = result?; + if filters.is_index_authorized(index) { + valid_indexes |= index_tasks; + } else { + forbidden_indexes |= index_tasks; + } + } + if let Some(batch) = processing.batch.as_ref() { + for index in &batch.indexes { + if filters.is_index_authorized(index) { + valid_indexes.insert(batch.uid); + } else { + forbidden_indexes.insert(batch.uid); + } + } + } + + // If a batch had ONE valid task then it should be returned + let invalid_batches = forbidden_indexes - valid_indexes; + + batches -= invalid_batches; + } + + Ok((batches, total_batches.len())) + } + + pub(crate) fn get_batches_from_authorized_indexes( + &self, + rtxn: &RoTxn, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + processing: &ProcessingTasks, + ) -> Result<(Vec, u64)> { + let (batches, total) = + self.get_batch_ids_from_authorized_indexes(rtxn, query, filters, processing)?; + let batches = if query.reverse.unwrap_or_default() { + Box::new(batches.into_iter()) as Box> + } else { + Box::new(batches.into_iter().rev()) as Box> + }; + + let batches = self.batches.get_existing_batches( + rtxn, + batches.take(query.limit.unwrap_or(u32::MAX) as usize), + processing, + )?; + + Ok((batches, total)) + } +} diff --git a/crates/index-scheduler/src/queue/batches_test.rs b/crates/index-scheduler/src/queue/batches_test.rs new file mode 100644 index 000000000..38e7ad800 --- /dev/null +++ b/crates/index-scheduler/src/queue/batches_test.rs @@ -0,0 +1,476 @@ +use meili_snap::snapshot; +use meilisearch_auth::AuthFilter; +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::tasks::{IndexSwap, KindWithContent, Status}; +use time::{Duration, OffsetDateTime}; + +use crate::insta_snapshot::{snapshot_bitmap, snapshot_index_scheduler}; +use crate::test_utils::Breakpoint::*; +use crate::test_utils::{index_creation_task, FailureLocation}; +use crate::{IndexScheduler, Query}; + +#[test] +fn query_batches_from_and_limit() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let kind = index_creation_task("doggo", "bone"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + let kind = index_creation_task("whalo", "plankton"); + let _task = 
index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + let kind = index_creation_task("catto", "his_own_vomit"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); + + handle.advance_n_successful_batches(3); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_all_tasks"); + + let proc = index_scheduler.processing_tasks.read().unwrap().clone(); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let query = Query { limit: Some(0), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[]"); + + let query = Query { limit: Some(1), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[2,]"); + + let query = Query { limit: Some(2), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[1,2,]"); + + let query = Query { from: Some(1), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); + + let query = Query { from: Some(2), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[0,1,2,]"); + + let query = Query { from: Some(1), limit: Some(1), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[1,]"); + + let query = Query { from: Some(1), limit: Some(2), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); +} + +#[test] +fn query_batches_simple() { + let start_time = OffsetDateTime::now_utc(); + + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("whalo", "fish"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + handle.advance_till([Start, BatchCreated]); + + let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; + let (mut batches, _) = index_scheduler + .get_batches_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + assert_eq!(batches.len(), 1); + batches[0].started_at = OffsetDateTime::UNIX_EPOCH; + assert!(batches[0].enqueued_at.is_some()); + batches[0].enqueued_at = None; + // Insta 
cannot snapshot our batches because the batch stats contains an enum as key: https://github.com/mitsuhiko/insta/issues/689 + let batch = serde_json::to_string_pretty(&batches[0]).unwrap(); + snapshot!(batch, @r#" + { + "uid": 0, + "details": { + "primaryKey": "mouse" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "processing": 1 + }, + "types": { + "indexCreation": 1 + }, + "indexUids": { + "catto": 1 + } + }, + "startedAt": "1970-01-01T00:00:00Z", + "finishedAt": null, + "enqueuedAt": null + } + "#); + + let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[]"); // The batches don't contains any enqueued tasks + + let query = + Query { statuses: Some(vec![Status::Enqueued, Status::Processing]), ..Default::default() }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + snapshot!(snapshot_bitmap(&batches), @"[0,]"); // both enqueued and processing tasks in the first tick + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + after_started_at: Some(start_time), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the test, which should excludes the enqueued tasks + snapshot!(snapshot_bitmap(&batches), @"[0,]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + before_started_at: Some(start_time), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes before the start of the test, which should excludes all of them + snapshot!(snapshot_bitmap(&batches), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + after_started_at: Some(start_time), + before_started_at: Some(start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the test and before one minute after the start of the test, + // which should exclude the enqueued tasks and include the only processing task + snapshot!(snapshot_bitmap(&batches), @"[0,]"); + + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after-advancing-a-bit"); + + let second_start_time = OffsetDateTime::now_utc(); + + let query = Query { + statuses: Some(vec![Status::Succeeded, Status::Processing]), + after_started_at: Some(start_time), + before_started_at: Some(start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and processing tasks in the first tick, but limited to those with a started_at 
+ // that comes after the start of the test and before one minute after the start of the test, + // which should include all tasks + snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); + + let query = Query { + statuses: Some(vec![Status::Succeeded, Status::Processing]), + before_started_at: Some(start_time), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and processing tasks in the first tick, but limited to those with a started_at + // that comes before the start of the test, which should exclude all tasks + snapshot!(snapshot_bitmap(&batches), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the second part of the test and before one minute after the + // second start of the test, which should exclude all tasks + snapshot!(snapshot_bitmap(&batches), @"[]"); + + // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // we run the same query to verify that, and indeed find that the last task is matched + snapshot!(snapshot_bitmap(&batches), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // enqueued, succeeded, or processing tasks started after the second part of the test, should + // again only return the last task + snapshot!(snapshot_bitmap(&batches), @"[2,]"); + + handle.advance_till([ProcessBatchFailed, AfterProcessing]); + + // now the last task should have failed + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "end"); + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // so running the last query should return nothing + snapshot!(snapshot_bitmap(&batches), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // but the same query on failed tasks should return the last task + snapshot!(snapshot_bitmap(&batches), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + 
.get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // but the same query on failed tasks should return the last task + snapshot!(snapshot_bitmap(&batches), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + uids: Some(vec![1]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // same query but with an invalid uid + snapshot!(snapshot_bitmap(&batches), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + uids: Some(vec![2]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // same query but with a valid uid + snapshot!(snapshot_bitmap(&batches), @"[2,]"); +} + +#[test] +fn query_batches_special_rules() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + handle.advance_till([Start, BatchCreated]); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap().clone(); + + let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + // only the first task associated with catto is returned, the indexSwap tasks are excluded! 
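The assertions around this point exercise the rules implemented earlier in `get_batch_ids_from_authorized_indexes`: batches made only of multi-index tasks are dropped when filtering by index uid, and with a restricted API key a batch survives only if at least one of its indexes is authorized. A rough standalone sketch of that second rule, using the `roaring` crate the scheduler already depends on (the index names and batch ids here are made up for the example):

```rust
use roaring::RoaringBitmap;

fn main() {
    // batch 0 -> catto, batch 1 -> doggo, batch 2 -> whalo, batch 3 -> catto + whalo (a swap)
    let index_to_batches = [
        ("catto", RoaringBitmap::from_iter([0u32, 3])),
        ("doggo", RoaringBitmap::from_iter([1u32])),
        ("whalo", RoaringBitmap::from_iter([2u32, 3])),
    ];
    let authorized = |index: &str| index == "catto" || index == "doggo";

    let mut valid = RoaringBitmap::new();
    let mut forbidden = RoaringBitmap::new();
    for (index, batches) in &index_to_batches {
        if authorized(index) {
            valid |= batches;
        } else {
            forbidden |= batches;
        }
    }

    // A batch with at least ONE authorized index is kept, so only the batches
    // made exclusively of forbidden indexes are removed.
    let invalid = forbidden - valid;
    let mut candidates = RoaringBitmap::from_iter(0u32..4);
    candidates -= invalid;
    assert_eq!(candidates.iter().collect::<Vec<_>>(), vec![0u32, 1, 3]);
}
```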
+ snapshot!(snapshot_bitmap(&batches), @"[0,]"); + + let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + &proc, + ) + .unwrap(); + // we have asked for only the tasks associated with catto, but are only authorized to retrieve the tasks + // associated with doggo -> empty result + snapshot!(snapshot_bitmap(&batches), @"[]"); + + drop(rtxn); + // We're going to advance and process all the batches for the next query to actually hit the db + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + ]); + handle.advance_one_successful_batch(); + handle.advance_n_failed_batches(2); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after-processing-everything"); + let rtxn = index_scheduler.env.read_txn().unwrap(); + + let query = Query::default(); + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + &proc, + ) + .unwrap(); + // we asked for all the tasks, but we are only authorized to retrieve the doggo tasks + // -> only the index creation of doggo should be returned + snapshot!(snapshot_bitmap(&batches), @"[1,]"); + + let query = Query::default(); + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![ + IndexUidPattern::new_unchecked("catto"), + IndexUidPattern::new_unchecked("doggo"), + ] + .into_iter() + .collect(), + ), + &proc, + ) + .unwrap(); + // we asked for all the tasks, but we are only authorized to retrieve the doggo and catto tasks + // -> all tasks except the swap of catto with whalo are returned + snapshot!(snapshot_bitmap(&batches), @"[0,1,]"); + + let query = Query::default(); + let (batches, _) = index_scheduler + .queue + .get_batch_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + // we asked for all the tasks with all index authorized -> all tasks returned + snapshot!(snapshot_bitmap(&batches), @"[0,1,2,3,]"); +} + +#[test] +fn query_batches_canceled_by() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _ = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _ = index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + + handle.advance_n_successful_batches(1); + let kind = KindWithContent::TaskCancelation { + query: "test_query".to_string(), + tasks: [0, 1, 2, 3].into_iter().collect(), + }; + let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); + handle.advance_n_successful_batches(1); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes(&query, &AuthFilter::default()) 
+ .unwrap(); + // Batch 0 was the index creation task; batch 1 is the task cancellation + snapshot!(snapshot_bitmap(&batches), @"[1,]"); + + let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; + let (batches, _) = index_scheduler + .get_batch_ids_from_authorized_indexes( + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + ) + .unwrap(); + // Return only 1 because the user is not authorized to see task 2 + snapshot!(snapshot_bitmap(&batches), @"[1,]"); +} diff --git a/crates/index-scheduler/src/queue/mod.rs b/crates/index-scheduler/src/queue/mod.rs new file mode 100644 index 000000000..b13e3ffe2 --- /dev/null +++ b/crates/index-scheduler/src/queue/mod.rs @@ -0,0 +1,391 @@ +mod batches; +#[cfg(test)] +mod batches_test; +mod tasks; +#[cfg(test)] +mod tasks_test; +#[cfg(test)] +mod test; + +use std::collections::BTreeMap; +use std::fs::File as StdFile; +use std::time::Duration; + +use file_store::FileStore; +use meilisearch_types::batches::BatchId; +use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; +use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32}; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use roaring::RoaringBitmap; +use time::format_description::well_known::Rfc3339; +use time::OffsetDateTime; +use uuid::Uuid; + +pub(crate) use self::batches::BatchQueue; +pub(crate) use self::tasks::TaskQueue; +use crate::processing::ProcessingTasks; +use crate::utils::{ + check_index_swap_validity, filter_out_references_to_newer_tasks, ProcessingBatch, +}; +use crate::{Error, IndexSchedulerOptions, Result, TaskId}; + +/// The number of databases used by the queue itself +const NUMBER_OF_DATABASES: u32 = 1; +/// Database const names for the `IndexScheduler`. +mod db_name { + pub const BATCH_TO_TASKS_MAPPING: &str = "batch-to-tasks-mapping"; +} + +/// Defines a subset of tasks to be retrieved from the [`IndexScheduler`]. +/// +/// An empty/default query (where each field is set to `None`) matches all tasks. +/// Each non-null field restricts the set of tasks further. +#[derive(Default, Debug, Clone, PartialEq, Eq)] +pub struct Query { + /// The maximum number of tasks to be matched + pub limit: Option<u32>, + /// The minimum [task id](`meilisearch_types::tasks::Task::uid`) to be matched + pub from: Option<TaskId>, + /// The order used to return the tasks. By default the newest tasks are returned first and the boolean is `false`. + pub reverse: Option<bool>, + /// The [task ids](`meilisearch_types::tasks::Task::uid`) to be matched + pub uids: Option<Vec<TaskId>>, + /// The [batch ids](`meilisearch_types::batches::Batch::uid`) to be matched + pub batch_uids: Option<Vec<BatchId>>, + /// The allowed [statuses](`meilisearch_types::tasks::Task::status`) of the matched tasks + pub statuses: Option<Vec<Status>>, + /// The allowed [kinds](meilisearch_types::tasks::Kind) of the matched tasks. + /// + /// The kind of a task is given by: + /// ``` + /// # use meilisearch_types::tasks::{Task, Kind}; + /// # fn doc_func(task: Task) -> Kind { + /// task.kind.as_kind() + /// # } + /// ``` + pub types: Option<Vec<Kind>>, + /// The allowed [index ids](meilisearch_types::tasks::Task::index_uid) of the matched tasks + pub index_uids: Option<Vec<String>>, + /// The [task ids](`meilisearch_types::tasks::Task::uid`) of the [`TaskCancelation`](meilisearch_types::tasks::Task::Kind::TaskCancelation) tasks + /// that canceled the matched tasks.
+ pub canceled_by: Option<Vec<TaskId>>, + /// Exclusive upper bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field. + pub before_enqueued_at: Option<OffsetDateTime>, + /// Exclusive lower bound of the matched tasks' [`enqueued_at`](meilisearch_types::tasks::Task::enqueued_at) field. + pub after_enqueued_at: Option<OffsetDateTime>, + /// Exclusive upper bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field. + pub before_started_at: Option<OffsetDateTime>, + /// Exclusive lower bound of the matched tasks' [`started_at`](meilisearch_types::tasks::Task::started_at) field. + pub after_started_at: Option<OffsetDateTime>, + /// Exclusive upper bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field. + pub before_finished_at: Option<OffsetDateTime>, + /// Exclusive lower bound of the matched tasks' [`finished_at`](meilisearch_types::tasks::Task::finished_at) field. + pub after_finished_at: Option<OffsetDateTime>, +} + +impl Query { + /// Return `true` if every field of the query is set to `None`, such that the query + /// matches all tasks. + pub fn is_empty(&self) -> bool { + matches!( + self, + Query { + limit: None, + from: None, + reverse: None, + uids: None, + batch_uids: None, + statuses: None, + types: None, + index_uids: None, + canceled_by: None, + before_enqueued_at: None, + after_enqueued_at: None, + before_started_at: None, + after_started_at: None, + before_finished_at: None, + after_finished_at: None, + } + ) + } + + /// Add an [index id](meilisearch_types::tasks::Task::index_uid) to the list of permitted indexes. + pub fn with_index(self, index_uid: String) -> Self { + let mut index_vec = self.index_uids.unwrap_or_default(); + index_vec.push(index_uid); + Self { index_uids: Some(index_vec), ..self } + } + + // Removes the `from` and `limit` restrictions from the query. + // Useful to get the total number of tasks matching a filter. + pub fn without_limits(self) -> Self { + Query { limit: None, from: None, ..self } + } +} + +/// Structure which holds meilisearch's indexes and schedules the tasks +/// to be performed on them. +pub struct Queue { + pub(crate) tasks: tasks::TaskQueue, + pub(crate) batches: batches::BatchQueue, + + /// Matches a batch id with the associated task ids. + pub(crate) batch_to_tasks_mapping: Database<BEU32, CboRoaringBitmapCodec>, + + /// The list of files referenced by the tasks. + pub(crate) file_store: FileStore, + + /// The max number of tasks allowed before the scheduler starts to delete + /// the finished tasks automatically. + pub(crate) max_number_of_tasks: usize, +} + +impl Queue { + pub(crate) fn private_clone(&self) -> Queue { + Queue { + tasks: self.tasks.private_clone(), + batches: self.batches.private_clone(), + batch_to_tasks_mapping: self.batch_to_tasks_mapping, + file_store: self.file_store.clone(), + max_number_of_tasks: self.max_number_of_tasks, + } + } + + pub(crate) const fn nb_db() -> u32 { + tasks::TaskQueue::nb_db() + batches::BatchQueue::nb_db() + NUMBER_OF_DATABASES + } + + /// Create an index scheduler and start its run loop. + pub(crate) fn new( + env: &Env<WithoutTls>, + wtxn: &mut RwTxn, + options: &IndexSchedulerOptions, + ) -> Result<Self> { + // allow unreachable_code to get rid of the warning in the case of a test build.
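`without_limits` exists for the pattern used by the batch and task getters earlier in this patch: run the same filter twice, once unbounded to learn the total number of matches, once with the caller's pagination to build the page. A standalone sketch of that idea, with plain integers instead of task ids and made-up function names:

```rust
// `min_uid` plays the role of a filter, `limit` the role of the user's pagination.
fn filter(ids: &[u32], min_uid: u32, limit: Option<usize>) -> Vec<u32> {
    let matching = ids.iter().copied().filter(|&id| id >= min_uid);
    match limit {
        Some(n) => matching.take(n).collect(),
        None => matching.collect(),
    }
}

fn main() {
    let ids: Vec<u32> = (0..10).collect();
    // The "without limits" pass only exists to report the total number of matches...
    let total = filter(&ids, 3, None).len();
    // ...while the limited pass is what the user actually receives.
    let page = filter(&ids, 3, Some(2));
    assert_eq!(total, 7);
    assert_eq!(page, vec![3u32, 4]);
}
```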
+ Ok(Self { + file_store: FileStore::new(&options.update_file_path)?, + batch_to_tasks_mapping: env + .create_database(wtxn, Some(db_name::BATCH_TO_TASKS_MAPPING))?, + tasks: TaskQueue::new(env, wtxn)?, + batches: BatchQueue::new(env, wtxn)?, + max_number_of_tasks: options.max_number_of_tasks, + }) + } + + /// Returns the whole set of tasks that belongs to this batch. + pub(crate) fn tasks_in_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result { + Ok(self.batch_to_tasks_mapping.get(rtxn, &batch_id)?.unwrap_or_default()) + } + + /// Convert an iterator to a `Vec` of tasks and edit the `ProcessingBatch` to add the given tasks. + /// + /// The tasks MUST exist, or a `CorruptedTaskQueue` error will be thrown. + pub(crate) fn get_existing_tasks_for_processing_batch( + &self, + rtxn: &RoTxn, + processing_batch: &mut ProcessingBatch, + tasks: impl IntoIterator, + ) -> Result> { + tasks + .into_iter() + .map(|task_id| { + let mut task = self + .tasks + .get_task(rtxn, task_id) + .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)); + processing_batch.processing(&mut task); + task + }) + .collect::>() + } + + pub(crate) fn write_batch( + &self, + wtxn: &mut RwTxn, + batch: ProcessingBatch, + tasks: &RoaringBitmap, + ) -> Result<()> { + self.batch_to_tasks_mapping.put(wtxn, &batch.uid, tasks)?; + self.batches.write_batch(wtxn, batch)?; + Ok(()) + } + + pub(crate) fn delete_persisted_task_data(&self, task: &Task) -> Result<()> { + match task.content_uuid() { + Some(content_file) => self.delete_update_file(content_file), + None => Ok(()), + } + } + + /// Open and returns the task's content File. + pub fn update_file(&self, uuid: Uuid) -> file_store::Result { + self.file_store.get_update(uuid) + } + + /// Delete a file from the index scheduler. + /// + /// Counterpart to the [`create_update_file`](IndexScheduler::create_update_file) method. + pub fn delete_update_file(&self, uuid: Uuid) -> Result<()> { + Ok(self.file_store.delete(uuid)?) + } + + /// Create a file and register it in the index scheduler. + /// + /// The returned file and uuid can be used to associate + /// some data to a task. The file will be kept until + /// the task has been fully processed. + pub fn create_update_file(&self, dry_run: bool) -> Result<(Uuid, file_store::File)> { + if dry_run { + Ok((Uuid::nil(), file_store::File::dry_file()?)) + } else { + Ok(self.file_store.new_update()?) + } + } + + #[cfg(test)] + pub fn create_update_file_with_uuid(&self, uuid: u128) -> Result<(Uuid, file_store::File)> { + Ok(self.file_store.new_update_with_uuid(uuid)?) + } + + /// The size on disk taken by all the updates files contained in the `IndexScheduler`, in bytes. + pub fn compute_update_file_size(&self) -> Result { + Ok(self.file_store.compute_total_size()?) 
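`get_existing_tasks_for_processing_batch` above relies on collecting an iterator of `Result`s, so a single missing task aborts the whole lookup with `CorruptedTaskQueue`. A minimal standalone sketch of that collection pattern; the error type and the `HashMap` store are stand-ins for the scheduler's own error and heed database:

```rust
use std::collections::HashMap;

#[derive(Debug, PartialEq)]
struct CorruptedQueue; // stand-in for Error::CorruptedTaskQueue

// Map each id to a Result and collect: `Result<Vec<_>, _>` short-circuits on
// the first missing id, mirroring the behaviour of the method above.
fn get_existing(ids: &[u32], store: &HashMap<u32, String>) -> Result<Vec<String>, CorruptedQueue> {
    ids.iter().map(|id| store.get(id).cloned().ok_or(CorruptedQueue)).collect()
}

fn main() {
    let store: HashMap<u32, String> =
        [(0, "first task".to_string()), (1, "second task".to_string())].into_iter().collect();
    assert!(get_existing(&[0, 1], &store).is_ok());
    assert_eq!(get_existing(&[0, 2], &store), Err(CorruptedQueue));
}
```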
+ } + + pub fn register( + &self, + wtxn: &mut RwTxn, + kind: &KindWithContent, + task_id: Option<TaskId>, + dry_run: bool, + ) -> Result<Task> { + let next_task_id = self.tasks.next_task_id(wtxn)?; + + if let Some(uid) = task_id { + if uid < next_task_id { + return Err(Error::BadTaskId { received: uid, expected: next_task_id }); + } + } + + let mut task = Task { + uid: task_id.unwrap_or(next_task_id), + // The batch is defined once we start processing the task + batch_uid: None, + enqueued_at: OffsetDateTime::now_utc(), + started_at: None, + finished_at: None, + error: None, + canceled_by: None, + details: kind.default_details(), + status: Status::Enqueued, + kind: kind.clone(), + }; + // For deletion and cancelation tasks, we want to make extra sure that they + // don't attempt to delete/cancel tasks that are newer than themselves. + filter_out_references_to_newer_tasks(&mut task); + // If the registered task is an index swap task, verify that it is well-formed + // (that it does not contain duplicate indexes). + check_index_swap_validity(&task)?; + + // At this point the task is going to be registered and no further checks will be done + if dry_run { + return Ok(task); + } + + // Get rid of the mutability. + let task = task; + self.tasks.register(wtxn, &task)?; + + Ok(task) + } + + /// Register a task to clean up the task queue if needed + pub fn cleanup_task_queue(&self, wtxn: &mut RwTxn) -> Result<()> { + let nb_tasks = self.tasks.all_task_ids(wtxn)?.len(); + // if we have fewer tasks than the configured maximum (1M by default), everything is fine + if nb_tasks < self.max_number_of_tasks as u64 { + return Ok(()); + } + + let finished = self.tasks.status.get(wtxn, &Status::Succeeded)?.unwrap_or_default() + | self.tasks.status.get(wtxn, &Status::Failed)?.unwrap_or_default() + | self.tasks.status.get(wtxn, &Status::Canceled)?.unwrap_or_default(); + + let to_delete = RoaringBitmap::from_iter(finished.into_iter().rev().take(100_000)); + + // /!\ the len must be at least 2 or else we might enter an infinite loop where we only delete + // the deletion tasks we enqueued ourselves. + if to_delete.len() < 2 { + tracing::warn!("The task queue is almost full, but no task can be deleted yet."); + // the only thing we can do is hope that the user tasks are going to finish + return Ok(()); + } + + tracing::info!( + "The task queue is almost full. Deleting the oldest {} finished tasks.", + to_delete.len() + ); + + // it's safe to unwrap here because we checked the len above + let newest_task_id = to_delete.iter().last().unwrap(); + let last_task_to_delete = + self.tasks.get_task(wtxn, newest_task_id)?.ok_or(Error::CorruptedTaskQueue)?; + + // bump the time by one nanosecond so that the enqueuedAt of the last task to delete is strictly lower than the `beforeEnqueuedAt` bound below, and that task is therefore matched too.
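The auto-cleanup above is gated by two checks: the queue must have reached `max_number_of_tasks`, and there must be at least two deletable tasks so a deletion task never ends up only deleting previously enqueued deletion tasks. A tiny, purely illustrative sketch of that gating logic (1M mirrors the default maximum mentioned in the comment):

```rust
// Purely illustrative: decide whether a TaskDeletion should be enqueued.
fn should_enqueue_cleanup(nb_tasks: u64, max_number_of_tasks: u64, nb_deletable: u64) -> bool {
    if nb_tasks < max_number_of_tasks {
        return false; // the queue is not full yet
    }
    // A cleanup that can only delete one task risks deleting nothing but
    // previously enqueued deletion tasks, looping forever.
    nb_deletable >= 2
}

fn main() {
    assert!(!should_enqueue_cleanup(10, 1_000_000, 10)); // queue far from full
    assert!(!should_enqueue_cleanup(1_000_000, 1_000_000, 1)); // nothing safe to delete
    assert!(should_enqueue_cleanup(1_000_000, 1_000_000, 100_000)); // cleanup enqueued
}
```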
+ let delete_before = last_task_to_delete.enqueued_at + Duration::from_nanos(1); + + self.register( + wtxn, + &KindWithContent::TaskDeletion { + query: format!( + "?beforeEnqueuedAt={}&statuses=succeeded,failed,canceled", + delete_before.format(&Rfc3339).map_err(|_| Error::CorruptedTaskQueue)?, + ), + tasks: to_delete, + }, + None, + false, + )?; + + Ok(()) + } + + pub fn get_stats( + &self, + rtxn: &RoTxn, + processing: &ProcessingTasks, + ) -> Result>> { + let mut res = BTreeMap::new(); + let processing_tasks = processing.processing.len(); + + res.insert( + "statuses".to_string(), + enum_iterator::all::() + .map(|s| { + let tasks = self.tasks.get_status(rtxn, s)?.len(); + match s { + Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)), + Status::Processing => Ok((s.to_string(), processing_tasks)), + s => Ok((s.to_string(), tasks)), + } + }) + .collect::>>()?, + ); + res.insert( + "types".to_string(), + enum_iterator::all::() + .map(|s| Ok((s.to_string(), self.tasks.get_kind(rtxn, s)?.len()))) + .collect::>>()?, + ); + res.insert( + "indexes".to_string(), + self.tasks + .index_tasks + .iter(rtxn)? + .map(|res| Ok(res.map(|(name, bitmap)| (name.to_string(), bitmap.len()))?)) + .collect::>>()?, + ); + + Ok(res) + } +} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_canceled_by/start.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_canceled_by/start.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_canceled_by/start.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_canceled_by/start.snap index ea3a75e8f..410f46929 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_canceled_by/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_canceled_by/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/processed_all_tasks.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/processed_all_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/processed_all_tasks.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/processed_all_tasks.snap index 9f5c7e4ad..27a641b59 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/processed_all_tasks.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/processed_all_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_first_task.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_first_task.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_first_task.snap index 64503a754..74c4c4a33 100644 --- 
a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_first_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_second_task.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_second_task.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_second_task.snap index 171f6dab4..411e82ea0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_second_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_third_task.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_third_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_third_task.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_third_task.snap index f811b99a6..4c76db95e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_third_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_from_and_limit/registered_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/after-advancing-a-bit.snap similarity index 95% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/after-advancing-a-bit.snap index 869e38e57..6d899b270 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/after-advancing-a-bit.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"doggo":2}}, } +{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, } 
---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/end.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/end.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/end.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/end.snap index cbb780494..314f5b067 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/end.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/end.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/start.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/start.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/start.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/start.snap index 78a6c4228..6dc897dfa 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_simple/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_simple/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/after-processing-everything.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/after-processing-everything.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/after-processing-everything.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/after-processing-everything.snap index 31a08e88b..f40322ac0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/after-processing-everything.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/after-processing-everything.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/start.snap b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/start.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/start.snap rename to crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/start.snap index 30f62c526..1184c197f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_special_rules/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/batches_test.rs/query_batches_special_rules/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/batches_test.rs snapshot_kind: text --- ### Autobatching 
Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_canceled_by/start.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_canceled_by/start.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_canceled_by/start.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_canceled_by/start.snap index ea3a75e8f..165d7c4fe 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_canceled_by/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_canceled_by/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/processed_all_tasks.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/processed_all_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/processed_all_tasks.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/processed_all_tasks.snap index 9f5c7e4ad..079972755 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/processed_all_tasks.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/processed_all_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_first_task.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_first_task.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_first_task.snap index 64503a754..4f9ffb209 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_first_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_second_task.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_second_task.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_second_task.snap index 171f6dab4..eb6b0e7ec 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_from_and_limit/registered_the_second_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- 
### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_third_task.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_third_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_third_task.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_third_task.snap index f811b99a6..181f0308c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_from_and_limit/registered_the_third_task.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_from_and_limit/registered_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/end.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/end.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/end.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/end.snap index cbb780494..3ed017700 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/end.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/end.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/start.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/start.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/start.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/start.snap index 78a6c4228..268f463aa 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_simple/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_special_rules/start.snap b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_special_rules/start.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/query_tasks_special_rules/start.snap rename to crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_special_rules/start.snap index 30f62c526..60c041c05 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/query_tasks_special_rules/start.snap +++ b/crates/index-scheduler/src/queue/snapshots/tasks_test.rs/query_tasks_special_rules/start.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/tasks_test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/register/everything_is_successfully_registered.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/register/everything_is_successfully_registered.snap similarity index 98% rename from 
crates/index-scheduler/src/snapshots/lib.rs/register/everything_is_successfully_registered.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/register/everything_is_successfully_registered.snap index 8341d947d..e4d9af541 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/register/everything_is_successfully_registered.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/register/everything_is_successfully_registered.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap similarity index 94% rename from crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap index 03213fbb0..30e8e17a8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/after_the_second_task_deletion.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap similarity index 91% rename from crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap index cc38f69a0..76f88a13f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/everything_has_been_processed.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap index 3400d8950..4e3fb5439 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_enqueued.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap 
b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap similarity index 95% rename from crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap index ab4210bed..4cabce94b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_deletion_have_been_processed.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap index 8b69b1cc2..5565994cb 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap index 8b69b1cc2..5565994cb 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_deletion_have_not_been_enqueued.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap b/crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap rename to crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap index 8b69b1cc2..5565994cb 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_auto_deletion_of_tasks/task_queue_is_full.snap +++ b/crates/index-scheduler/src/queue/snapshots/test.rs/test_disable_auto_deletion_of_tasks/task_queue_is_full.snap @@ -1,5 +1,6 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/queue/test.rs +snapshot_kind: text --- [ { diff --git 
a/crates/index-scheduler/src/queue/tasks.rs b/crates/index-scheduler/src/queue/tasks.rs new file mode 100644 index 000000000..74192232e --- /dev/null +++ b/crates/index-scheduler/src/queue/tasks.rs @@ -0,0 +1,541 @@ +use std::ops::{Bound, RangeBounds}; + +use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str}; +use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls}; +use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32}; +use meilisearch_types::tasks::{Kind, Status, Task}; +use roaring::{MultiOps, RoaringBitmap}; +use time::OffsetDateTime; + +use super::{Query, Queue}; +use crate::processing::ProcessingTasks; +use crate::utils::{ + self, insert_task_datetime, keep_ids_within_datetimes, map_bound, remove_task_datetime, +}; +use crate::{Error, Result, TaskId, BEI128}; + +/// The number of database used by the task queue +const NUMBER_OF_DATABASES: u32 = 8; +/// Database const names for the `IndexScheduler`. +mod db_name { + pub const ALL_TASKS: &str = "all-tasks"; + + pub const STATUS: &str = "status"; + pub const KIND: &str = "kind"; + pub const INDEX_TASKS: &str = "index-tasks"; + pub const CANCELED_BY: &str = "canceled_by"; + pub const ENQUEUED_AT: &str = "enqueued-at"; + pub const STARTED_AT: &str = "started-at"; + pub const FINISHED_AT: &str = "finished-at"; +} + +pub struct TaskQueue { + /// The main database, it contains all the tasks accessible by their Id. + pub(crate) all_tasks: Database>, + + /// All the tasks ids grouped by their status. + // TODO we should not be able to serialize a `Status::Processing` in this database. + pub(crate) status: Database, RoaringBitmapCodec>, + /// All the tasks ids grouped by their kind. + pub(crate) kind: Database, RoaringBitmapCodec>, + /// Store the tasks associated to an index. 
+ pub(crate) index_tasks: Database, + /// Store the tasks that were canceled by a task uid + pub(crate) canceled_by: Database, + /// Store the task ids of tasks which were enqueued at a specific date + pub(crate) enqueued_at: Database, + /// Store the task ids of finished tasks which started being processed at a specific date + pub(crate) started_at: Database, + /// Store the task ids of tasks which finished at a specific date + pub(crate) finished_at: Database, +} + +impl TaskQueue { + pub(crate) fn private_clone(&self) -> TaskQueue { + TaskQueue { + all_tasks: self.all_tasks, + status: self.status, + kind: self.kind, + index_tasks: self.index_tasks, + canceled_by: self.canceled_by, + enqueued_at: self.enqueued_at, + started_at: self.started_at, + finished_at: self.finished_at, + } + } + + pub(crate) const fn nb_db() -> u32 { + NUMBER_OF_DATABASES + } + + pub(crate) fn new(env: &Env, wtxn: &mut RwTxn) -> Result { + Ok(Self { + all_tasks: env.create_database(wtxn, Some(db_name::ALL_TASKS))?, + status: env.create_database(wtxn, Some(db_name::STATUS))?, + kind: env.create_database(wtxn, Some(db_name::KIND))?, + index_tasks: env.create_database(wtxn, Some(db_name::INDEX_TASKS))?, + canceled_by: env.create_database(wtxn, Some(db_name::CANCELED_BY))?, + enqueued_at: env.create_database(wtxn, Some(db_name::ENQUEUED_AT))?, + started_at: env.create_database(wtxn, Some(db_name::STARTED_AT))?, + finished_at: env.create_database(wtxn, Some(db_name::FINISHED_AT))?, + }) + } + + pub(crate) fn last_task_id(&self, rtxn: &RoTxn) -> Result> { + Ok(self.all_tasks.remap_data_type::().last(rtxn)?.map(|(k, _)| k + 1)) + } + + pub(crate) fn next_task_id(&self, rtxn: &RoTxn) -> Result { + Ok(self.last_task_id(rtxn)?.unwrap_or_default()) + } + + pub(crate) fn all_task_ids(&self, rtxn: &RoTxn) -> Result { + enum_iterator::all().map(|s| self.get_status(rtxn, s)).union() + } + + pub(crate) fn get_task(&self, rtxn: &RoTxn, task_id: TaskId) -> Result> { + Ok(self.all_tasks.get(rtxn, &task_id)?) 
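`last_task_id` and `next_task_id` above simply read the greatest key of `all_tasks` and add one. The sketch below reproduces that rule with a `BTreeMap` standing in for the heed database so it runs on its own:

```rust
use std::collections::BTreeMap;

// Next uid = greatest existing key + 1, or 0 for an empty queue.
fn next_task_id(all_tasks: &BTreeMap<u32, String>) -> u32 {
    all_tasks.keys().next_back().map(|last| last + 1).unwrap_or(0)
}

fn main() {
    let mut all_tasks = BTreeMap::new();
    assert_eq!(next_task_id(&all_tasks), 0);
    all_tasks.insert(0, "index creation".to_string());
    all_tasks.insert(7, "task deletion".to_string());
    assert_eq!(next_task_id(&all_tasks), 8);
}
```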
+ } + + pub(crate) fn update_task(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> { + let old_task = self.get_task(wtxn, task.uid)?.ok_or(Error::CorruptedTaskQueue)?; + let reprocessing = old_task.status != Status::Enqueued; + + debug_assert!(old_task != *task); + debug_assert_eq!(old_task.uid, task.uid); + + // If we're processing a task that failed it may already contains a batch_uid + debug_assert!( + reprocessing || (old_task.batch_uid.is_none() && task.batch_uid.is_some()), + "\n==> old: {old_task:?}\n==> new: {task:?}" + ); + + if old_task.status != task.status { + self.update_status(wtxn, old_task.status, |bitmap| { + bitmap.remove(task.uid); + })?; + self.update_status(wtxn, task.status, |bitmap| { + bitmap.insert(task.uid); + })?; + } + + if old_task.kind.as_kind() != task.kind.as_kind() { + self.update_kind(wtxn, old_task.kind.as_kind(), |bitmap| { + bitmap.remove(task.uid); + })?; + self.update_kind(wtxn, task.kind.as_kind(), |bitmap| { + bitmap.insert(task.uid); + })?; + } + + assert_eq!( + old_task.enqueued_at, task.enqueued_at, + "Cannot update a task's enqueued_at time" + ); + if old_task.started_at != task.started_at { + assert!( + reprocessing || old_task.started_at.is_none(), + "Cannot update a task's started_at time" + ); + if let Some(started_at) = old_task.started_at { + remove_task_datetime(wtxn, self.started_at, started_at, task.uid)?; + } + if let Some(started_at) = task.started_at { + insert_task_datetime(wtxn, self.started_at, started_at, task.uid)?; + } + } + if old_task.finished_at != task.finished_at { + assert!( + reprocessing || old_task.finished_at.is_none(), + "Cannot update a task's finished_at time" + ); + if let Some(finished_at) = old_task.finished_at { + remove_task_datetime(wtxn, self.finished_at, finished_at, task.uid)?; + } + if let Some(finished_at) = task.finished_at { + insert_task_datetime(wtxn, self.finished_at, finished_at, task.uid)?; + } + } + + self.all_tasks.put(wtxn, &task.uid, task)?; + Ok(()) + } + + /// Returns the whole set of tasks that belongs to this index. + pub(crate) fn index_tasks(&self, rtxn: &RoTxn, index: &str) -> Result { + Ok(self.index_tasks.get(rtxn, index)?.unwrap_or_default()) + } + + pub(crate) fn update_index( + &self, + wtxn: &mut RwTxn, + index: &str, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut tasks = self.index_tasks(wtxn, index)?; + f(&mut tasks); + if tasks.is_empty() { + self.index_tasks.delete(wtxn, index)?; + } else { + self.index_tasks.put(wtxn, index, &tasks)?; + } + + Ok(()) + } + + pub(crate) fn get_status(&self, rtxn: &RoTxn, status: Status) -> Result { + Ok(self.status.get(rtxn, &status)?.unwrap_or_default()) + } + + pub(crate) fn put_status( + &self, + wtxn: &mut RwTxn, + status: Status, + bitmap: &RoaringBitmap, + ) -> Result<()> { + Ok(self.status.put(wtxn, &status, bitmap)?) + } + + pub(crate) fn update_status( + &self, + wtxn: &mut RwTxn, + status: Status, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut tasks = self.get_status(wtxn, status)?; + f(&mut tasks); + self.put_status(wtxn, status, &tasks)?; + + Ok(()) + } + + pub(crate) fn get_kind(&self, rtxn: &RoTxn, kind: Kind) -> Result { + Ok(self.kind.get(rtxn, &kind)?.unwrap_or_default()) + } + + pub(crate) fn put_kind( + &self, + wtxn: &mut RwTxn, + kind: Kind, + bitmap: &RoaringBitmap, + ) -> Result<()> { + Ok(self.kind.put(wtxn, &kind, bitmap)?) 
+ } + + pub(crate) fn update_kind( + &self, + wtxn: &mut RwTxn, + kind: Kind, + f: impl Fn(&mut RoaringBitmap), + ) -> Result<()> { + let mut tasks = self.get_kind(wtxn, kind)?; + f(&mut tasks); + self.put_kind(wtxn, kind, &tasks)?; + + Ok(()) + } + + /// Convert an iterator to a `Vec` of tasks. The tasks MUST exist or a + /// `CorruptedTaskQueue` error will be thrown. + pub(crate) fn get_existing_tasks( + &self, + rtxn: &RoTxn, + tasks: impl IntoIterator, + ) -> Result> { + tasks + .into_iter() + .map(|task_id| { + self.get_task(rtxn, task_id).and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) + }) + .collect::>() + } + + pub(crate) fn register(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> { + self.all_tasks.put(wtxn, &task.uid, task)?; + + for index in task.indexes() { + self.update_index(wtxn, index, |bitmap| { + bitmap.insert(task.uid); + })?; + } + + self.update_status(wtxn, Status::Enqueued, |bitmap| { + bitmap.insert(task.uid); + })?; + + self.update_kind(wtxn, task.kind.as_kind(), |bitmap| { + bitmap.insert(task.uid); + })?; + + utils::insert_task_datetime(wtxn, self.enqueued_at, task.enqueued_at, task.uid)?; + + Ok(()) + } +} + +impl Queue { + /// Return the task ids matched by the given query from the index scheduler's point of view. + pub(crate) fn get_task_ids( + &self, + rtxn: &RoTxn, + query: &Query, + processing_tasks: &ProcessingTasks, + ) -> Result { + let ProcessingTasks { batch: processing_batch, processing: processing_tasks, progress: _ } = + processing_tasks; + let Query { + limit, + from, + reverse, + uids, + batch_uids, + statuses, + types, + index_uids, + canceled_by, + before_enqueued_at, + after_enqueued_at, + before_started_at, + after_started_at, + before_finished_at, + after_finished_at, + } = query; + + let mut tasks = self.tasks.all_task_ids(rtxn)?; + + if let Some(from) = from { + let range = if reverse.unwrap_or_default() { + u32::MIN..*from + } else { + from.saturating_add(1)..u32::MAX + }; + tasks.remove_range(range); + } + + if let Some(batch_uids) = batch_uids { + let mut batch_tasks = RoaringBitmap::new(); + for batch_uid in batch_uids { + if processing_batch.as_ref().is_some_and(|batch| batch.uid == *batch_uid) { + batch_tasks |= &**processing_tasks; + } else { + batch_tasks |= self.tasks_in_batch(rtxn, *batch_uid)?; + } + } + tasks &= batch_tasks; + } + + if let Some(status) = statuses { + let mut status_tasks = RoaringBitmap::new(); + for status in status { + match status { + // special case for Processing tasks + Status::Processing => { + status_tasks |= &**processing_tasks; + } + status => status_tasks |= &self.tasks.get_status(rtxn, *status)?, + }; + } + if !status.contains(&Status::Processing) { + tasks -= &**processing_tasks; + } + tasks &= status_tasks; + } + + if let Some(uids) = uids { + let uids = RoaringBitmap::from_iter(uids); + tasks &= &uids; + } + + if let Some(canceled_by) = canceled_by { + let mut all_canceled_tasks = RoaringBitmap::new(); + for cancel_task_uid in canceled_by { + if let Some(canceled_by_uid) = self.tasks.canceled_by.get(rtxn, cancel_task_uid)? { + all_canceled_tasks |= canceled_by_uid; + } + } + + // if the canceled_by has been specified but no task + // matches then we prefer matching zero than all tasks. 
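+            // For example, a query with `canceled_by: Some(vec![uid])` where `uid` never
+            // canceled anything yields an empty result for the whole query.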
+ if all_canceled_tasks.is_empty() { + return Ok(RoaringBitmap::new()); + } else { + tasks &= all_canceled_tasks; + } + } + + if let Some(kind) = types { + let mut kind_tasks = RoaringBitmap::new(); + for kind in kind { + kind_tasks |= self.tasks.get_kind(rtxn, *kind)?; + } + tasks &= &kind_tasks; + } + + if let Some(index) = index_uids { + let mut index_tasks = RoaringBitmap::new(); + for index in index { + index_tasks |= self.tasks.index_tasks(rtxn, index)?; + } + tasks &= &index_tasks; + } + + // For the started_at filter, we need to treat the part of the tasks that are processing from the part of the + // tasks that are not processing. The non-processing ones are filtered normally while the processing ones + // are entirely removed unless the in-memory startedAt variable falls within the date filter. + // Once we have filtered the two subsets, we put them back together and assign it back to `tasks`. + tasks = { + let (mut filtered_non_processing_tasks, mut filtered_processing_tasks) = + (&tasks - &**processing_tasks, &tasks & &**processing_tasks); + + // special case for Processing tasks + // A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds + let mut clear_filtered_processing_tasks = + |start: Bound, end: Bound| { + let start = map_bound(start, |b| b.unix_timestamp_nanos()); + let end = map_bound(end, |b| b.unix_timestamp_nanos()); + let is_within_dates = RangeBounds::contains( + &(start, end), + &processing_batch + .as_ref() + .map_or_else(OffsetDateTime::now_utc, |batch| batch.started_at) + .unix_timestamp_nanos(), + ); + if !is_within_dates { + filtered_processing_tasks.clear(); + } + }; + match (after_started_at, before_started_at) { + (None, None) => (), + (None, Some(before)) => { + clear_filtered_processing_tasks(Bound::Unbounded, Bound::Excluded(*before)) + } + (Some(after), None) => { + clear_filtered_processing_tasks(Bound::Excluded(*after), Bound::Unbounded) + } + (Some(after), Some(before)) => clear_filtered_processing_tasks( + Bound::Excluded(*after), + Bound::Excluded(*before), + ), + }; + + keep_ids_within_datetimes( + rtxn, + &mut filtered_non_processing_tasks, + self.tasks.started_at, + *after_started_at, + *before_started_at, + )?; + filtered_non_processing_tasks | filtered_processing_tasks + }; + + keep_ids_within_datetimes( + rtxn, + &mut tasks, + self.tasks.enqueued_at, + *after_enqueued_at, + *before_enqueued_at, + )?; + + keep_ids_within_datetimes( + rtxn, + &mut tasks, + self.tasks.finished_at, + *after_finished_at, + *before_finished_at, + )?; + + if let Some(limit) = limit { + tasks = if query.reverse.unwrap_or_default() { + tasks.into_iter().take(*limit as usize).collect() + } else { + tasks.into_iter().rev().take(*limit as usize).collect() + }; + } + + Ok(tasks) + } + + pub(crate) fn get_task_ids_from_authorized_indexes( + &self, + rtxn: &RoTxn, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + processing_tasks: &ProcessingTasks, + ) -> Result<(RoaringBitmap, u64)> { + // compute all tasks matching the filter by ignoring the limits, to find the number of tasks matching + // the filter. + // As this causes us to compute the filter twice it is slightly inefficient, but doing it this way spares + // us from modifying the underlying implementation, and the performance remains sufficient. + // Should this change, we would modify `get_task_ids` to directly return the number of matching tasks. 
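+        // Illustration: with ten tasks matching the filter and `limit: Some(3)`, the first
+        // (unlimited) pass yields all ten ids, so the returned total is 10, while the second
+        // pass applies the limit and keeps only the three task ids handed back to the caller.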
+ let total_tasks = + self.get_task_ids(rtxn, &query.clone().without_limits(), processing_tasks)?; + let mut tasks = self.get_task_ids(rtxn, query, processing_tasks)?; + + // If the query contains a list of index uid or there is a finite list of authorized indexes, + // then we must exclude all the kinds that aren't associated to one and only one index. + if query.index_uids.is_some() || !filters.all_indexes_authorized() { + for kind in enum_iterator::all::().filter(|kind| !kind.related_to_one_index()) { + tasks -= self.tasks.get_kind(rtxn, kind)?; + } + } + + // Any task that is internally associated with a non-authorized index + // must be discarded. + if !filters.all_indexes_authorized() { + let all_indexes_iter = self.tasks.index_tasks.iter(rtxn)?; + for result in all_indexes_iter { + let (index, index_tasks) = result?; + if !filters.is_index_authorized(index) { + tasks -= index_tasks; + } + } + } + + Ok((tasks, total_tasks.len())) + } + + pub(crate) fn get_tasks_from_authorized_indexes( + &self, + rtxn: &RoTxn, + query: &Query, + filters: &meilisearch_auth::AuthFilter, + processing_tasks: &ProcessingTasks, + ) -> Result<(Vec, u64)> { + let (tasks, total) = + self.get_task_ids_from_authorized_indexes(rtxn, query, filters, processing_tasks)?; + let tasks = if query.reverse.unwrap_or_default() { + Box::new(tasks.into_iter()) as Box> + } else { + Box::new(tasks.into_iter().rev()) as Box> + }; + let tasks = self + .tasks + .get_existing_tasks(rtxn, tasks.take(query.limit.unwrap_or(u32::MAX) as usize))?; + + let ProcessingTasks { batch, processing, progress: _ } = processing_tasks; + + let ret = tasks.into_iter(); + if processing.is_empty() || batch.is_none() { + Ok((ret.collect(), total)) + } else { + // Safe because we ensured there was a batch in the previous branch + let batch = batch.as_ref().unwrap(); + Ok(( + ret.map(|task| { + if processing.contains(task.uid) { + Task { + status: Status::Processing, + batch_uid: Some(batch.uid), + started_at: Some(batch.started_at), + ..task + } + } else { + task + } + }) + .collect(), + total, + )) + } + } +} diff --git a/crates/index-scheduler/src/queue/tasks_test.rs b/crates/index-scheduler/src/queue/tasks_test.rs new file mode 100644 index 000000000..d60d621d1 --- /dev/null +++ b/crates/index-scheduler/src/queue/tasks_test.rs @@ -0,0 +1,441 @@ +use meili_snap::snapshot; +use meilisearch_auth::AuthFilter; +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::tasks::{IndexSwap, KindWithContent, Status}; +use time::{Duration, OffsetDateTime}; + +use crate::insta_snapshot::{snapshot_bitmap, snapshot_index_scheduler}; +use crate::test_utils::Breakpoint::*; +use crate::test_utils::{index_creation_task, FailureLocation}; +use crate::{IndexScheduler, Query}; + +#[test] +fn query_tasks_from_and_limit() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let kind = index_creation_task("doggo", "bone"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + let kind = index_creation_task("whalo", "plankton"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + let kind = index_creation_task("catto", "his_own_vomit"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); + + 
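+    // Process three batches successfully so that every registered task reaches the
+    // `succeeded` status before the queries below are evaluated.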
handle.advance_n_successful_batches(3); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_all_tasks"); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let processing = index_scheduler.processing_tasks.read().unwrap(); + let query = Query { limit: Some(0), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query { limit: Some(1), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); + + let query = Query { limit: Some(2), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); + + let query = Query { from: Some(1), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); + + let query = Query { from: Some(2), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,]"); + + let query = Query { from: Some(1), limit: Some(1), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[1,]"); + + let query = Query { from: Some(1), limit: Some(2), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &processing) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); +} + +#[test] +fn query_tasks_simple() { + let start_time = OffsetDateTime::now_utc(); + + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("whalo", "fish"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + handle.advance_till([Start, BatchCreated]); + + let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[0,]"); // only the processing tasks in the first tick + + let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); // only the enqueued tasks in the first tick + + let query = + Query { statuses: Some(vec![Status::Enqueued, Status::Processing]), ..Default::default() }; + let (tasks, _) = index_scheduler + 
.get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,]"); // both enqueued and processing tasks in the first tick + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + after_started_at: Some(start_time), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the test, which should excludes the enqueued tasks + snapshot!(snapshot_bitmap(&tasks), @"[0,]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + before_started_at: Some(start_time), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes before the start of the test, which should excludes all of them + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Processing]), + after_started_at: Some(start_time), + before_started_at: Some(start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both enqueued and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the test and before one minute after the start of the test, + // which should exclude the enqueued tasks and include the only processing task + snapshot!(snapshot_bitmap(&tasks), @"[0,]"); + + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + + let second_start_time = OffsetDateTime::now_utc(); + + let query = Query { + statuses: Some(vec![Status::Succeeded, Status::Processing]), + after_started_at: Some(start_time), + before_started_at: Some(start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the test and before one minute after the start of the test, + // which should include all tasks + snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); + + let query = Query { + statuses: Some(vec![Status::Succeeded, Status::Processing]), + before_started_at: Some(start_time), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and processing tasks in the first tick, but limited to those with a started_at + // that comes before the start of the test, which should exclude all tasks + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // both succeeded and 
processing tasks in the first tick, but limited to those with a started_at + // that comes after the start of the second part of the test and before one minute after the + // second start of the test, which should exclude all tasks + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // we run the same query to verify that, and indeed find that the last task is matched + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Enqueued, Status::Succeeded, Status::Processing]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // enqueued, succeeded, or processing tasks started after the second part of the test, should + // again only return the last task + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); + + handle.advance_till([ProcessBatchFailed, AfterProcessing]); + + // now the last task should have failed + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "end"); + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // so running the last query should return nothing + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // but the same query on failed tasks should return the last task + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // but the same query on failed tasks should return the last task + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + uids: Some(vec![1]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // same query but with an invalid uid + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query { + statuses: Some(vec![Status::Failed]), + uids: Some(vec![2]), + after_started_at: Some(second_start_time), + before_started_at: Some(second_start_time + Duration::minutes(1)), + ..Default::default() + }; + let (tasks, _) = index_scheduler + .get_task_ids_from_authorized_indexes(&query, &AuthFilter::default()) + .unwrap(); + // same query but with a valid uid + snapshot!(snapshot_bitmap(&tasks), @"[2,]"); +} + +#[test] +fn query_tasks_special_rules() { + let (index_scheduler, mut 
handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "whalo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + handle.advance_till([Start, BatchCreated]); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + + let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + // only the first task associated with catto is returned, the indexSwap tasks are excluded! + snapshot!(snapshot_bitmap(&tasks), @"[0,]"); + + let query = Query { index_uids: Some(vec!["catto".to_owned()]), ..Default::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + &proc, + ) + .unwrap(); + // we have asked for only the tasks associated with catto, but are only authorized to retrieve the tasks + // associated with doggo -> empty result + snapshot!(snapshot_bitmap(&tasks), @"[]"); + + let query = Query::default(); + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + &proc, + ) + .unwrap(); + // we asked for all the tasks, but we are only authorized to retrieve the doggo tasks + // -> only the index creation of doggo should be returned + snapshot!(snapshot_bitmap(&tasks), @"[1,]"); + + let query = Query::default(); + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![ + IndexUidPattern::new_unchecked("catto"), + IndexUidPattern::new_unchecked("doggo"), + ] + .into_iter() + .collect(), + ), + &proc, + ) + .unwrap(); + // we asked for all the tasks, but we are only authorized to retrieve the doggo and catto tasks + // -> all tasks except the swap of catto with whalo are returned + snapshot!(snapshot_bitmap(&tasks), @"[0,1,]"); + + let query = Query::default(); + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + // we asked for all the tasks with all index authorized -> all tasks returned + snapshot!(snapshot_bitmap(&tasks), @"[0,1,2,3,]"); +} + +#[test] +fn query_tasks_canceled_by() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(3, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + let _ = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _ = 
index_scheduler.register(kind, None, false).unwrap(); + let kind = KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("catto".to_owned(), "doggo".to_owned()) }], + }; + let _task = index_scheduler.register(kind, None, false).unwrap(); + + handle.advance_n_successful_batches(1); + let kind = KindWithContent::TaskCancelation { + query: "test_query".to_string(), + tasks: [0, 1, 2, 3].into_iter().collect(), + }; + let task_cancelation = index_scheduler.register(kind, None, false).unwrap(); + handle.advance_n_successful_batches(1); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "start"); + + let rtxn = index_scheduler.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes(&rtxn, &query, &AuthFilter::default(), &proc) + .unwrap(); + // 0 is not returned because it was not canceled, 3 is not returned because it is the uid of the + // taskCancelation itself + snapshot!(snapshot_bitmap(&tasks), @"[1,2,]"); + + let query = Query { canceled_by: Some(vec![task_cancelation.uid]), ..Query::default() }; + let (tasks, _) = index_scheduler + .queue + .get_task_ids_from_authorized_indexes( + &rtxn, + &query, + &AuthFilter::with_allowed_indexes( + vec![IndexUidPattern::new_unchecked("doggo")].into_iter().collect(), + ), + &proc, + ) + .unwrap(); + // Return only 1 because the user is not authorized to see task 2 + snapshot!(snapshot_bitmap(&tasks), @"[1,]"); +} diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs new file mode 100644 index 000000000..3dbdd2db3 --- /dev/null +++ b/crates/index-scheduler/src/queue/test.rs @@ -0,0 +1,398 @@ +use big_s::S; +use meili_snap::{json_string, snapshot}; +use meilisearch_types::error::ErrorCode; +use meilisearch_types::tasks::{KindWithContent, Status}; +use roaring::RoaringBitmap; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::test_utils::Breakpoint::*; +use crate::test_utils::{index_creation_task, replace_document_import_task}; +use crate::{IndexScheduler, Query}; + +#[test] +fn register() { + // In this test, the handle doesn't make any progress, we only check that the tasks are registered + let (index_scheduler, mut _handle) = IndexScheduler::test(true, vec![]); + + let kinds = [ + index_creation_task("catto", "mouse"), + replace_document_import_task("catto", None, 0, 12), + replace_document_import_task("catto", None, 1, 50), + replace_document_import_task("doggo", Some("bone"), 2, 5000), + ]; + let (_, file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + file.persist().unwrap(); + let (_, file) = index_scheduler.queue.create_update_file_with_uuid(1).unwrap(); + file.persist().unwrap(); + let (_, file) = index_scheduler.queue.create_update_file_with_uuid(2).unwrap(); + file.persist().unwrap(); + + for (idx, kind) in kinds.into_iter().enumerate() { + let k = kind.as_kind(); + let task = index_scheduler.register(kind, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + + assert_eq!(task.uid, idx as u32); + assert_eq!(task.status, Status::Enqueued); + assert_eq!(task.kind.as_kind(), k); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_successfully_registered"); +} + +#[test] +fn dry_run() { + let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = 
KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None, true).unwrap(); + snapshot!(task.uid, @"0"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r" + ### Autobatching Enabled = true + ### Processing batch None: + [] + ---------------------------------------------------------------------- + ### All Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### All Batches: + ---------------------------------------------------------------------- + ### Batch to tasks mapping: + ---------------------------------------------------------------------- + ### Batches Status: + ---------------------------------------------------------------------- + ### Batches Kind: + ---------------------------------------------------------------------- + ### Batches Index Tasks: + ---------------------------------------------------------------------- + ### Batches Enqueued At: + ---------------------------------------------------------------------- + ### Batches Started At: + ---------------------------------------------------------------------- + ### Batches Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12), true).unwrap(); + snapshot!(task.uid, @"12"); + snapshot!(snapshot_index_scheduler(&index_scheduler), @r" + ### Autobatching Enabled = true + ### Processing batch None: + [] + ---------------------------------------------------------------------- + ### All Tasks: + ---------------------------------------------------------------------- + ### Status: + ---------------------------------------------------------------------- + ### Kind: + ---------------------------------------------------------------------- + ### Index Tasks: + ---------------------------------------------------------------------- + ### Index Mapper: + + ---------------------------------------------------------------------- + ### Canceled By: + + ---------------------------------------------------------------------- + ### Enqueued At: + ---------------------------------------------------------------------- + ### Started At: + ---------------------------------------------------------------------- + ### Finished At: + ---------------------------------------------------------------------- + ### All Batches: + ---------------------------------------------------------------------- + ### Batch to tasks mapping: + ---------------------------------------------------------------------- + ### Batches Status: + 
---------------------------------------------------------------------- + ### Batches Kind: + ---------------------------------------------------------------------- + ### Batches Index Tasks: + ---------------------------------------------------------------------- + ### Batches Enqueued At: + ---------------------------------------------------------------------- + ### Batches Started At: + ---------------------------------------------------------------------- + ### Batches Finished At: + ---------------------------------------------------------------------- + ### File Store: + + ---------------------------------------------------------------------- + "); +} + +#[test] +fn basic_set_taskid() { + let (index_scheduler, _handle) = IndexScheduler::test(true, vec![]); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(task.uid, @"0"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let task = index_scheduler.register(kind, Some(12), false).unwrap(); + snapshot!(task.uid, @"12"); + + let kind = KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }; + let error = index_scheduler.register(kind, Some(5), false).unwrap_err(); + snapshot!(error, @"Received bad task id: 5 should be >= to 13."); +} + +#[test] +fn test_disable_auto_deletion_of_tasks() { + let (index_scheduler, mut handle) = IndexScheduler::test_with_custom_config(vec![], |config| { + config.cleanup_enabled = false; + config.max_number_of_tasks = 2; + None + }); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_failed_batch(); + + // at this point the max number of tasks is reached + // we can still enqueue multiple tasks + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); + drop(rtxn); + drop(proc); + + // now we're above the max number of tasks + // and if we try to advance in the tick function no new task deletion should be enqueued + handle.advance_till([Start, BatchCreated]); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), 
name: "task_deletion_have_not_been_enqueued"); + drop(rtxn); + drop(proc); +} + +#[test] +fn test_auto_deletion_of_tasks() { + let (index_scheduler, mut handle) = IndexScheduler::test_with_custom_config(vec![], |config| { + config.max_number_of_tasks = 2; + None + }); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_failed_batch(); + + // at this point the max number of tasks is reached + // we can still enqueue multiple tasks + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]" }), name: "task_queue_is_full"); + drop(rtxn); + drop(proc); + + // now we're above the max number of tasks + // and if we try to advance in the tick function a new task deletion should be enqueued + handle.advance_till([Start, BatchCreated]); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_been_enqueued"); + drop(rtxn); + drop(proc); + + handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "task_deletion_have_been_processed"); + drop(rtxn); + drop(proc); + + handle.advance_one_failed_batch(); + // a new task deletion has been enqueued + handle.advance_one_successful_batch(); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: 
"after_the_second_task_deletion"); + drop(rtxn); + drop(proc); + + handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); + let rtxn = index_scheduler.env.read_txn().unwrap(); + let proc = index_scheduler.processing_tasks.read().unwrap(); + let tasks = + index_scheduler.queue.get_task_ids(&rtxn, &Query { ..Default::default() }, &proc).unwrap(); + let tasks = index_scheduler.queue.tasks.get_existing_tasks(&rtxn, tasks).unwrap(); + snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed"); + drop(rtxn); + drop(proc); +} + +#[test] +fn test_task_queue_is_full() { + let (index_scheduler, mut handle) = IndexScheduler::test_with_custom_config(vec![], |config| { + // that's the minimum map size possible + config.task_db_size = 1048576 * 3; + None + }); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + // on average this task takes ~600 bytes + loop { + let result = index_scheduler.register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ); + if result.is_err() { + break; + } + handle.advance_one_failed_batch(); + } + index_scheduler.assert_internally_consistent(); + + // at this point the task DB shoud have reached its limit and we should not be able to register new tasks + let result = index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + // we won't be able to test this error in an integration test thus as a best effort test IĀ still ensure the error return the expected error code + snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); + + // Even the task deletion that doesn't delete anything shouldn't be accepted + let result = index_scheduler + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + // we won't be able to test this error in an integration test thus as a best effort test IĀ still ensure the error return the expected error code + snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); + + // But a task deletion that delete something should works + index_scheduler + .register( + KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // Now we should be able to enqueue a few tasks again + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap(); + handle.advance_one_failed_batch(); +} diff --git a/crates/index-scheduler/src/scheduler/autobatcher.rs b/crates/index-scheduler/src/scheduler/autobatcher.rs new file mode 100644 index 000000000..605bf80dd --- /dev/null +++ b/crates/index-scheduler/src/scheduler/autobatcher.rs @@ -0,0 +1,474 @@ +/*! 
+The autobatcher is responsible for combining the next enqueued +tasks affecting a single index into a [batch](crate::batch::Batch). + +The main function of the autobatcher is [`next_autobatch`]. +*/ + +use meilisearch_types::tasks::TaskId; +use std::ops::ControlFlow::{self, Break, Continue}; + +use crate::KindWithContent; + +/// Succinctly describes a task's [`Kind`](meilisearch_types::tasks::Kind) +/// for the purpose of simplifying the implementation of the autobatcher. +/// +/// Only the non-prioritised tasks that can be grouped in a batch have a corresponding [`AutobatchKind`] +enum AutobatchKind { + DocumentImport { allow_index_creation: bool, primary_key: Option }, + DocumentEdition, + DocumentDeletion { by_filter: bool }, + DocumentClear, + Settings { allow_index_creation: bool }, + IndexCreation, + IndexDeletion, + IndexUpdate, + IndexSwap, +} + +impl AutobatchKind { + #[rustfmt::skip] + fn allow_index_creation(&self) -> Option { + match self { + AutobatchKind::DocumentImport { allow_index_creation, .. } + | AutobatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), + _ => None, + } + } + + fn primary_key(&self) -> Option> { + match self { + AutobatchKind::DocumentImport { primary_key, .. } => Some(primary_key.as_deref()), + _ => None, + } + } +} + +impl From for AutobatchKind { + fn from(kind: KindWithContent) -> Self { + match kind { + KindWithContent::DocumentAdditionOrUpdate { + allow_index_creation, primary_key, .. + } => AutobatchKind::DocumentImport { allow_index_creation, primary_key }, + KindWithContent::DocumentEdition { .. } => AutobatchKind::DocumentEdition, + KindWithContent::DocumentDeletion { .. } => { + AutobatchKind::DocumentDeletion { by_filter: false } + } + KindWithContent::DocumentClear { .. } => AutobatchKind::DocumentClear, + KindWithContent::DocumentDeletionByFilter { .. } => { + AutobatchKind::DocumentDeletion { by_filter: true } + } + KindWithContent::SettingsUpdate { allow_index_creation, is_deletion, .. } => { + AutobatchKind::Settings { + allow_index_creation: allow_index_creation && !is_deletion, + } + } + KindWithContent::IndexDeletion { .. } => AutobatchKind::IndexDeletion, + KindWithContent::IndexCreation { .. } => AutobatchKind::IndexCreation, + KindWithContent::IndexUpdate { .. } => AutobatchKind::IndexUpdate, + KindWithContent::IndexSwap { .. } => AutobatchKind::IndexSwap, + KindWithContent::TaskCancelation { .. } + | KindWithContent::TaskDeletion { .. } + | KindWithContent::DumpCreation { .. } + | KindWithContent::UpgradeDatabase { .. } + | KindWithContent::SnapshotCreation => { + panic!("The autobatcher should never be called with tasks that don't apply to an index.") + } + } + } +} + +#[derive(Debug)] +pub enum BatchKind { + DocumentClear { + ids: Vec, + }, + DocumentOperation { + allow_index_creation: bool, + primary_key: Option, + operation_ids: Vec, + }, + DocumentEdition { + id: TaskId, + }, + DocumentDeletion { + deletion_ids: Vec, + includes_by_filter: bool, + }, + ClearAndSettings { + other: Vec, + allow_index_creation: bool, + settings_ids: Vec, + }, + Settings { + allow_index_creation: bool, + settings_ids: Vec, + }, + IndexDeletion { + ids: Vec, + }, + IndexCreation { + id: TaskId, + }, + IndexUpdate { + id: TaskId, + }, + IndexSwap { + id: TaskId, + }, +} + +impl BatchKind { + #[rustfmt::skip] + fn allow_index_creation(&self) -> Option { + match self { + BatchKind::DocumentOperation { allow_index_creation, .. } + | BatchKind::ClearAndSettings { allow_index_creation, .. 
} + | BatchKind::Settings { allow_index_creation, .. } => Some(*allow_index_creation), + _ => None, + } + } + + fn primary_key(&self) -> Option> { + match self { + BatchKind::DocumentOperation { primary_key, .. } => Some(primary_key.as_deref()), + _ => None, + } + } +} + +impl BatchKind { + /// Returns a `ControlFlow::Break` if you must stop right now. + /// The boolean tell you if an index has been created by the batched task. + /// To ease the writing of the code. `true` can be returned when you don't need to create an index + /// but false can't be returned if you needs to create an index. + // TODO use an AutoBatchKind as input + pub fn new( + task_id: TaskId, + kind: KindWithContent, + primary_key: Option<&str>, + ) -> (ControlFlow, bool) { + use AutobatchKind as K; + + match AutobatchKind::from(kind) { + K::IndexCreation => (Break(BatchKind::IndexCreation { id: task_id }), true), + K::IndexDeletion => (Break(BatchKind::IndexDeletion { ids: vec![task_id] }), false), + K::IndexUpdate => (Break(BatchKind::IndexUpdate { id: task_id }), false), + K::IndexSwap => (Break(BatchKind::IndexSwap { id: task_id }), false), + K::DocumentClear => (Continue(BatchKind::DocumentClear { ids: vec![task_id] }), false), + K::DocumentImport { allow_index_creation, primary_key: pk } + if primary_key.is_none() || pk.is_none() || primary_key == pk.as_deref() => + { + ( + Continue(BatchKind::DocumentOperation { + allow_index_creation, + primary_key: pk, + operation_ids: vec![task_id], + }), + allow_index_creation, + ) + } + // if the primary key set in the task was different than ours we should stop and make this batch fail asap. + K::DocumentImport { allow_index_creation, primary_key } => ( + Break(BatchKind::DocumentOperation { + allow_index_creation, + primary_key, + operation_ids: vec![task_id], + }), + allow_index_creation, + ), + K::DocumentEdition => (Break(BatchKind::DocumentEdition { id: task_id }), false), + K::DocumentDeletion { by_filter: includes_by_filter } => ( + Continue(BatchKind::DocumentDeletion { + deletion_ids: vec![task_id], + includes_by_filter, + }), + false, + ), + K::Settings { allow_index_creation } => ( + Continue(BatchKind::Settings { allow_index_creation, settings_ids: vec![task_id] }), + allow_index_creation, + ), + } + } + + /// Returns a `ControlFlow::Break` if you must stop right now. + /// The boolean tell you if an index has been created by the batched task. + /// To ease the writing of the code. `true` can be returned when you don't need to create an index + /// but false can't be returned if you needs to create an index. + #[rustfmt::skip] + fn accumulate(self, id: TaskId, kind: AutobatchKind, index_already_exists: bool, primary_key: Option<&str>) -> ControlFlow { + use AutobatchKind as K; + + match (self, kind) { + // We don't batch any of these operations + (this, K::IndexCreation | K::IndexUpdate | K::IndexSwap | K::DocumentEdition) => Break(this), + // We must not batch tasks that don't have the same index creation rights if the index doesn't already exists. + (this, kind) if !index_already_exists && this.allow_index_creation() == Some(false) && kind.allow_index_creation() == Some(true) => { + Break(this) + }, + // NOTE: We need to negate the whole condition since we're checking if we need to break instead of continue. + // I wrote it this way because it's easier to understand than the other way around. + (this, kind) if !( + // 1. If both task don't interact with primary key -> we can continue + (this.primary_key().is_none() && kind.primary_key().is_none()) || + // 2. 
Else -> + ( + // 2.1 If we already have a primary-key -> + ( + primary_key.is_some() && + // 2.1.1 If the task we're trying to accumulate have a pk it must be equal to our primary key + // 2.1.2 If the task don't have a primary-key -> we can continue + kind.primary_key().is_none_or(|pk| pk == primary_key) + ) || + // 2.2 If we don't have a primary-key -> + ( + // 2.2.1 If both the batch and the task have a primary key they should be equal + // 2.2.2 If the batch is set to Some(None), the task should be too + // 2.2.3 If the batch is set to None -> we can continue + this.primary_key().zip(kind.primary_key()).map_or(true, |(this, kind)| this == kind) + ) + ) + + ) // closing the negation + + => { + Break(this) + }, + // The index deletion can batch with everything but must stop after + ( + BatchKind::DocumentClear { mut ids } + | BatchKind::DocumentDeletion { deletion_ids: mut ids, includes_by_filter: _ } + | BatchKind::DocumentOperation { allow_index_creation: _, primary_key: _, operation_ids: mut ids } + | BatchKind::Settings { allow_index_creation: _, settings_ids: mut ids }, + K::IndexDeletion, + ) => { + ids.push(id); + Break(BatchKind::IndexDeletion { ids }) + } + ( + BatchKind::ClearAndSettings { settings_ids: mut ids, allow_index_creation: _, mut other }, + K::IndexDeletion, + ) => { + ids.push(id); + ids.append(&mut other); + Break(BatchKind::IndexDeletion { ids }) + } + + ( + BatchKind::DocumentClear { mut ids }, + K::DocumentClear | K::DocumentDeletion { by_filter: _ }, + ) => { + ids.push(id); + Continue(BatchKind::DocumentClear { ids }) + } + ( + this @ BatchKind::DocumentClear { .. }, + K::DocumentImport { .. } | K::Settings { .. }, + ) => Break(this), + ( + BatchKind::DocumentOperation { allow_index_creation: _, primary_key: _, mut operation_ids }, + K::DocumentClear, + ) => { + operation_ids.push(id); + Continue(BatchKind::DocumentClear { ids: operation_ids }) + } + + // we can autobatch different kind of document operations and mix replacements with updates + ( + BatchKind::DocumentOperation { allow_index_creation, primary_key: _, mut operation_ids }, + K::DocumentImport { primary_key: pk, .. }, + ) => { + operation_ids.push(id); + Continue(BatchKind::DocumentOperation { + allow_index_creation, + operation_ids, + primary_key: pk, + }) + } + ( + BatchKind::DocumentOperation { allow_index_creation, primary_key, mut operation_ids }, + K::DocumentDeletion { by_filter: false }, + ) => { + operation_ids.push(id); + + Continue(BatchKind::DocumentOperation { + allow_index_creation, + primary_key, + operation_ids, + }) + } + // We can't batch a document operation with a delete by filter + ( + this @ BatchKind::DocumentOperation { .. }, + K::DocumentDeletion { by_filter: true }, + ) => { + Break(this) + } + ( + this @ BatchKind::DocumentOperation { .. }, + K::Settings { .. }, + ) => Break(this), + + (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: _ }, K::DocumentClear) => { + deletion_ids.push(id); + Continue(BatchKind::DocumentClear { ids: deletion_ids }) + } + // we can't autobatch the deletion and import if the document deletion contained a filter + ( + this @ BatchKind::DocumentDeletion { deletion_ids: _, includes_by_filter: true }, + K::DocumentImport { .. 
} + ) => Break(this), + // we can autobatch the deletion and import if the index already exists + ( + BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: false }, + K::DocumentImport { allow_index_creation, primary_key } + ) if index_already_exists => { + deletion_ids.push(id); + + Continue(BatchKind::DocumentOperation { + allow_index_creation, + primary_key, + operation_ids: deletion_ids, + }) + } + // we can autobatch the deletion and import if both can't create an index + ( + BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter: false }, + K::DocumentImport { allow_index_creation, primary_key } + ) if !allow_index_creation => { + deletion_ids.push(id); + + Continue(BatchKind::DocumentOperation { + allow_index_creation, + primary_key, + operation_ids: deletion_ids, + }) + } + // we can't autobatch a deletion and an import if the index does not exists but would be created by an addition + ( + this @ BatchKind::DocumentDeletion { .. }, + K::DocumentImport { .. } + ) => { + Break(this) + } + (BatchKind::DocumentDeletion { mut deletion_ids, includes_by_filter }, K::DocumentDeletion { by_filter }) => { + deletion_ids.push(id); + Continue(BatchKind::DocumentDeletion { deletion_ids, includes_by_filter: includes_by_filter | by_filter }) + } + (this @ BatchKind::DocumentDeletion { .. }, K::Settings { .. }) => Break(this), + + ( + BatchKind::Settings { settings_ids, allow_index_creation }, + K::DocumentClear, + ) => Continue(BatchKind::ClearAndSettings { + settings_ids, + allow_index_creation, + other: vec![id], + }), + ( + this @ BatchKind::Settings { .. }, + K::DocumentImport { .. } | K::DocumentDeletion { .. }, + ) => Break(this), + ( + BatchKind::Settings { mut settings_ids, allow_index_creation }, + K::Settings { .. }, + ) => { + settings_ids.push(id); + Continue(BatchKind::Settings { + allow_index_creation, + settings_ids, + }) + } + + ( + BatchKind::ClearAndSettings { mut other, settings_ids, allow_index_creation }, + K::DocumentClear, + ) => { + other.push(id); + Continue(BatchKind::ClearAndSettings { + other, + settings_ids, + allow_index_creation, + }) + } + (this @ BatchKind::ClearAndSettings { .. }, K::DocumentImport { .. }) => Break(this), + ( + BatchKind::ClearAndSettings { + mut other, + settings_ids, + allow_index_creation, + }, + K::DocumentDeletion { .. }, + ) => { + other.push(id); + Continue(BatchKind::ClearAndSettings { + other, + settings_ids, + allow_index_creation, + }) + } + ( + BatchKind::ClearAndSettings { mut settings_ids, other, allow_index_creation }, + K::Settings { .. }, + ) => { + settings_ids.push(id); + Continue(BatchKind::ClearAndSettings { + other, + settings_ids, + allow_index_creation, + }) + } + + ( + BatchKind::IndexCreation { .. } + | BatchKind::IndexDeletion { .. } + | BatchKind::IndexUpdate { .. } + | BatchKind::IndexSwap { .. } + | BatchKind::DocumentEdition { .. }, + _, + ) => { + unreachable!() + } + } + } +} + +/// Create a batch from an ordered list of tasks. +/// +/// ## Preconditions +/// 1. The tasks must be enqueued and given in the order in which they were enqueued +/// 2. The tasks must not be prioritised tasks (e.g. task cancellation, dump, snapshot, task deletion) +/// 3. The tasks must all be related to the same index +/// +/// ## Return +/// `None` if the list of tasks is empty. Otherwise, an [`AutoBatch`] that represents +/// a subset of the given tasks. 
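+///
+/// For example, three enqueued `DocumentAdditionOrUpdate` tasks targeting the same existing
+/// index accumulate into a single `BatchKind::DocumentOperation` with `operation_ids: [0, 1, 2]`;
+/// an `IndexDeletion` enqueued right after them is folded into the group and the batch closes
+/// as a `BatchKind::IndexDeletion`.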
+pub fn autobatch( + enqueued: Vec<(TaskId, KindWithContent)>, + index_already_exists: bool, + primary_key: Option<&str>, +) -> Option<(BatchKind, bool)> { + let mut enqueued = enqueued.into_iter(); + let (id, kind) = enqueued.next()?; + + // index_exist keeps track of whether the index should exist at this point, after the tasks we have batched so far. + let mut index_exist = index_already_exists; + + let (mut acc, must_create_index) = match BatchKind::new(id, kind, primary_key) { + (Continue(acc), create) => (acc, create), + (Break(acc), create) => return Some((acc, create)), + }; + + // if an index has been created in the previous step we can consider it as existing. + index_exist |= must_create_index; + + for (id, kind) in enqueued { + acc = match acc.accumulate(id, kind.into(), index_exist, primary_key) { + Continue(acc) => acc, + Break(acc) => return Some((acc, must_create_index)), + }; + } + + Some((acc, must_create_index)) +} diff --git a/crates/index-scheduler/src/scheduler/autobatcher_test.rs b/crates/index-scheduler/src/scheduler/autobatcher_test.rs new file mode 100644 index 000000000..486888cf5 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/autobatcher_test.rs @@ -0,0 +1,395 @@ +use meilisearch_types::milli::update::IndexDocumentsMethod::{ + self, ReplaceDocuments, UpdateDocuments, +}; +use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use uuid::Uuid; + +use self::autobatcher::{autobatch, BatchKind}; +use super::*; +use crate::TaskId; + +#[macro_export] +macro_rules! debug_snapshot { + ($value:expr, @$snapshot:literal) => {{ + let value = format!("{:?}", $value); + meili_snap::snapshot!(value, @$snapshot); + }}; + } + +fn autobatch_from( + index_already_exists: bool, + primary_key: Option<&str>, + input: impl IntoIterator<Item = KindWithContent>, +) -> Option<(BatchKind, bool)> { + autobatch( + input.into_iter().enumerate().map(|(id, kind)| (id as TaskId, kind)).collect(), + index_already_exists, + primary_key, + ) +} + +fn doc_imp( + method: IndexDocumentsMethod, + allow_index_creation: bool, + primary_key: Option<&str>, +) -> KindWithContent { + KindWithContent::DocumentAdditionOrUpdate { + index_uid: String::from("doggo"), + primary_key: primary_key.map(|pk| pk.to_string()), + method, + content_file: Uuid::new_v4(), + documents_count: 0, + allow_index_creation, + } +} + +fn doc_del() -> KindWithContent { + KindWithContent::DocumentDeletion { + index_uid: String::from("doggo"), + documents_ids: Vec::new(), + } +} + +fn doc_del_fil() -> KindWithContent { + KindWithContent::DocumentDeletionByFilter { + index_uid: String::from("doggo"), + filter_expr: serde_json::json!("cuteness > 100"), + } +} + +fn doc_clr() -> KindWithContent { + KindWithContent::DocumentClear { index_uid: String::from("doggo") } +} + +fn settings(allow_index_creation: bool) -> KindWithContent { + KindWithContent::SettingsUpdate { + index_uid: String::from("doggo"), + new_settings: Default::default(), + is_deletion: false, + allow_index_creation, + } +} + +fn idx_create() -> KindWithContent { + KindWithContent::IndexCreation { index_uid: String::from("doggo"), primary_key: None } +} + +fn idx_update() -> KindWithContent { + KindWithContent::IndexUpdate { index_uid: String::from("doggo"), primary_key: None } +} + +fn idx_del() -> KindWithContent { + KindWithContent::IndexDeletion { index_uid: String::from("doggo") } +} + +fn idx_swap() -> KindWithContent { + KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: (String::from("doggo"), String::from("catto")) }], + } +} + +#[test] +fn
autobatch_simple_operation_together() { + // we can autobatch one or multiple `ReplaceDocuments` together. + // if the index exists. + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp( ReplaceDocuments, false , None), doc_imp(ReplaceDocuments, false , None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); + + // if it doesn't exists. + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp( ReplaceDocuments, true , None), doc_imp(ReplaceDocuments, true , None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + + // we can autobatch one or multiple `UpdateDocuments` together. + // if the index exists. + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); + + // if it doesn't exists. 
+ debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1, 2] }, false))"); + + // we can autobatch one or multiple DocumentDeletion together + debug_snapshot!(autobatch_from(true, None, [doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_del(), doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del(), doc_del(), doc_del()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: false }, false))"); + + // we can autobatch one or multiple DocumentDeletionByFilter together + debug_snapshot!(autobatch_from(true, None, [doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_del_fil(), doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del_fil(), doc_del_fil(), doc_del_fil()]), @"Some((DocumentDeletion { deletion_ids: [0, 1, 2], includes_by_filter: true }, false))"); + + // we can autobatch one or multiple Settings together + debug_snapshot!(autobatch_from(true, None, [settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [settings(true), settings(true), settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))"); + + debug_snapshot!(autobatch_from(false,None, [settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [settings(true), settings(true), settings(true)]), @"Some((Settings { allow_index_creation: true, settings_ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, 
false))"); + debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))"); + + // We can autobatch document addition with document deletion + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: 
[0, 1] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + // And the other way around + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###); + + // But we can't autobatch document addition with document deletion by filter + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), 
doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); + debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del_fil()]), @r###"Some((DocumentOperation { allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###); + // And the other way around + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, 
false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del_fil(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); +} + +#[test] +fn simple_different_document_operations_autobatch_together() { + // addition and updates with deletion by filter can't batch together + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del_fil()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_create()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), 
idx_create()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_create()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_create()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_update()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_update()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_update()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_update()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); + + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_swap()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_swap()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_swap()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_swap()]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: true }, false))"); +} + +#[test] +fn document_addition_doesnt_batch_with_settings() { + // simple case + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + + // multiple settings and doc addition + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), settings(true), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + + // addition and setting unordered + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, 
true))"); + + // Doesn't batch with other forbidden operations + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_del()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_create()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_update()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), idx_swap()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), idx_swap()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); +} + +#[test] +fn clear_and_additions() { + // these two doesn't need to batch + debug_snapshot!(autobatch_from(true, None, [doc_clr(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentClear { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_clr(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentClear { ids: [0] }, false))"); + + // Basic use case + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); + + // This batch kind doesn't mix with other document addition + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr(), doc_imp(UpdateDocuments, 
true, None)]), @"Some((DocumentClear { ids: [0, 1, 2] }, true))"); + + // But you can batch multiple clear together + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None), doc_clr(), doc_clr(), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2, 3, 4] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_imp(UpdateDocuments, true, None), doc_clr(), doc_clr(), doc_clr()]), @"Some((DocumentClear { ids: [0, 1, 2, 3, 4] }, true))"); +} + +#[test] +fn clear_and_additions_and_settings() { + // A clear don't need to autobatch the settings that happens AFTER there is no documents + debug_snapshot!(autobatch_from(true, None, [doc_clr(), settings(true)]), @"Some((DocumentClear { ids: [0] }, false))"); + + debug_snapshot!(autobatch_from(true, None, [settings(true), doc_clr(), settings(true)]), @"Some((ClearAndSettings { other: [1], allow_index_creation: true, settings_ids: [0, 2] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), settings(true), doc_clr()]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); +} + +#[test] +fn anything_and_index_deletion() { + // The `IndexDeletion` doesn't batch with anything that happens AFTER. + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_del()]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_del_fil()]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), doc_clr()]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), settings(true)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(true, None, [idx_del(), settings(false)]), @"Some((IndexDeletion { ids: [0] }, false))"); + + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_del()]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_del_fil()]), @"Some((IndexDeletion { 
ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), doc_clr()]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), settings(true)]), @"Some((IndexDeletion { ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [idx_del(), settings(false)]), @"Some((IndexDeletion { ids: [0] }, false))"); + + // The index deletion can accept almost any type of `BatchKind` and transform it to an `IndexDeletion`. + // First, the basic cases + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_del_fil(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, true, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(UpdateDocuments, false, None), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_del_fil(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_clr(), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [settings(true), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false,None, [settings(false), idx_del()]), @"Some((IndexDeletion { ids: [0, 1] }, false))"); +} + +#[test] +fn allowed_and_disallowed_index_creation() { + // `DocumentImport` can't be mixed with those disallowed to do so except if the index already exists. 
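+    // In other words, when the index does not exist yet, an import that is not allowed to
+    // create it only batches with other imports that cannot create it either, as the
+    // snapshots below illustrate.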
+ debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, true, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(false,None, [doc_imp(ReplaceDocuments, false, None), settings(true)]), @"Some((DocumentOperation { allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))"); + + // batch deletion and addition + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); + debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0], includes_by_filter: false }, false))"); +} + +#[test] +fn autobatch_primary_key() { + // ==> If I have a pk + // With a single update + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), 
operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + + // With a multiple updates + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other"))]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0, 1] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { 
allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); + + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, Some("id"), [doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("other")), doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + + // ==> If I don't have a pk + // With a single update + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("id"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("other"))]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("other"), operation_ids: [0] }, true))"###); + + // With a multiple updates + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_imp(ReplaceDocuments, true, Some("id"))]), @"Some((DocumentOperation { allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))"); + debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("id")), doc_imp(ReplaceDocuments, true, None)]), @r###"Some((DocumentOperation { allow_index_creation: true, primary_key: Some("id"), operation_ids: [0] }, true))"###); +} diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs new file mode 100644 index 000000000..10f480d12 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -0,0 +1,567 @@ +use 
std::fmt; + +use meilisearch_types::heed::RoTxn; +use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::settings::{Settings, Unchecked}; +use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; +use roaring::RoaringBitmap; +use uuid::Uuid; + +use super::autobatcher::{self, BatchKind}; +use crate::utils::ProcessingBatch; +use crate::{Error, IndexScheduler, Result}; + +/// Represents a combination of tasks that can all be processed at the same time. +/// +/// A batch contains the set of tasks that it represents (accessible through +/// [`self.ids()`](Batch::ids)), as well as additional information on how it should +/// be processed. +#[derive(Debug)] +pub(crate) enum Batch { + TaskCancelation { + /// The task cancelation itself. + task: Task, + }, + TaskDeletions(Vec<Task>), + SnapshotCreation(Vec<Task>), + Dump(Task), + IndexOperation { + op: IndexOperation, + must_create_index: bool, + }, + IndexCreation { + index_uid: String, + primary_key: Option<String>, + task: Task, + }, + IndexUpdate { + index_uid: String, + primary_key: Option<String>, + task: Task, + }, + IndexDeletion { + index_uid: String, + tasks: Vec<Task>, + index_has_been_created: bool, + }, + IndexSwap { + task: Task, + }, + UpgradeDatabase { + tasks: Vec<Task>, + }, +} + +#[derive(Debug)] +pub(crate) enum DocumentOperation { + Replace(Uuid), + Update(Uuid), + Delete(Vec<String>), +} + +/// A [batch](Batch) that combines multiple tasks operating on an index. +#[derive(Debug)] +pub(crate) enum IndexOperation { + DocumentOperation { + index_uid: String, + primary_key: Option<String>, + operations: Vec<DocumentOperation>, + tasks: Vec<Task>, + }, + DocumentEdition { + index_uid: String, + task: Task, + }, + DocumentDeletion { + index_uid: String, + tasks: Vec<Task>, + }, + DocumentClear { + index_uid: String, + tasks: Vec<Task>, + }, + Settings { + index_uid: String, + // The boolean indicates if it's a settings deletion or creation. + settings: Vec<(bool, Settings<Unchecked>)>, + tasks: Vec<Task>, + }, + DocumentClearAndSetting { + index_uid: String, + cleared_tasks: Vec<Task>, + + // The boolean indicates if it's a settings deletion or creation. + settings: Vec<(bool, Settings<Unchecked>)>, + settings_tasks: Vec<Task>, + }, +} + +impl Batch { + /// Return the task ids associated with this batch. + pub fn ids(&self) -> RoaringBitmap { + match self { + Batch::TaskCancelation { task, .. } + | Batch::Dump(task) + | Batch::IndexCreation { task, .. } + | Batch::IndexUpdate { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } + Batch::SnapshotCreation(tasks) + | Batch::TaskDeletions(tasks) + | Batch::UpgradeDatabase { tasks } + | Batch::IndexDeletion { tasks, .. } => { + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } + Batch::IndexOperation { op, .. } => match op { + IndexOperation::DocumentOperation { tasks, .. } + | IndexOperation::Settings { tasks, .. } + | IndexOperation::DocumentDeletion { tasks, .. } + | IndexOperation::DocumentClear { tasks, .. } => { + RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid)) + } + IndexOperation::DocumentEdition { task, .. } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } + IndexOperation::DocumentClearAndSetting { + cleared_tasks: tasks, + settings_tasks: other, + ..
+ } => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)), + }, + Batch::IndexSwap { task } => { + RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() + } + } + } + + /// Return the index UID associated with this batch + pub fn index_uid(&self) -> Option<&str> { + use Batch::*; + match self { + TaskCancelation { .. } + | TaskDeletions(_) + | SnapshotCreation(_) + | Dump(_) + | UpgradeDatabase { .. } + | IndexSwap { .. } => None, + IndexOperation { op, .. } => Some(op.index_uid()), + IndexCreation { index_uid, .. } + | IndexUpdate { index_uid, .. } + | IndexDeletion { index_uid, .. } => Some(index_uid), + } + } +} + +impl fmt::Display for Batch { + /// A text used when we debug the profiling reports. + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let index_uid = self.index_uid(); + let tasks = self.ids(); + match self { + Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?, + Batch::TaskDeletions(_) => f.write_str("TaskDeletion")?, + Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?, + Batch::Dump(_) => f.write_str("Dump")?, + Batch::IndexOperation { op, .. } => write!(f, "{op}")?, + Batch::IndexCreation { .. } => f.write_str("IndexCreation")?, + Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?, + Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?, + Batch::IndexSwap { .. } => f.write_str("IndexSwap")?, + Batch::UpgradeDatabase { .. } => f.write_str("UpgradeDatabase")?, + }; + match index_uid { + Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")), + None => f.write_fmt(format_args!(" from tasks: {tasks:?}")), + } + } +} + +impl IndexOperation { + pub fn index_uid(&self) -> &str { + match self { + IndexOperation::DocumentOperation { index_uid, .. } + | IndexOperation::DocumentEdition { index_uid, .. } + | IndexOperation::DocumentDeletion { index_uid, .. } + | IndexOperation::DocumentClear { index_uid, .. } + | IndexOperation::Settings { index_uid, .. } + | IndexOperation::DocumentClearAndSetting { index_uid, .. } => index_uid, + } + } +} + +impl fmt::Display for IndexOperation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + IndexOperation::DocumentOperation { .. } => { + f.write_str("IndexOperation::DocumentOperation") + } + IndexOperation::DocumentEdition { .. } => { + f.write_str("IndexOperation::DocumentEdition") + } + IndexOperation::DocumentDeletion { .. } => { + f.write_str("IndexOperation::DocumentDeletion") + } + IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"), + IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"), + IndexOperation::DocumentClearAndSetting { .. } => { + f.write_str("IndexOperation::DocumentClearAndSetting") + } + } + } +} + +impl IndexScheduler { + /// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`]. 
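+    /// The tasks referenced by the batch kind are fetched from the queue and registered as
+    /// processing in `current_batch` along the way.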
+ /// + /// ## Arguments + /// - `rtxn`: read transaction + /// - `index_uid`: name of the index affected by the operations of the autobatch + /// - `batch`: the result of the autobatcher + pub(crate) fn create_next_batch_index( + &self, + rtxn: &RoTxn, + index_uid: String, + batch: BatchKind, + current_batch: &mut ProcessingBatch, + must_create_index: bool, + ) -> Result> { + match batch { + BatchKind::DocumentClear { ids } => Ok(Some(Batch::IndexOperation { + op: IndexOperation::DocumentClear { + tasks: self.queue.get_existing_tasks_for_processing_batch( + rtxn, + current_batch, + ids, + )?, + index_uid, + }, + must_create_index, + })), + BatchKind::DocumentEdition { id } => { + let mut task = + self.queue.tasks.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + match &task.kind { + KindWithContent::DocumentEdition { index_uid, .. } => { + Ok(Some(Batch::IndexOperation { + op: IndexOperation::DocumentEdition { + index_uid: index_uid.clone(), + task, + }, + must_create_index: false, + })) + } + _ => unreachable!(), + } + } + BatchKind::DocumentOperation { operation_ids, .. } => { + let tasks = self.queue.get_existing_tasks_for_processing_batch( + rtxn, + current_batch, + operation_ids, + )?; + let primary_key = tasks + .iter() + .find_map(|task| match task.kind { + KindWithContent::DocumentAdditionOrUpdate { ref primary_key, .. } => { + // we want to stop on the first document addition + Some(primary_key.clone()) + } + KindWithContent::DocumentDeletion { .. } => None, + _ => unreachable!(), + }) + .flatten(); + + let mut operations = Vec::new(); + + for task in tasks.iter() { + match task.kind { + KindWithContent::DocumentAdditionOrUpdate { + content_file, method, .. + } => match method { + IndexDocumentsMethod::ReplaceDocuments => { + operations.push(DocumentOperation::Replace(content_file)) + } + IndexDocumentsMethod::UpdateDocuments => { + operations.push(DocumentOperation::Update(content_file)) + } + _ => unreachable!("Unknown document merging method"), + }, + KindWithContent::DocumentDeletion { ref documents_ids, .. } => { + operations.push(DocumentOperation::Delete(documents_ids.clone())); + } + _ => unreachable!(), + } + } + + Ok(Some(Batch::IndexOperation { + op: IndexOperation::DocumentOperation { + index_uid, + primary_key, + operations, + tasks, + }, + must_create_index, + })) + } + BatchKind::DocumentDeletion { deletion_ids, includes_by_filter: _ } => { + let tasks = self.queue.get_existing_tasks_for_processing_batch( + rtxn, + current_batch, + deletion_ids, + )?; + + Ok(Some(Batch::IndexOperation { + op: IndexOperation::DocumentDeletion { index_uid, tasks }, + must_create_index, + })) + } + BatchKind::Settings { settings_ids, .. } => { + let tasks = self.queue.get_existing_tasks_for_processing_batch( + rtxn, + current_batch, + settings_ids, + )?; + + let mut settings = Vec::new(); + for task in &tasks { + match task.kind { + KindWithContent::SettingsUpdate { + ref new_settings, is_deletion, .. + } => settings.push((is_deletion, *new_settings.clone())), + _ => unreachable!(), + } + } + + Ok(Some(Batch::IndexOperation { + op: IndexOperation::Settings { index_uid, settings, tasks }, + must_create_index, + })) + } + BatchKind::ClearAndSettings { other, settings_ids, allow_index_creation } => { + let (index_uid, settings, settings_tasks) = match self + .create_next_batch_index( + rtxn, + index_uid, + BatchKind::Settings { settings_ids, allow_index_creation }, + current_batch, + must_create_index, + )? 
+ .unwrap() + { + Batch::IndexOperation { + op: IndexOperation::Settings { index_uid, settings, tasks, .. }, + .. + } => (index_uid, settings, tasks), + _ => unreachable!(), + }; + let (index_uid, cleared_tasks) = match self + .create_next_batch_index( + rtxn, + index_uid, + BatchKind::DocumentClear { ids: other }, + current_batch, + must_create_index, + )? + .unwrap() + { + Batch::IndexOperation { + op: IndexOperation::DocumentClear { index_uid, tasks }, + .. + } => (index_uid, tasks), + _ => unreachable!(), + }; + + Ok(Some(Batch::IndexOperation { + op: IndexOperation::DocumentClearAndSetting { + index_uid, + cleared_tasks, + settings, + settings_tasks, + }, + must_create_index, + })) + } + BatchKind::IndexCreation { id } => { + let mut task = + self.queue.tasks.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + let (index_uid, primary_key) = match &task.kind { + KindWithContent::IndexCreation { index_uid, primary_key } => { + (index_uid.clone(), primary_key.clone()) + } + _ => unreachable!(), + }; + Ok(Some(Batch::IndexCreation { index_uid, primary_key, task })) + } + BatchKind::IndexUpdate { id } => { + let mut task = + self.queue.tasks.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + let primary_key = match &task.kind { + KindWithContent::IndexUpdate { primary_key, .. } => primary_key.clone(), + _ => unreachable!(), + }; + Ok(Some(Batch::IndexUpdate { index_uid, primary_key, task })) + } + BatchKind::IndexDeletion { ids } => Ok(Some(Batch::IndexDeletion { + index_uid, + index_has_been_created: must_create_index, + tasks: self.queue.get_existing_tasks_for_processing_batch( + rtxn, + current_batch, + ids, + )?, + })), + BatchKind::IndexSwap { id } => { + let mut task = + self.queue.tasks.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + Ok(Some(Batch::IndexSwap { task })) + } + } + } + + /// Create the next batch to be processed; + /// 1. We get the *last* task to cancel. + /// 2. We get the *next* task to delete. + /// 3. We get the *next* snapshot to process. + /// 4. We get the *next* dump to process. + /// 5. We get the *next* tasks to process for a specific index. + #[tracing::instrument(level = "trace", skip(self, rtxn), target = "indexing::scheduler")] + pub(crate) fn create_next_batch( + &self, + rtxn: &RoTxn, + ) -> Result> { + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::InsideCreateBatch)?; + + let batch_id = self.queue.batches.next_batch_id(rtxn)?; + let mut current_batch = ProcessingBatch::new(batch_id); + + let enqueued = &self.queue.tasks.get_status(rtxn, Status::Enqueued)?; + let failed = &self.queue.tasks.get_status(rtxn, Status::Failed)?; + + // 0. The priority over everything is to upgrade the instance + // There shouldn't be multiple upgrade tasks but just in case we're going to batch all of them at the same time + let upgrade = self.queue.tasks.get_kind(rtxn, Kind::UpgradeDatabase)? & (enqueued | failed); + if !upgrade.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, upgrade)?; + // In the case of an upgrade database batch, we want to find back the original batch that tried processing it + // and re-use its id + if let Some(batch_uid) = tasks.last().unwrap().batch_uid { + current_batch.uid = batch_uid; + } + current_batch.processing(&mut tasks); + return Ok(Some((Batch::UpgradeDatabase { tasks }, current_batch))); + } + + // 1. we get the last task to cancel. 
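+        // Note: unlike every later step, which picks the *oldest* enqueued task
+        // (`.min()` or the whole set), cancelation picks the *most recently*
+        // enqueued cancelation task (`.max()`).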
+ let to_cancel = self.queue.tasks.get_kind(rtxn, Kind::TaskCancelation)? & enqueued; + if let Some(task_id) = to_cancel.max() { + let mut task = + self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + return Ok(Some((Batch::TaskCancelation { task }, current_batch))); + } + + // 2. we get the next task to delete + let to_delete = self.queue.tasks.get_kind(rtxn, Kind::TaskDeletion)? & enqueued; + if !to_delete.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_delete)?; + current_batch.processing(&mut tasks); + return Ok(Some((Batch::TaskDeletions(tasks), current_batch))); + } + + // 3. we batch the snapshot. + let to_snapshot = self.queue.tasks.get_kind(rtxn, Kind::SnapshotCreation)? & enqueued; + if !to_snapshot.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_snapshot)?; + current_batch.processing(&mut tasks); + return Ok(Some((Batch::SnapshotCreation(tasks), current_batch))); + } + + // 4. we batch the dumps. + let to_dump = self.queue.tasks.get_kind(rtxn, Kind::DumpCreation)? & enqueued; + if let Some(to_dump) = to_dump.min() { + let mut task = + self.queue.tasks.get_task(rtxn, to_dump)?.ok_or(Error::CorruptedTaskQueue)?; + current_batch.processing(Some(&mut task)); + return Ok(Some((Batch::Dump(task), current_batch))); + } + + // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. + let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; + let mut task = + self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + + // If the task is not associated with any index, verify that it is an index swap and + // create the batch directly. Otherwise, get the index name associated with the task + // and use the autobatcher to batch the enqueued tasks associated with it + + let index_name = if let Some(&index_name) = task.indexes().first() { + index_name + } else { + assert!(matches!(&task.kind, KindWithContent::IndexSwap { swaps } if swaps.is_empty())); + current_batch.processing(Some(&mut task)); + return Ok(Some((Batch::IndexSwap { task }, current_batch))); + }; + + let index_already_exists = self.index_mapper.exists(rtxn, index_name)?; + let mut primary_key = None; + if index_already_exists { + let index = self.index_mapper.index(rtxn, index_name)?; + let rtxn = index.read_txn()?; + primary_key = index.primary_key(&rtxn)?.map(|pk| pk.to_string()); + } + + let index_tasks = self.queue.tasks.index_tasks(rtxn, index_name)? & enqueued; + + // If autobatching is disabled we only take one task at a time. + // Otherwise, we take only a maximum of tasks to create batches. 
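+        // Two limits bound the batch built below: a task-count limit
+        // (`max_number_of_batched_tasks`, forced to 1 when autobatching is disabled)
+        // and a payload-size limit (`batched_tasks_size_limit`, summed over the tasks'
+        // content files). The size check only triggers once at least one task has been
+        // accepted, so a single oversized payload can still be processed on its own.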
+ let tasks_limit = if self.scheduler.autobatching_enabled { + self.scheduler.max_number_of_batched_tasks + } else { + 1 + }; + + let mut enqueued = Vec::new(); + let mut total_size: u64 = 0; + for task_id in index_tasks.into_iter().take(tasks_limit) { + let task = self + .queue + .tasks + .get_task(rtxn, task_id) + .and_then(|task| task.ok_or(Error::CorruptedTaskQueue))?; + + if let Some(uuid) = task.content_uuid() { + let content_size = self.queue.file_store.compute_size(uuid)?; + total_size = total_size.saturating_add(content_size); + } + + if total_size > self.scheduler.batched_tasks_size_limit && !enqueued.is_empty() { + break; + } + + enqueued.push((task.uid, task.kind)); + } + + if let Some((batchkind, create_index)) = + autobatcher::autobatch(enqueued, index_already_exists, primary_key.as_deref()) + { + return Ok(self + .create_next_batch_index( + rtxn, + index_name.to_string(), + batchkind, + &mut current_batch, + create_index, + )? + .map(|batch| (batch, current_batch))); + } + + // If we found no tasks then we were notified for something that got autobatched + // somehow and there is nothing to do. + Ok(None) + } +} diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs new file mode 100644 index 000000000..1cbfece34 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -0,0 +1,421 @@ +mod autobatcher; +#[cfg(test)] +mod autobatcher_test; +mod create_batch; +mod process_batch; +mod process_dump_creation; +mod process_index_operation; +mod process_snapshot_creation; +mod process_upgrade; +#[cfg(test)] +mod test; +#[cfg(test)] +mod test_document_addition; +#[cfg(test)] +mod test_embedders; +#[cfg(test)] +mod test_failure; + +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use std::sync::Arc; + +use meilisearch_types::error::ResponseError; +use meilisearch_types::heed::{Env, WithoutTls}; +use meilisearch_types::milli; +use meilisearch_types::tasks::Status; +use rayon::current_num_threads; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use roaring::RoaringBitmap; +use synchronoise::SignalEvent; + +use crate::processing::{AtomicTaskStep, BatchProgress}; +use crate::{Error, IndexScheduler, IndexSchedulerOptions, Result, TickOutcome}; + +#[derive(Default, Clone, Debug)] +pub struct MustStopProcessing(Arc); + +impl MustStopProcessing { + pub fn get(&self) -> bool { + self.0.load(Ordering::Relaxed) + } + + pub fn must_stop(&self) { + self.0.store(true, Ordering::Relaxed); + } + + pub fn reset(&self) { + self.0.store(false, Ordering::Relaxed); + } +} + +pub struct Scheduler { + /// A boolean that can be set to true to stop the currently processing tasks. + pub must_stop_processing: MustStopProcessing, + + /// Get a signal when a batch needs to be processed. + pub(crate) wake_up: Arc, + + /// Whether auto-batching is enabled or not. + pub(crate) autobatching_enabled: bool, + + /// The maximum number of tasks that will be batched together. + pub(crate) max_number_of_batched_tasks: usize, + + /// The maximum size, in bytes, of tasks in a batch. + pub(crate) batched_tasks_size_limit: u64, + + /// The path used to create the dumps. + pub(crate) dumps_path: PathBuf, + + /// The path used to create the snapshots. + pub(crate) snapshots_path: PathBuf, + + /// The path to the folder containing the auth LMDB env. + pub(crate) auth_env: Env, + + /// The path to the version file of Meilisearch. 
+ pub(crate) version_file_path: PathBuf, + + /// The maximal number of entries in the search query cache of an embedder. + /// + /// 0 disables the cache. + pub(crate) embedding_cache_cap: usize, +} + +impl Scheduler { + pub(crate) fn private_clone(&self) -> Scheduler { + Scheduler { + must_stop_processing: self.must_stop_processing.clone(), + wake_up: self.wake_up.clone(), + autobatching_enabled: self.autobatching_enabled, + max_number_of_batched_tasks: self.max_number_of_batched_tasks, + batched_tasks_size_limit: self.batched_tasks_size_limit, + dumps_path: self.dumps_path.clone(), + snapshots_path: self.snapshots_path.clone(), + auth_env: self.auth_env.clone(), + version_file_path: self.version_file_path.clone(), + embedding_cache_cap: self.embedding_cache_cap, + } + } + + pub fn new(options: &IndexSchedulerOptions, auth_env: Env) -> Scheduler { + Scheduler { + must_stop_processing: MustStopProcessing::default(), + // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things + wake_up: Arc::new(SignalEvent::auto(true)), + autobatching_enabled: options.autobatching_enabled, + max_number_of_batched_tasks: options.max_number_of_batched_tasks, + batched_tasks_size_limit: options.batched_tasks_size_limit, + dumps_path: options.dumps_path.clone(), + snapshots_path: options.snapshots_path.clone(), + auth_env, + version_file_path: options.version_file_path.clone(), + embedding_cache_cap: options.embedding_cache_cap, + } + } +} + +impl IndexScheduler { + /// Perform one iteration of the run loop. + /// + /// 1. See if we need to cleanup the task queue + /// 2. Find the next batch of tasks to be processed. + /// 3. Update the information of these tasks following the start of their processing. + /// 4. Update the in-memory list of processed tasks accordingly. + /// 5. Process the batch: + /// - perform the actions of each batched task + /// - update the information of each batched task following the end + /// of their processing. + /// 6. Reset the in-memory list of processed tasks. + /// + /// Returns the number of processed tasks. + pub(crate) fn tick(&self) -> Result { + #[cfg(test)] + { + *self.run_loop_iteration.write().unwrap() += 1; + self.breakpoint(crate::test_utils::Breakpoint::Start); + } + + if self.cleanup_enabled { + let mut wtxn = self.env.write_txn()?; + self.queue.cleanup_task_queue(&mut wtxn)?; + wtxn.commit()?; + } + + let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?; + let (batch, mut processing_batch) = + match self.create_next_batch(&rtxn).map_err(|e| Error::CreateBatch(Box::new(e)))? { + Some(batch) => batch, + None => return Ok(TickOutcome::WaitForSignal), + }; + let index_uid = batch.index_uid().map(ToOwned::to_owned); + drop(rtxn); + + // 1. store the starting date with the bitmap of processing tasks. + let mut ids = batch.ids(); + let processed_tasks = ids.len(); + + // We reset the must_stop flag to be sure that we don't stop processing tasks + self.scheduler.must_stop_processing.reset(); + let progress = self + .processing_tasks + .write() + .unwrap() + // We can clone the processing batch here because we don't want its modification to affect the view of the processing batches + .start_processing(processing_batch.clone(), ids.clone()); + + #[cfg(test)] + self.breakpoint(crate::test_utils::Breakpoint::BatchCreated); + + // 2. 
Process the tasks + let res = { + let cloned_index_scheduler = self.private_clone(); + let processing_batch = &mut processing_batch; + let progress = progress.clone(); + std::thread::scope(|s| { + let p = progress.clone(); + let handle = std::thread::Builder::new() + .name(String::from("batch-operation")) + .spawn_scoped(s, move || { + cloned_index_scheduler.process_batch(batch, processing_batch, p) + }) + .unwrap(); + + match handle.join() { + Ok(ret) => { + if ret.is_err() { + if let Ok(progress_view) = + serde_json::to_string(&progress.as_progress_view()) + { + tracing::warn!("Batch failed while doing: {progress_view}") + } + } + ret + } + Err(panic) => { + if let Ok(progress_view) = + serde_json::to_string(&progress.as_progress_view()) + { + tracing::warn!("Batch failed while doing: {progress_view}") + } + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + Err(Error::ProcessBatchPanicked(msg.to_string())) + } + } + }) + }; + + // Reset the currently updating index to relinquish the index handle + self.index_mapper.set_currently_updating_index(None); + + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::AcquiringWtxn)?; + + progress.update_progress(BatchProgress::WritingTasksToDisk); + processing_batch.finished(); + let mut stop_scheduler_forever = false; + let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; + let mut canceled = RoaringBitmap::new(); + let mut congestion = None; + + match res { + Ok((tasks, cong)) => { + #[cfg(test)] + self.breakpoint(crate::test_utils::Breakpoint::ProcessBatchSucceeded); + + let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32); + progress.update_progress(task_progress_obj); + congestion = cong; + let mut success = 0; + let mut failure = 0; + let mut canceled_by = None; + + #[allow(unused_variables)] + for (i, mut task) in tasks.into_iter().enumerate() { + task_progress.fetch_add(1, Ordering::Relaxed); + processing_batch.update(&mut task); + if task.status == Status::Canceled { + canceled.insert(task.uid); + canceled_by = task.canceled_by; + } + + #[cfg(test)] + self.maybe_fail( + crate::test_utils::FailureLocation::UpdatingTaskAfterProcessBatchSuccess { + task_uid: i as u32, + }, + )?; + + match task.error { + Some(_) => failure += 1, + None => success += 1, + } + + self.queue + .tasks + .update_task(&mut wtxn, &task) + .map_err(|e| Error::UnrecoverableError(Box::new(e)))?; + } + if let Some(canceled_by) = canceled_by { + self.queue.tasks.canceled_by.put(&mut wtxn, &canceled_by, &canceled)?; + } + tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks."); + } + // If we have an abortion error we must stop the tick here and re-schedule tasks. + Err(Error::Milli { + error: milli::Error::InternalError(milli::InternalError::AbortedIndexation), + .. + }) + | Err(Error::AbortedTask) => { + #[cfg(test)] + self.breakpoint(crate::test_utils::Breakpoint::AbortedIndexation); + wtxn.abort(); + + tracing::info!("A batch of tasks was aborted."); + // We make sure that we don't call `stop_processing` on the `processing_tasks`, + // this is because we want to let the next tick call `create_next_batch` and keep + // the `started_at` date times and `processings` of the current processing tasks. + // This date time is used by the task cancelation to store the right `started_at` + // date in the task on disk. 
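+                // `TickAgain(0)` reports zero processed tasks while asking the run loop
+                // to tick again right away, so the aborted tasks get re-batched on the
+                // next pass instead of waiting for a new wake-up signal.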
+ return Ok(TickOutcome::TickAgain(0)); + } + // If an index said it was full, we need to: + // 1. identify which index is full + // 2. close the associated environment + // 3. resize it + // 4. re-schedule tasks + Err(Error::Milli { + error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached), + .. + }) if index_uid.is_some() => { + // fixme: add index_uid to match to avoid the unwrap + let index_uid = index_uid.unwrap(); + // fixme: handle error more gracefully? not sure when this could happen + self.index_mapper.resize_index(&wtxn, &index_uid)?; + wtxn.abort(); + + tracing::info!("The max database size was reached. Resizing the index."); + + return Ok(TickOutcome::TickAgain(0)); + } + // In case of a failure we must get back and patch all the tasks with the error. + Err(err) => { + #[cfg(test)] + self.breakpoint(crate::test_utils::Breakpoint::ProcessBatchFailed); + let (task_progress, task_progress_obj) = AtomicTaskStep::new(ids.len() as u32); + progress.update_progress(task_progress_obj); + + if matches!(err, Error::DatabaseUpgrade(_)) { + tracing::error!( + "Upgrade task failed, tasks won't be processed until the following issue is fixed: {err}" + ); + stop_scheduler_forever = true; + } + let error: ResponseError = err.into(); + for id in ids.iter() { + task_progress.fetch_add(1, Ordering::Relaxed); + let mut task = self + .queue + .tasks + .get_task(&wtxn, id) + .map_err(|e| Error::UnrecoverableError(Box::new(e)))? + .ok_or(Error::CorruptedTaskQueue)?; + task.status = Status::Failed; + task.error = Some(error.clone()); + task.details = task.details.map(|d| d.to_failed()); + processing_batch.update(&mut task); + + #[cfg(test)] + self.maybe_fail( + crate::test_utils::FailureLocation::UpdatingTaskAfterProcessBatchFailure, + )?; + + tracing::error!("Batch failed {}", error); + + self.queue + .tasks + .update_task(&mut wtxn, &task) + .map_err(|e| Error::UnrecoverableError(Box::new(e)))?; + } + } + } + + // We must re-add the canceled task so they're part of the same batch. + ids |= canceled; + + processing_batch.stats.progress_trace = + progress.accumulated_durations().into_iter().map(|(k, v)| (k, v.into())).collect(); + processing_batch.stats.write_channel_congestion = congestion.map(|congestion| { + let mut congestion_info = serde_json::Map::new(); + congestion_info.insert("attempts".into(), congestion.attempts.into()); + congestion_info.insert("blocking_attempts".into(), congestion.blocking_attempts.into()); + congestion_info.insert("blocking_ratio".into(), congestion.congestion_ratio().into()); + congestion_info + }); + + if let Some(congestion) = congestion { + tracing::debug!( + "Channel congestion metrics - Attempts: {}, Blocked attempts: {} ({:.1}% congestion)", + congestion.attempts, + congestion.blocking_attempts, + congestion.congestion_ratio(), + ); + } + + tracing::debug!("call trace: {:?}", progress.accumulated_durations()); + + self.queue.write_batch(&mut wtxn, processing_batch, &ids)?; + + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::CommittingWtxn)?; + + wtxn.commit().map_err(Error::HeedTransaction)?; + + // We should stop processing AFTER everything is processed and written to disk otherwise, a batch (which only lives in RAM) may appear in the processing task + // and then become Ā« not found Ā» for some time until the commit everything is written and the final commit is made. 
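+        // Dropping the batch from the in-memory processing view only now keeps the tasks
+        // reported as `processing` until their final state is durably on disk.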
+ self.processing_tasks.write().unwrap().stop_processing(); + + // Once the tasks are committed, we should delete all the update files associated ASAP to avoid leaking files in case of a restart + tracing::debug!("Deleting the update files"); + + //We take one read transaction **per thread**. Then, every thread is going to pull out new IDs from the roaring bitmap with the help of an atomic shared index into the bitmap + let idx = AtomicU32::new(0); + (0..current_num_threads()).into_par_iter().try_for_each(|_| -> Result<()> { + let rtxn = self.read_txn()?; + while let Some(id) = ids.select(idx.fetch_add(1, Ordering::Relaxed)) { + let task = self + .queue + .tasks + .get_task(&rtxn, id) + .map_err(|e| Error::UnrecoverableError(Box::new(e)))? + .ok_or(Error::CorruptedTaskQueue)?; + if let Err(e) = self.queue.delete_persisted_task_data(&task) { + tracing::error!( + "Failure to delete the content files associated with task {}. Error: {e}", + task.uid + ); + } + } + Ok(()) + })?; + + // We shouldn't crash the tick function if we can't send data to the webhook. + let _ = self.notify_webhook(&ids); + + #[cfg(test)] + self.breakpoint(crate::test_utils::Breakpoint::AfterProcessing); + + if stop_scheduler_forever { + Ok(TickOutcome::StopProcessingForever) + } else { + Ok(TickOutcome::TickAgain(processed_tasks)) + } + } +} diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs new file mode 100644 index 000000000..8f3987bf6 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -0,0 +1,661 @@ +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::panic::{catch_unwind, AssertUnwindSafe}; +use std::sync::atomic::Ordering; + +use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; +use meilisearch_types::heed::{RoTxn, RwTxn}; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::{self, ChannelCongestion}; +use meilisearch_types::tasks::{Details, IndexSwap, KindWithContent, Status, Task}; +use milli::update::Settings as MilliSettings; +use roaring::RoaringBitmap; + +use super::create_batch::Batch; +use crate::processing::{ + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, + InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, +}; +use crate::utils::{ + self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, + ProcessingBatch, +}; +use crate::{Error, IndexScheduler, Result, TaskId}; + +impl IndexScheduler { + /// Apply the operation associated with the given batch. + /// + /// ## Return + /// The list of tasks that were processed. The metadata of each task in the returned + /// list is updated accordingly, with the exception of the its date fields + /// [`finished_at`](meilisearch_types::tasks::Task::finished_at) and [`started_at`](meilisearch_types::tasks::Task::started_at). 
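+    ///
+    /// Panics are not propagated: the caller (`tick`) runs this method on a dedicated
+    /// thread and converts a panic into `Error::ProcessBatchPanicked`.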
+ #[tracing::instrument(level = "trace", skip(self, batch, progress), target = "indexing::scheduler", fields(batch=batch.to_string()))] + pub(crate) fn process_batch( + &self, + batch: Batch, + current_batch: &mut ProcessingBatch, + progress: Progress, + ) -> Result<(Vec, Option)> { + #[cfg(test)] + { + self.maybe_fail(crate::test_utils::FailureLocation::InsideProcessBatch)?; + self.maybe_fail(crate::test_utils::FailureLocation::PanicInsideProcessBatch)?; + self.breakpoint(crate::test_utils::Breakpoint::InsideProcessBatch); + } + + match batch { + Batch::TaskCancelation { mut task } => { + // 1. Retrieve the tasks that matched the query at enqueue-time. + let matched_tasks = + if let KindWithContent::TaskCancelation { tasks, query: _ } = &task.kind { + tasks + } else { + unreachable!() + }; + + let rtxn = self.env.read_txn()?; + let mut canceled_tasks = self.cancel_matched_tasks( + &rtxn, + task.uid, + current_batch, + matched_tasks, + &progress, + )?; + + task.status = Status::Succeeded; + match &mut task.details { + Some(Details::TaskCancelation { + matched_tasks: _, + canceled_tasks: canceled_tasks_details, + original_filter: _, + }) => { + *canceled_tasks_details = Some(canceled_tasks.len() as u64); + } + _ => unreachable!(), + } + + canceled_tasks.push(task); + + Ok((canceled_tasks, None)) + } + Batch::TaskDeletions(mut tasks) => { + // 1. Retrieve the tasks that matched the query at enqueue-time. + let mut matched_tasks = RoaringBitmap::new(); + + for task in tasks.iter() { + if let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind { + matched_tasks |= tasks; + } else { + unreachable!() + } + } + + let mut wtxn = self.env.write_txn()?; + let mut deleted_tasks = + self.delete_matched_tasks(&mut wtxn, &matched_tasks, &progress)?; + wtxn.commit()?; + + for task in tasks.iter_mut() { + task.status = Status::Succeeded; + let KindWithContent::TaskDeletion { tasks, query: _ } = &task.kind else { + unreachable!() + }; + + let deleted_tasks_count = deleted_tasks.intersection_len(tasks); + deleted_tasks -= tasks; + + match &mut task.details { + Some(Details::TaskDeletion { + matched_tasks: _, + deleted_tasks, + original_filter: _, + }) => { + *deleted_tasks = Some(deleted_tasks_count); + } + _ => unreachable!(), + } + } + Ok((tasks, None)) + } + Batch::SnapshotCreation(tasks) => { + self.process_snapshot(progress, tasks).map(|tasks| (tasks, None)) + } + Batch::Dump(task) => { + self.process_dump_creation(progress, task).map(|tasks| (tasks, None)) + } + Batch::IndexOperation { op, must_create_index } => { + let index_uid = op.index_uid().to_string(); + let index = if must_create_index { + // create the index if it doesn't already exist + let wtxn = self.env.write_txn()?; + self.index_mapper.create_index(wtxn, &index_uid, None)? + } else { + let rtxn = self.env.read_txn()?; + self.index_mapper.index(&rtxn, &index_uid)? + }; + + // the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick + self.index_mapper + .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); + + let mut index_wtxn = index.write_txn()?; + let (tasks, congestion) = + self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; + + { + let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); + let _entered = span.enter(); + + index_wtxn.commit()?; + } + + // if the update processed successfully, we're going to store the new + // stats of the index. 
Since the tasks have already been processed and + // this is a non-critical operation. If it fails, we should not fail + // the entire batch. + let res = || -> Result<()> { + let index_rtxn = index.read_txn()?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; + let mut wtxn = self.env.write_txn()?; + self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; + wtxn.commit()?; + Ok(()) + }(); + + match res { + Ok(_) => (), + Err(e) => tracing::error!( + error = &e as &dyn std::error::Error, + "Could not write the stats of the index" + ), + } + + Ok((tasks, congestion)) + } + Batch::IndexCreation { index_uid, primary_key, task } => { + progress.update_progress(CreateIndexProgress::CreatingTheIndex); + + let wtxn = self.env.write_txn()?; + if self.index_mapper.exists(&wtxn, &index_uid)? { + return Err(Error::IndexAlreadyExists(index_uid)); + } + self.index_mapper.create_index(wtxn, &index_uid, None)?; + + self.process_batch( + Batch::IndexUpdate { index_uid, primary_key, task }, + current_batch, + progress, + ) + } + Batch::IndexUpdate { index_uid, primary_key, mut task } => { + progress.update_progress(UpdateIndexProgress::UpdatingTheIndex); + let rtxn = self.env.read_txn()?; + let index = self.index_mapper.index(&rtxn, &index_uid)?; + + if let Some(primary_key) = primary_key.clone() { + let mut index_wtxn = index.write_txn()?; + let mut builder = MilliSettings::new( + &mut index_wtxn, + &index, + self.index_mapper.indexer_config(), + ); + builder.set_primary_key(primary_key); + let must_stop_processing = self.scheduler.must_stop_processing.clone(); + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; + index_wtxn.commit()?; + } + + // drop rtxn before starting a new wtxn on the same db + rtxn.commit()?; + + task.status = Status::Succeeded; + task.details = Some(Details::IndexInfo { primary_key }); + + // if the update processed successfully, we're going to store the new + // stats of the index. Since the tasks have already been processed and + // this is a non-critical operation. If it fails, we should not fail + // the entire batch. + let res = || -> Result<()> { + let mut wtxn = self.env.write_txn()?; + let index_rtxn = index.read_txn()?; + let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; + wtxn.commit()?; + Ok(()) + }(); + + match res { + Ok(_) => (), + Err(e) => tracing::error!( + error = &e as &dyn std::error::Error, + "Could not write the stats of the index" + ), + } + + Ok((vec![task], None)) + } + Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { + progress.update_progress(DeleteIndexProgress::DeletingTheIndex); + let wtxn = self.env.write_txn()?; + + // it's possible that the index doesn't exist + let number_of_documents = || -> Result { + let index = self.index_mapper.index(&wtxn, &index_uid)?; + let index_rtxn = index.read_txn()?; + index + .number_of_documents(&index_rtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.to_string()))) + }() + .unwrap_or_default(); + + // The write transaction is directly owned and committed inside. 
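+                // `IndexNotFound` is only swallowed when the index was supposed to be
+                // created by this very batch: in that case it may simply never have been
+                // created, so there is nothing left to delete.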
+ match self.index_mapper.delete_index(wtxn, &index_uid) { + Ok(()) => (), + Err(Error::IndexNotFound(_)) if index_has_been_created => (), + Err(e) => return Err(e), + } + + // We set all the tasks details to the default value. + for task in &mut tasks { + task.status = Status::Succeeded; + task.details = match &task.kind { + KindWithContent::IndexDeletion { .. } => { + Some(Details::ClearAll { deleted_documents: Some(number_of_documents) }) + } + otherwise => otherwise.default_finished_details(), + }; + } + + Ok((tasks, None)) + } + Batch::IndexSwap { mut task } => { + progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap); + + let mut wtxn = self.env.write_txn()?; + let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind { + swaps + } else { + unreachable!() + }; + let mut not_found_indexes = BTreeSet::new(); + for IndexSwap { indexes: (lhs, rhs) } in swaps { + for index in [lhs, rhs] { + let index_exists = self.index_mapper.index_exists(&wtxn, index)?; + if !index_exists { + not_found_indexes.insert(index); + } + } + } + if !not_found_indexes.is_empty() { + if not_found_indexes.len() == 1 { + return Err(Error::SwapIndexNotFound( + not_found_indexes.into_iter().next().unwrap().clone(), + )); + } else { + return Err(Error::SwapIndexesNotFound( + not_found_indexes.into_iter().cloned().collect(), + )); + } + } + progress.update_progress(SwappingTheIndexes::SwappingTheIndexes); + for (step, swap) in swaps.iter().enumerate() { + progress.update_progress(VariableNameStep::::new( + format!("swapping index {} and {}", swap.indexes.0, swap.indexes.1), + step as u32, + swaps.len() as u32, + )); + self.apply_index_swap( + &mut wtxn, + &progress, + task.uid, + &swap.indexes.0, + &swap.indexes.1, + )?; + } + wtxn.commit()?; + task.status = Status::Succeeded; + Ok((vec![task], None)) + } + Batch::UpgradeDatabase { mut tasks } => { + let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { + unreachable!(); + }; + let ret = catch_unwind(AssertUnwindSafe(|| self.process_upgrade(from, progress))); + match ret { + Ok(Ok(())) => (), + Ok(Err(e)) => return Err(Error::DatabaseUpgrade(Box::new(e))), + Err(e) => { + let msg = match e.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match e.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + return Err(Error::DatabaseUpgrade(Box::new(Error::ProcessBatchPanicked( + msg.to_string(), + )))); + } + } + + for task in tasks.iter_mut() { + task.status = Status::Succeeded; + // Since this task can be retried we must reset its error status + task.error = None; + } + + Ok((tasks, None)) + } + } + } + + /// Swap the index `lhs` with the index `rhs`. + fn apply_index_swap( + &self, + wtxn: &mut RwTxn, + progress: &Progress, + task_id: u32, + lhs: &str, + rhs: &str, + ) -> Result<()> { + progress.update_progress(InnerSwappingTwoIndexes::RetrieveTheTasks); + // 1. Verify that both lhs and rhs are existing indexes + let index_lhs_exists = self.index_mapper.index_exists(wtxn, lhs)?; + if !index_lhs_exists { + return Err(Error::IndexNotFound(lhs.to_owned())); + } + let index_rhs_exists = self.index_mapper.index_exists(wtxn, rhs)?; + if !index_rhs_exists { + return Err(Error::IndexNotFound(rhs.to_owned())); + } + + // 2. 
Get the task set for index = name that appeared before the index swap task + let mut index_lhs_task_ids = self.queue.tasks.index_tasks(wtxn, lhs)?; + index_lhs_task_ids.remove_range(task_id..); + let mut index_rhs_task_ids = self.queue.tasks.index_tasks(wtxn, rhs)?; + index_rhs_task_ids.remove_range(task_id..); + + // 3. before_name -> new_name in the task's KindWithContent + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheTasks); + let tasks_to_update = &index_lhs_task_ids | &index_rhs_task_ids; + let (atomic, task_progress) = AtomicTaskStep::new(tasks_to_update.len() as u32); + progress.update_progress(task_progress); + + for task_id in tasks_to_update { + let mut task = + self.queue.tasks.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + swap_index_uid_in_task(&mut task, (lhs, rhs)); + self.queue.tasks.all_tasks.put(wtxn, &task_id, &task)?; + atomic.fetch_add(1, Ordering::Relaxed); + } + + // 4. remove the task from indexuid = before_name + // 5. add the task to indexuid = after_name + progress.update_progress(InnerSwappingTwoIndexes::UpdateTheIndexesMetadata); + self.queue.tasks.update_index(wtxn, lhs, |lhs_tasks| { + *lhs_tasks -= &index_lhs_task_ids; + *lhs_tasks |= &index_rhs_task_ids; + })?; + self.queue.tasks.update_index(wtxn, rhs, |rhs_tasks| { + *rhs_tasks -= &index_rhs_task_ids; + *rhs_tasks |= &index_lhs_task_ids; + })?; + + // 6. Swap in the index mapper + self.index_mapper.swap(wtxn, lhs, rhs)?; + + Ok(()) + } + + /// Delete each given task from all the databases (if it is deleteable). + /// + /// Return the number of tasks that were actually deleted. + fn delete_matched_tasks( + &self, + wtxn: &mut RwTxn, + matched_tasks: &RoaringBitmap, + progress: &Progress, + ) -> Result { + progress.update_progress(TaskDeletionProgress::DeletingTasksDateTime); + + // 1. Remove from this list the tasks that we are not allowed to delete + let enqueued_tasks = self.queue.tasks.get_status(wtxn, Status::Enqueued)?; + let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone(); + + let all_task_ids = self.queue.tasks.all_task_ids(wtxn)?; + let mut to_delete_tasks = all_task_ids & matched_tasks; + to_delete_tasks -= &**processing_tasks; + to_delete_tasks -= &enqueued_tasks; + + // 2. We now have a list of tasks to delete, delete them + let mut affected_indexes = HashSet::new(); + let mut affected_statuses = HashSet::new(); + let mut affected_kinds = HashSet::new(); + let mut affected_canceled_by = RoaringBitmap::new(); + // The tasks that have been removed *per batches*. + let mut affected_batches: HashMap = HashMap::new(); + + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); + for task_id in to_delete_tasks.iter() { + let task = + self.queue.tasks.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + + affected_indexes.extend(task.indexes().into_iter().map(|x| x.to_owned())); + affected_statuses.insert(task.status); + affected_kinds.insert(task.kind.as_kind()); + // Note: don't delete the persisted task data since + // we can only delete succeeded, failed, and canceled tasks. + // In each of those cases, the persisted data is supposed to + // have been deleted already. 
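+            // Scrub the task from the three datetime indexes. `enqueued_at` always has an
+            // entry; `started_at` and `finished_at` only exist once the task has actually run.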
+ utils::remove_task_datetime( + wtxn, + self.queue.tasks.enqueued_at, + task.enqueued_at, + task.uid, + )?; + if let Some(started_at) = task.started_at { + utils::remove_task_datetime( + wtxn, + self.queue.tasks.started_at, + started_at, + task.uid, + )?; + } + if let Some(finished_at) = task.finished_at { + utils::remove_task_datetime( + wtxn, + self.queue.tasks.finished_at, + finished_at, + task.uid, + )?; + } + if let Some(canceled_by) = task.canceled_by { + affected_canceled_by.insert(canceled_by); + } + if let Some(batch_uid) = task.batch_uid { + affected_batches.entry(batch_uid).or_default().insert(task_id); + } + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + + progress.update_progress(TaskDeletionProgress::DeletingTasksMetadata); + let (atomic_progress, task_progress) = AtomicTaskStep::new( + (affected_indexes.len() + affected_statuses.len() + affected_kinds.len()) as u32, + ); + progress.update_progress(task_progress); + for index in affected_indexes.iter() { + self.queue.tasks.update_index(wtxn, index, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + + for status in affected_statuses.iter() { + self.queue.tasks.update_status(wtxn, *status, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + + for kind in affected_kinds.iter() { + self.queue.tasks.update_kind(wtxn, *kind, |bitmap| *bitmap -= &to_delete_tasks)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + + progress.update_progress(TaskDeletionProgress::DeletingTasks); + let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32); + progress.update_progress(task_progress); + for task in to_delete_tasks.iter() { + self.queue.tasks.all_tasks.delete(wtxn, &task)?; + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + for canceled_by in affected_canceled_by { + if let Some(mut tasks) = self.queue.tasks.canceled_by.get(wtxn, &canceled_by)? { + tasks -= &to_delete_tasks; + if tasks.is_empty() { + self.queue.tasks.canceled_by.delete(wtxn, &canceled_by)?; + } else { + self.queue.tasks.canceled_by.put(wtxn, &canceled_by, &tasks)?; + } + } + } + progress.update_progress(TaskDeletionProgress::DeletingBatches); + let (atomic_progress, batch_progress) = AtomicBatchStep::new(affected_batches.len() as u32); + progress.update_progress(batch_progress); + for (batch_id, to_delete_tasks) in affected_batches { + if let Some(mut tasks) = self.queue.batch_to_tasks_mapping.get(wtxn, &batch_id)? { + tasks -= &to_delete_tasks; + // We must remove the batch entirely + if tasks.is_empty() { + if let Some(batch) = self.queue.batches.get_batch(wtxn, batch_id)? 
{ + if let Some(BatchEnqueuedAt { earliest, oldest }) = batch.enqueued_at { + remove_task_datetime( + wtxn, + self.queue.batches.enqueued_at, + earliest, + batch_id, + )?; + remove_task_datetime( + wtxn, + self.queue.batches.enqueued_at, + oldest, + batch_id, + )?; + } else { + // If we don't have the enqueued at in the batch it means the database comes from the v1.12 + // and we still need to find the date by scrolling the database + remove_n_tasks_datetime_earlier_than( + wtxn, + self.queue.batches.enqueued_at, + batch.started_at, + batch.stats.total_nb_tasks.clamp(1, 2) as usize, + batch_id, + )?; + } + remove_task_datetime( + wtxn, + self.queue.batches.started_at, + batch.started_at, + batch_id, + )?; + if let Some(finished_at) = batch.finished_at { + remove_task_datetime( + wtxn, + self.queue.batches.finished_at, + finished_at, + batch_id, + )?; + } + + self.queue.batches.all_batches.delete(wtxn, &batch_id)?; + self.queue.batch_to_tasks_mapping.delete(wtxn, &batch_id)?; + } + } + + // Anyway, we must remove the batch from all its reverse indexes. + // The only way to do that is to check + + for index in affected_indexes.iter() { + let index_tasks = self.queue.tasks.index_tasks(wtxn, index)?; + let remaining_index_tasks = index_tasks & &tasks; + if remaining_index_tasks.is_empty() { + self.queue.batches.update_index(wtxn, index, |bitmap| { + bitmap.remove(batch_id); + })?; + } + } + + for status in affected_statuses.iter() { + let status_tasks = self.queue.tasks.get_status(wtxn, *status)?; + let remaining_status_tasks = status_tasks & &tasks; + if remaining_status_tasks.is_empty() { + self.queue.batches.update_status(wtxn, *status, |bitmap| { + bitmap.remove(batch_id); + })?; + } + } + + for kind in affected_kinds.iter() { + let kind_tasks = self.queue.tasks.get_kind(wtxn, *kind)?; + let remaining_kind_tasks = kind_tasks & &tasks; + if remaining_kind_tasks.is_empty() { + self.queue.batches.update_kind(wtxn, *kind, |bitmap| { + bitmap.remove(batch_id); + })?; + } + } + } + atomic_progress.fetch_add(1, Ordering::Relaxed); + } + + Ok(to_delete_tasks) + } + + /// Cancel each given task from all the databases (if it is cancelable). + /// + /// Returns the list of tasks that matched the filter and must be written in the database. + fn cancel_matched_tasks( + &self, + rtxn: &RoTxn, + cancel_task_id: TaskId, + current_batch: &mut ProcessingBatch, + matched_tasks: &RoaringBitmap, + progress: &Progress, + ) -> Result> { + progress.update_progress(TaskCancelationProgress::RetrievingTasks); + + // 1. Remove from this list the tasks that we are not allowed to cancel + // Notice that only the _enqueued_ ones are cancelable and we should + // have already aborted the indexation of the _processing_ ones + let cancelable_tasks = self.queue.tasks.get_status(rtxn, Status::Enqueued)?; + let tasks_to_cancel = cancelable_tasks & matched_tasks; + + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); + + // 2. 
We now have a list of tasks to cancel, cancel them + let mut tasks = self.queue.tasks.get_existing_tasks( + rtxn, + tasks_to_cancel.iter().inspect(|_| { + task_progress.fetch_add(1, Ordering::Relaxed); + }), + )?; + + progress.update_progress(TaskCancelationProgress::UpdatingTasks); + let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32); + progress.update_progress(progress_obj); + for task in tasks.iter_mut() { + task.status = Status::Canceled; + task.canceled_by = Some(cancel_task_id); + task.details = task.details.as_ref().map(|d| d.to_failed()); + current_batch.processing(Some(task)); + task_progress.fetch_add(1, Ordering::Relaxed); + } + + Ok(tasks) + } +} diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs new file mode 100644 index 000000000..a6d785b2f --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -0,0 +1,278 @@ +use std::collections::BTreeMap; +use std::fs::File; +use std::io::BufWriter; +use std::sync::atomic::Ordering; + +use dump::IndexMetadata; +use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; +use meilisearch_types::milli::{self}; +use meilisearch_types::tasks::{Details, KindWithContent, Status, Task}; +use time::macros::format_description; +use time::OffsetDateTime; + +use crate::processing::{ + AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, DumpCreationProgress, +}; +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + pub(super) fn process_dump_creation( + &self, + progress: Progress, + mut task: Task, + ) -> Result> { + progress.update_progress(DumpCreationProgress::StartTheDumpCreation); + let started_at = OffsetDateTime::now_utc(); + let (keys, instance_uid) = + if let KindWithContent::DumpCreation { keys, instance_uid } = &task.kind { + (keys, instance_uid) + } else { + unreachable!(); + }; + let dump = dump::DumpWriter::new(*instance_uid)?; + + // 1. dump the keys + progress.update_progress(DumpCreationProgress::DumpTheApiKeys); + let mut dump_keys = dump.create_keys()?; + for key in keys { + dump_keys.push_key(key)?; + } + dump_keys.flush()?; + + let rtxn = self.env.read_txn()?; + + // 2. dump the tasks + progress.update_progress(DumpCreationProgress::DumpTheTasks); + let mut dump_tasks = dump.create_tasks_queue()?; + + let (atomic, update_task_progress) = + AtomicTaskStep::new(self.queue.tasks.all_tasks.len(&rtxn)? as u32); + progress.update_progress(update_task_progress); + + for ret in self.queue.tasks.all_tasks.iter(&rtxn)? { + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + let (_, mut t) = ret?; + let status = t.status; + let content_file = t.content_uuid(); + + // In the case we're dumping ourselves we want to be marked as finished + // to not loop over ourselves indefinitely. + if t.uid == task.uid { + let finished_at = OffsetDateTime::now_utc(); + + // We're going to fake the date because we don't know if everything is going to go well. + // But we need to dump the task as finished and successful. + // If something fail everything will be set appropriately in the end. 
+ t.status = Status::Succeeded; + t.started_at = Some(started_at); + t.finished_at = Some(finished_at); + } + + // Patch the task to remove the batch uid, because as of v1.12.5 batches are not persisted. + // This prevent from referencing *future* batches not actually associated with the task. + // + // See for details. + t.batch_uid = None; + + let mut dump_content_file = dump_tasks.push_task(&t.into())?; + + // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. + if let Some(content_file) = content_file { + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + if status == Status::Enqueued { + let content_file = self.queue.file_store.get_update(content_file)?; + + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() + { + let document = document.map_err(|e| { + Error::from_milli(milli::InternalError::SerdeJson(e).into(), None) + })?; + dump_content_file.push_document(&document)?; + } + + dump_content_file.flush()?; + } + } + atomic.fetch_add(1, Ordering::Relaxed); + } + dump_tasks.flush()?; + + // 3. dump the batches + progress.update_progress(DumpCreationProgress::DumpTheBatches); + let mut dump_batches = dump.create_batches_queue()?; + + let (atomic_batch_progress, update_batch_progress) = + AtomicBatchStep::new(self.queue.batches.all_batches.len(&rtxn)? as u32); + progress.update_progress(update_batch_progress); + + for ret in self.queue.batches.all_batches.iter(&rtxn)? { + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + let (_, mut b) = ret?; + // In the case we're dumping ourselves we want to be marked as finished + // to not loop over ourselves indefinitely. + if b.uid == task.uid { + let finished_at = OffsetDateTime::now_utc(); + + // We're going to fake the date because we don't know if everything is going to go well. + // But we need to dump the task as finished and successful. + // If something fail everything will be set appropriately in the end. + let mut statuses = BTreeMap::new(); + statuses.insert(Status::Succeeded, b.stats.total_nb_tasks); + b.stats.status = statuses; + b.finished_at = Some(finished_at); + } + + dump_batches.push_batch(&b)?; + atomic_batch_progress.fetch_add(1, Ordering::Relaxed); + } + dump_batches.flush()?; + + // 4. Dump the indexes + progress.update_progress(DumpCreationProgress::DumpTheIndexes); + let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; + let mut count = 0; + let () = self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> { + progress.update_progress(VariableNameStep::::new( + uid.to_string(), + count, + nb_indexes, + )); + count += 1; + + let rtxn = index.read_txn()?; + let metadata = IndexMetadata { + uid: uid.to_owned(), + primary_key: index.primary_key(&rtxn)?.map(String::from), + created_at: index + .created_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + updated_at: index + .updated_at(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?, + }; + let mut index_dumper = dump.create_index(uid, &metadata)?; + + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index + .embedding_configs(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + let nb_documents = index + .number_of_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? 
+ as u32; + let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents); + progress.update_progress(update_document_progress); + let documents = index + .all_documents(&rtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // 4.1. Dump the documents + for ret in documents { + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + let (id, doc) = ret.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + 'inject_vectors: { + let embeddings = index + .embeddings(&rtxn, id) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME.to_owned()) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + let user_err = + milli::Error::UserError(milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&rtxn, std::iter::once(id)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={id}") + } + }, + value: vectors.clone(), + }); + + return Err(Error::from_milli(user_err, Some(uid.to_string()))); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(id)); + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + + index_dumper.push_document(&document)?; + atomic.fetch_add(1, Ordering::Relaxed); + } + + // 4.2. Dump the settings + let settings = meilisearch_types::settings::settings( + index, + &rtxn, + meilisearch_types::settings::SecretPolicy::RevealSecrets, + ) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + index_dumper.settings(&settings)?; + Ok(()) + })?; + + // 5. Dump experimental feature settings + progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); + let features = self.features().runtime_features(); + dump.create_experimental_features(features)?; + let network = self.network(); + dump.create_network(network)?; + + let dump_uid = started_at.format(format_description!( + "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]" + )).unwrap(); + + if self.scheduler.must_stop_processing.get() { + return Err(Error::AbortedTask); + } + progress.update_progress(DumpCreationProgress::CompressTheDump); + let path = self.scheduler.dumps_path.join(format!("{}.dump", dump_uid)); + let file = File::create(path)?; + dump.persist_to(BufWriter::new(file))?; + + // if we reached this step we can tell the scheduler we succeeded to dump ourselves. 
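+        // The dump uid (the formatted `started_at` timestamp) doubles as the file name,
+        // `{dump_uid}.dump` under `dumps_path`, and is echoed in the task details below.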
+ task.status = Status::Succeeded; + task.details = Some(Details::Dump { dump_uid: Some(dump_uid) }); + Ok(vec![task]) + } +} diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs new file mode 100644 index 000000000..690fe2efd --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -0,0 +1,539 @@ +use bumpalo::collections::CollectIn; +use bumpalo::Bump; +use meilisearch_types::heed::RwTxn; +use meilisearch_types::milli::documents::PrimaryKey; +use meilisearch_types::milli::progress::Progress; +use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction}; +use meilisearch_types::milli::update::DocumentAdditionResult; +use meilisearch_types::milli::{self, ChannelCongestion, Filter, ThreadPoolNoAbortBuilder}; +use meilisearch_types::settings::apply_settings_to_builder; +use meilisearch_types::tasks::{Details, KindWithContent, Status, Task}; +use meilisearch_types::Index; +use roaring::RoaringBitmap; + +use super::create_batch::{DocumentOperation, IndexOperation}; +use crate::processing::{ + DocumentDeletionProgress, DocumentEditionProgress, DocumentOperationProgress, SettingsProgress, +}; +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + /// Process the index operation on the given index. + /// + /// ## Return + /// The list of processed tasks. + #[tracing::instrument( + level = "trace", + skip(self, index_wtxn, index, progress), + target = "indexing::scheduler" + )] + pub(crate) fn apply_index_operation<'i>( + &self, + index_wtxn: &mut RwTxn<'i>, + index: &'i Index, + operation: IndexOperation, + progress: Progress, + ) -> Result<(Vec, Option)> { + let indexer_alloc = Bump::new(); + let started_processing_at = std::time::Instant::now(); + let must_stop_processing = self.scheduler.must_stop_processing.clone(); + + match operation { + IndexOperation::DocumentClear { index_uid, mut tasks } => { + let count = milli::update::ClearDocuments::new(index_wtxn, index) + .execute() + .map_err(|e| Error::from_milli(e, Some(index_uid)))?; + + let mut first_clear_found = false; + for task in &mut tasks { + task.status = Status::Succeeded; + // The first document clear will effectively delete every documents + // in the database but the next ones will clear 0 documents. + task.details = match &task.kind { + KindWithContent::DocumentClear { .. } => { + let count = if first_clear_found { 0 } else { count }; + first_clear_found = true; + Some(Details::ClearAll { deleted_documents: Some(count) }) + } + otherwise => otherwise.default_details(), + }; + } + + Ok((tasks, None)) + } + IndexOperation::DocumentOperation { index_uid, primary_key, operations, mut tasks } => { + progress.update_progress(DocumentOperationProgress::RetrievingConfig); + // TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches. + // this is made difficult by the fact we're doing private clones of the index scheduler and sending it + // to a fresh thread. + let mut content_files = Vec::new(); + for operation in &operations { + match operation { + DocumentOperation::Replace(content_uuid) + | DocumentOperation::Update(content_uuid) => { + let content_file = self.queue.file_store.get_update(*content_uuid)?; + let mmap = unsafe { memmap2::Mmap::map(&content_file)? 
}; + content_files.push(mmap); + } + _ => (), + } + } + + let rtxn = index.read_txn()?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut content_files_iter = content_files.iter(); + let mut indexer = indexer::DocumentOperation::new(); + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; + for operation in operations { + match operation { + DocumentOperation::Replace(_content_uuid) => { + let mmap = content_files_iter.next().unwrap(); + indexer + .replace_documents(mmap) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + } + DocumentOperation::Update(_content_uuid) => { + let mmap = content_files_iter.next().unwrap(); + indexer + .update_documents(mmap) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + } + DocumentOperation::Delete(document_ids) => { + let document_ids: bumpalo::collections::vec::Vec<_> = document_ids + .iter() + .map(|s| &*indexer_alloc.alloc_str(s)) + .collect_in(&indexer_alloc); + indexer.delete_documents(document_ids.into_bump_slice()); + } + } + } + + let local_pool; + let indexer_config = self.index_mapper.indexer_config(); + let pool = match &indexer_config.thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); + &local_pool + } + }; + + progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges); + let (document_changes, operation_stats, primary_key) = indexer + .into_changes( + &indexer_alloc, + index, + &rtxn, + primary_key.as_deref(), + &mut new_fields_ids_map, + &|| must_stop_processing.get(), + progress.clone(), + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + + let mut candidates_count = 0; + for (stats, task) in operation_stats.into_iter().zip(&mut tasks) { + candidates_count += stats.document_count; + match stats.error { + Some(error) => { + task.status = Status::Failed; + task.error = Some(milli::Error::UserError(error).into()); + } + None => task.status = Status::Succeeded, + } + + task.details = match task.details { + Some(Details::DocumentAdditionOrUpdate { received_documents, .. }) => { + Some(Details::DocumentAdditionOrUpdate { + received_documents, + indexed_documents: Some(stats.document_count), + }) + } + Some(Details::DocumentDeletion { provided_ids, .. 
}) => { + Some(Details::DocumentDeletion { + provided_ids, + deleted_documents: Some(stats.document_count), + }) + } + _ => { + // In the case of a `documentAdditionOrUpdate` or `DocumentDeletion` + // the details MUST be set to either addition or deletion + unreachable!(); + } + } + } + + progress.update_progress(DocumentOperationProgress::Indexing); + let mut congestion = None; + if tasks.iter().any(|res| res.error.is_none()) { + congestion = Some( + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?, + ); + + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + } + + Ok((tasks, congestion)) + } + IndexOperation::DocumentEdition { index_uid, mut task } => { + progress.update_progress(DocumentEditionProgress::RetrievingConfig); + + let (filter, code) = if let KindWithContent::DocumentEdition { + filter_expr, + context: _, + function, + .. + } = &task.kind + { + (filter_expr, function) + } else { + unreachable!() + }; + + let candidates = match filter.as_ref().map(Filter::from_json) { + Some(Ok(Some(filter))) => filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + None | Some(Ok(None)) => index.documents_ids(index_wtxn)?, + Some(Err(e)) => return Err(Error::from_milli(e, Some(index_uid.clone()))), + }; + + let (original_filter, context, function) = if let Some(Details::DocumentEdition { + original_filter, + context, + function, + .. 
+ }) = task.details + { + (original_filter, context, function) + } else { + // In the case of a `documentEdition` the details MUST be set + unreachable!(); + }; + + if candidates.is_empty() { + task.status = Status::Succeeded; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(0), + edited_documents: Some(0), + }); + + return Ok((vec![task], None)); + } + + let rtxn = index.read_txn()?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + // candidates not empty => index not empty => a primary key is set + let primary_key = index.primary_key(&rtxn)?.unwrap(); + + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; + + let result_count = Ok((candidates.len(), candidates.len())) as Result<_>; + + let mut congestion = None; + if task.error.is_none() { + let local_pool; + let indexer_config = self.index_mapper.indexer_config(); + let pool = match &indexer_config.thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); + &local_pool + } + }; + + let candidates_count = candidates.len(); + progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges); + let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = pool + .install(|| { + indexer + .into_changes(&primary_key) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))) + }) + .unwrap()?; + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; + + progress.update_progress(DocumentEditionProgress::Indexing); + congestion = Some( + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // cannot change primary key in DocumentEdition + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + ); + + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + } + + match result_count { + Ok((deleted_documents, edited_documents)) => { + task.status = Status::Succeeded; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(deleted_documents), + edited_documents: Some(edited_documents), + }); + } + Err(e) => { + task.status = Status::Failed; + task.details = Some(Details::DocumentEdition { + original_filter, + context, + function, + deleted_documents: Some(0), + edited_documents: Some(0), + }); + task.error = Some(e.into()); + } + } + + Ok((vec![task], congestion)) + } + IndexOperation::DocumentDeletion { mut tasks, index_uid } => { + progress.update_progress(DocumentDeletionProgress::RetrievingConfig); + + let mut to_delete = RoaringBitmap::new(); + let external_documents_ids = index.external_documents_ids(); + + for task in tasks.iter_mut() { + let before = to_delete.len(); + task.status = 
Status::Succeeded; + + match &task.kind { + KindWithContent::DocumentDeletion { index_uid: _, documents_ids } => { + for id in documents_ids { + if let Some(id) = external_documents_ids.get(index_wtxn, id)? { + to_delete.insert(id); + } + } + let will_be_removed = to_delete.len() - before; + task.details = Some(Details::DocumentDeletion { + provided_ids: documents_ids.len(), + deleted_documents: Some(will_be_removed), + }); + } + KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } => { + let before = to_delete.len(); + let filter = match Filter::from_json(filter_expr) { + Ok(filter) => filter, + Err(err) => { + // theoretically, this should be caught by deserr before reaching the index-scheduler and cannot happen + task.status = Status::Failed; + task.error = Some( + Error::from_milli(err, Some(index_uid.clone())).into(), + ); + None + } + }; + if let Some(filter) = filter { + let candidates = filter + .evaluate(index_wtxn, index) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone()))); + match candidates { + Ok(candidates) => to_delete |= candidates, + Err(err) => { + task.status = Status::Failed; + task.error = Some(err.into()); + } + }; + } + let will_be_removed = to_delete.len() - before; + if let Some(Details::DocumentDeletionByFilter { + original_filter: _, + deleted_documents, + }) = &mut task.details + { + *deleted_documents = Some(will_be_removed); + } else { + // In the case of a `documentDeleteByFilter` the details MUST be set + unreachable!() + } + } + _ => unreachable!(), + } + } + + if to_delete.is_empty() { + return Ok((tasks, None)); + } + + let rtxn = index.read_txn()?; + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + // to_delete not empty => index not empty => primary key set + let primary_key = index.primary_key(&rtxn)?.unwrap(); + + let primary_key = + PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map) + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; + + let mut congestion = None; + if !tasks.iter().all(|res| res.error.is_some()) { + let local_pool; + let indexer_config = self.index_mapper.indexer_config(); + let pool = match &indexer_config.thread_pool { + Some(pool) => pool, + None => { + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); + &local_pool + } + }; + + progress.update_progress(DocumentDeletionProgress::DeleteDocuments); + let mut indexer = indexer::DocumentDeletion::new(); + let candidates_count = to_delete.len(); + indexer.delete_documents_by_docids(to_delete); + let document_changes = indexer.into_changes(&indexer_alloc, primary_key); + let embedders = index + .embedding_configs(index_wtxn) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + let embedders = self.embedders(index_uid.clone(), embedders)?; + + progress.update_progress(DocumentDeletionProgress::Indexing); + congestion = Some( + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // document deletion never changes primary key + &document_changes, + embedders, + &|| must_stop_processing.get(), + &progress, + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, + ); + + let addition = DocumentAdditionResult { + indexed_documents: candidates_count, + number_of_documents: index + .number_of_documents(index_wtxn) + .map_err(|err| Error::from_milli(err,
Some(index_uid.clone())))?, + }; + + tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); + } + + Ok((tasks, congestion)) + } + IndexOperation::Settings { index_uid, settings, mut tasks } => { + progress.update_progress(SettingsProgress::RetrievingAndMergingTheSettings); + let indexer_config = self.index_mapper.indexer_config(); + let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config); + + for (task, (_, settings)) in tasks.iter_mut().zip(settings) { + let checked_settings = settings.clone().check(); + task.details = Some(Details::SettingsUpdate { settings: Box::new(settings) }); + apply_settings_to_builder(&checked_settings, &mut builder); + + // We can apply the status right now and if an update fails later + // the whole batch will be marked as failed. + task.status = Status::Succeeded; + } + + progress.update_progress(SettingsProgress::ApplyTheSettings); + builder + .execute( + |indexing_step| tracing::debug!(update = ?indexing_step), + || must_stop_processing.get(), + ) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + + Ok((tasks, None)) + } + IndexOperation::DocumentClearAndSetting { + index_uid, + cleared_tasks, + settings, + settings_tasks, + } => { + let (mut import_tasks, _congestion) = self.apply_index_operation( + index_wtxn, + index, + IndexOperation::DocumentClear { + index_uid: index_uid.clone(), + tasks: cleared_tasks, + }, + progress.clone(), + )?; + + let (settings_tasks, _congestion) = self.apply_index_operation( + index_wtxn, + index, + IndexOperation::Settings { index_uid, settings, tasks: settings_tasks }, + progress, + )?; + + let mut tasks = settings_tasks; + tasks.append(&mut import_tasks); + Ok((tasks, None)) + } + } + } +} diff --git a/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs b/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs new file mode 100644 index 000000000..f1bafed01 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs @@ -0,0 +1,128 @@ +use std::ffi::OsStr; +use std::fs; +use std::sync::atomic::Ordering; + +use meilisearch_types::heed::CompactionOption; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::tasks::{Status, Task}; +use meilisearch_types::{compression, VERSION_FILE_NAME}; + +use crate::processing::{AtomicUpdateFileStep, SnapshotCreationProgress}; +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + pub(super) fn process_snapshot( + &self, + progress: Progress, + mut tasks: Vec<Task>, + ) -> Result<Vec<Task>> { + progress.update_progress(SnapshotCreationProgress::StartTheSnapshotCreation); + + fs::create_dir_all(&self.scheduler.snapshots_path)?; + let temp_snapshot_dir = tempfile::tempdir()?; + + // 1. Snapshot the version file. + let dst = temp_snapshot_dir.path().join(VERSION_FILE_NAME); + fs::copy(&self.scheduler.version_file_path, dst)?; + + // 2. Snapshot the index-scheduler LMDB env + // + // When we call copy_to_path, LMDB opens a read transaction by itself; + // we can't provide our own. This is an issue because we would like to know + // which update files to copy, but new ones can be enqueued between the copy + // of the env and the new transaction we open to retrieve the enqueued tasks. + // So we prefer opening a new transaction after copying the env and copying more + // update files than strictly needed rather than too few.
+ // + // Note that there cannot be any update files deleted between those + // two read operations as the task processing is synchronous. + + // 2.1 First copy the LMDB env of the index-scheduler + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler); + let dst = temp_snapshot_dir.path().join("tasks"); + fs::create_dir_all(&dst)?; + self.env.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?; + + // 2.2 Create a read transaction on the index-scheduler + let rtxn = self.env.read_txn()?; + + // 2.3 Create the update files directory + let update_files_dir = temp_snapshot_dir.path().join("update_files"); + fs::create_dir_all(&update_files_dir)?; + + // 2.4 Only copy the update files of the enqueued tasks + progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles); + let enqueued = self.queue.tasks.get_status(&rtxn, Status::Enqueued)?; + let (atomic, update_file_progress) = AtomicUpdateFileStep::new(enqueued.len() as u32); + progress.update_progress(update_file_progress); + for task_id in enqueued { + let task = + self.queue.tasks.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; + if let Some(content_uuid) = task.content_uuid() { + let src = self.queue.file_store.get_update_path(content_uuid); + let dst = update_files_dir.join(content_uuid.to_string()); + fs::copy(src, dst)?; + } + atomic.fetch_add(1, Ordering::Relaxed); + } + + // 3. Snapshot every index + progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes); + let index_mapping = self.index_mapper.index_mapping; + let nb_indexes = index_mapping.len(&rtxn)? as u32; + + for (i, result) in index_mapping.iter(&rtxn)?.enumerate() { + let (name, uuid) = result?; + progress.update_progress(VariableNameStep::::new( + name, i as u32, nb_indexes, + )); + let index = self.index_mapper.index(&rtxn, name)?; + let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string()); + fs::create_dir_all(&dst)?; + index + .copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled) + .map_err(|e| Error::from_milli(e, Some(name.to_string())))?; + } + + drop(rtxn); + + // 4. Snapshot the auth LMDB env + progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys); + let dst = temp_snapshot_dir.path().join("auth"); + fs::create_dir_all(&dst)?; + self.scheduler.auth_env.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?; + + // 5.
Copy and tarball the flat snapshot + progress.update_progress(SnapshotCreationProgress::CreateTheTarball); + // 5.1 Find the original name of the database + // TODO find a better way to get this path + let mut base_path = self.env.path().to_owned(); + base_path.pop(); + let db_name = base_path.file_name().and_then(OsStr::to_str).unwrap_or("data.ms"); + + // 5.2 Tarball the content of the snapshot in a tempfile with a .snapshot extension + let snapshot_path = self.scheduler.snapshots_path.join(format!("{}.snapshot", db_name)); + let temp_snapshot_file = tempfile::NamedTempFile::new_in(&self.scheduler.snapshots_path)?; + compression::to_tar_gz(temp_snapshot_dir.path(), temp_snapshot_file.path())?; + let file = temp_snapshot_file.persist(snapshot_path)?; + + // 5.3 Change the permissions to make the snapshot readonly + let mut permissions = file.metadata()?.permissions(); + permissions.set_readonly(true); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + #[allow(clippy::non_octal_unix_permissions)] + // rwxrwxrwx + permissions.set_mode(0b100100100); + } + + file.set_permissions(permissions)?; + + for task in &mut tasks { + task.status = Status::Succeeded; + } + + Ok(tasks) + } +} diff --git a/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs b/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs new file mode 100644 index 000000000..4feebabc4 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs @@ -0,0 +1,49 @@ +use meilisearch_types::milli; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; + +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + pub(super) fn process_upgrade( + &self, + db_version: (u32, u32, u32), + progress: Progress, + ) -> Result<()> { + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::ProcessUpgrade)?; + + enum UpgradeIndex {} + let indexes = self.index_names()?; + + for (i, uid) in indexes.iter().enumerate() { + progress.update_progress(VariableNameStep::<UpgradeIndex>::new( + format!("Upgrading index `{uid}`"), + i as u32, + indexes.len() as u32, + )); + let index = self.index(uid)?; + let mut index_wtxn = index.write_txn()?; + let regen_stats = milli::update::upgrade::upgrade( + &mut index_wtxn, + &index, + db_version, + progress.clone(), + ) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + if regen_stats { + let stats = crate::index_mapper::IndexStats::new(&index, &index_wtxn) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + index_wtxn.commit()?; + + // Release wtxn as soon as possible because it stops us from registering tasks + let mut index_schd_wtxn = self.env.write_txn()?; + self.index_mapper.store_stats_of(&mut index_schd_wtxn, uid, &stats)?; + index_schd_wtxn.commit()?; + } else { + index_wtxn.commit()?; + } + } + + Ok(()) + } +} diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-2.snap similarity index 77% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-2.snap index 2b76f46a6..01a8429c4 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-2.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-2.snap @@ -1,6 +1,7 @@ --- -source:
index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs expression: task.details +snapshot_kind: text --- { "embedders": { diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-5.snap similarity index 78% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-5.snap index 061de75a5..7b576aa24 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-5.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs expression: config.embedder_options +snapshot_kind: text --- { "Rest": { diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update.snap similarity index 77% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update.snap index 2b76f46a6..01a8429c4 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__settings_update.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs expression: task.details +snapshot_kind: text --- { "embedders": { diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-15.snap similarity index 63% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-15.snap index 540835dfb..ece33e3b4 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-15.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: doc +snapshot_kind: text --- { "doggo": "Intel", diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-2.snap similarity index 86% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-2.snap index 629ea87dc..025ea4a5e 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-2.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-2.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: task.details 
+snapshot_kind: text --- { "embedders": { diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-22.snap similarity index 62% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-22.snap index bc35d84f6..49c5403d4 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-22.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: doc +snapshot_kind: text --- { "doggo": "kefir", diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-5.snap similarity index 76% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-5.snap index c08aa8116..14fcb3ee9 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-5.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: fakerest_config.embedder_options +snapshot_kind: text --- { "Rest": { diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap similarity index 63% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap index 712a62c77..19b5cab92 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap @@ -1,11 +1,12 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: simple_hf_config.embedder_options --- { "HuggingFace": { "model": "sentence-transformers/all-MiniLM-L6-v2", "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", - "distribution": null + "distribution": null, + "pooling": "useModel" } } diff --git a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors.snap similarity index 86% rename from crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap rename to crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors.snap index 629ea87dc..025ea4a5e 100644 --- a/crates/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors.snap @@ -1,6 +1,7 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs expression: task.details +snapshot_kind: text --- { "embedders": { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/cancel_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/cancel_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/cancel_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/cancel_processed.snap index f0c382d86..e3da7bd06 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/cancel_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/cancel_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/initial_tasks_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/initial_tasks_enqueued.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/initial_tasks_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/initial_tasks_enqueued.snap index b895bbc7c..51fb88025 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_enqueued_task/initial_tasks_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_enqueued_task/initial_tasks_enqueued.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/aborted_indexation.snap similarity index 93% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/aborted_indexation.snap index 9710c4911..408980c05 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/aborted_indexation.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 
00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/cancel_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/cancel_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/cancel_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/cancel_processed.snap index 444b171dd..2a9de78ab 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/cancel_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/cancel_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/first_task_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/first_task_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/first_task_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/first_task_processed.snap index 17265263c..e85755e98 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/first_task_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/first_task_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap similarity index 93% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap index e70aa0850..957df00ea 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(1): [1,] -{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, } +{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/after_dump_register.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/after_dump_register.snap index 8821af805..a7c5e5c09 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/after_dump_register.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_processed.snap index dbae3a082..426a649c8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_registered.snap similarity index 94% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_registered.snap index 55c7b3ed2..fe128e3d3 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_dump/cancel_registered.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"dumpCreation":1},"indexUids":{}}, } +{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"dumpCreation":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/aborted_indexation.snap similarity index 91% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/aborted_indexation.snap index 91b4deb22..26ded73d7 100644 --- 
a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/aborted_indexation.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_processed.snap index ef6845b05..9a0d8cc88 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_task_registered.snap similarity index 91% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_task_registered.snap index 89e8c8c6f..6ac809600 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/cancel_task_registered.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: 
ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/initial_task_processing.snap similarity index 90% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/initial_task_processing.snap index 12e1b1283..72f0c39eb 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/initial_task_processing.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/registered_the_first_task.snap index 087257e18..63919487a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_processing_task/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/cancel_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/cancel_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/cancel_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/cancel_processed.snap index de94da936..56dea5b08 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/cancel_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/cancel_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/initial_task_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/initial_task_processed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/initial_task_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/initial_task_processed.snap index 78b62979b..9d83368b1 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/initial_task_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/initial_task_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/registered_the_first_task.snap index 087257e18..63919487a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_succeeded_task/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/cancel_succeeded_task/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap index 3fe1a7d01..58b2a831d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/do_not_batch_task_of_different_indexes/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/before_index_creation.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/before_index_creation.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/before_index_creation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/before_index_creation.snap index 0234a5057..6dc95e1d1 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/before_index_creation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/before_index_creation.snap @@ -1,5 +1,5 @@ --- -source: 
crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/both_task_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/both_task_succeeded.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/both_task_succeeded.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/both_task_succeeded.snap index 8203e81f4..7de83b538 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/both_task_succeeded.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/both_task_succeeded.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_first_task.snap index 230b5e195..92f24508c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_second_task.snap index 9b22afff0..5f25c2964 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_third_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_third_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_third_task.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_third_task.snap index 914660746..0006ee8c0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion/registered_the_third_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion/registered_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap index c252d35c9..d7e2f3b07 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/1.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap index 830afd854..83604f393 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/document_addition_and_index_deletion_on_unexisting_index/2.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap similarity index 90% rename from crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap index aafef2fce..11ec76348 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: 
{"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap index e3627bbd3..cf2b2b691 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap similarity index 91% rename from crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap index 86fea2386..e1e36fffc 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap similarity index 92% rename from crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap index ea910f491..0b16ffadd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, } +{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap index 8d499b59c..4e5651deb 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap index 423dfb37c..5b829d27e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap similarity index 98% rename from 
crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap index e5878246d..d4113041a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/processed_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap index 230b5e195..92f24508c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap index a0148db63..21a6a59f7 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap index bee90a73b..adf9a76fe 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_inserted_without_new_signal/registered_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/first.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/first.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/first.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/first.snap index ac18e924d..809273a20 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/first.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/first.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/fourth.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/fourth.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/fourth.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/fourth.snap index 06e63e00e..a871c2baa 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/fourth.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/fourth.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_first_task.snap index 33cea7854..5c8082f72 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap index ebd130966..a22004697 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_fourth_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_second_task.snap index c53aec0c9..635491dc1 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_third_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_third_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_third_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_third_task.snap index 7679999ce..1d190baca 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_third_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/registered_the_third_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/second.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/second.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/second.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/second.snap index 632b7a54a..208aa100b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/second.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/second.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/third.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/third.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/third.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/third.snap index 3a2963654..8977e4cf0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/third.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/process_tasks_without_autobatching/third.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_a.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_a.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_a.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_a.snap index b1c6fde36..dc73ddb0d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_a.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_a.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_b.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_b.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_b.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_b.snap index 065023214..25827aa96 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_b.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_b.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_c.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_c.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_c.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_c.snap index 03b09b928..7b1ad4b9b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_c.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_c.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_d.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_d.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_d.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_d.snap index 08ecfddc2..aa4b71d67 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/create_d.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/create_d.snap @@ -1,5 +1,5 @@ --- -source: 
crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_processed.snap index bca858559..68934003d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_registered.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_registered.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_registered.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_registered.snap index 234915267..2296dc9f2 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/first_swap_registered.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/first_swap_registered.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/second_swap_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/second_swap_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/second_swap_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/second_swap_processed.snap index 7b5ab6e4b..abc2f2954 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/second_swap_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/second_swap_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/third_empty_swap_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/third_empty_swap_processed.snap index 77b1193a5..f75caa10c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/third_empty_swap_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/third_empty_swap_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/two_swaps_registered.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/two_swaps_registered.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/two_swaps_registered.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/two_swaps_registered.snap index ccab86904..cb5fd822d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes/two_swaps_registered.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes/two_swaps_registered.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/after_the_index_creation.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/after_the_index_creation.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/after_the_index_creation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/after_the_index_creation.snap index 08ecfddc2..aa4b71d67 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/after_the_index_creation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/after_the_index_creation.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/first_swap_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/first_swap_failed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/first_swap_failed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/first_swap_failed.snap index e8e74d0e3..f7eb4e1e7 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/first_swap_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/first_swap_failed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/initial_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/initial_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/initial_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/initial_tasks_processed.snap index 08ecfddc2..aa4b71d67 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/swap_indexes_errors/initial_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/swap_indexes_errors/initial_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap similarity index 97% rename from 
crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap index 0d51e242c..2c33bd04a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap index f63c498a5..83c43339f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/initial_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap index 95d615b1e..89f87e29a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_delete_same_task_twice/task_deletion_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true @@ -56,16 +56,13 @@ succeeded [1,] ### Batches Index Tasks: ---------------------------------------------------------------------- ### Batches Enqueued At: -[timestamp] [0,] [timestamp] [1,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Started At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Finished At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### File Store: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap index 6402982ee..3c4b35d9f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/after_registering_the_task_deletion.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_enqueued.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_enqueued.snap index 0d51e242c..2c33bd04a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_enqueued.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_processed.snap index f63c498a5..83c43339f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_delete_same_task_twice/initial_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/initial_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/task_deletion_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/task_deletion_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap index 3f4ae56d8..135b272cd 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_deleteable/task_deletion_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_deleteable/task_deletion_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true @@ -54,15 +54,12 @@ succeeded [1,] ### Batches Index Tasks: ---------------------------------------------------------------------- ### Batches Enqueued At: -[timestamp] [0,] [timestamp] [1,] 
---------------------------------------------------------------------- ### Batches Started At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### Batches Finished At: -[timestamp] [0,] [timestamp] [1,] ---------------------------------------------------------------------- ### File Store: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap index aed0a818a..46cbaefc2 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/initial_tasks_enqueued.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_done.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_done.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_done.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_done.snap index ae910d44b..b35bcdf1b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_done.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_done.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_enqueued.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_enqueued.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_enqueued.snap index 746caa1de..a861fea12 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_enqueued.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_processing.snap similarity index 94% rename from crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_processing.snap index fce223c6c..b3500b8a5 100644 
--- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/task_deletion_undeleteable/task_deletion_processing.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [3,] -{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"taskDeletion":1},"indexUids":{}}, } +{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"taskDeletion":1},"indexUids":{}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap similarity index 56% rename from crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index 22900371e..d9d8b0724 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings 
{ source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap similarity index 62% rename from crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index dae9b38cd..ca8a3e137 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs 
-snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_task_is_processing/registered_a_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_task_is_processing/registered_a_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_task_is_processing/registered_a_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test.rs/test_task_is_processing/registered_a_task.snap index e3627bbd3..cf2b2b691 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_task_is_processing/registered_a_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_task_is_processing/registered_a_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_register.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_register.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_register.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_register.snap index d8a689669..42df87d17 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_register.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_register.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_the_batch_creation.snap similarity index 89% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_the_batch_creation.snap index f7eaa6df8..a3d3deade 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/after_the_batch_creation.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: 
crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/once_everything_is_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/once_everything_is_processed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition/once_everything_is_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/once_everything_is_processed.snap index 2357a404f..83e23e8b0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/once_everything_is_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition/once_everything_is_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/after_processing_the_batch.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/after_processing_the_batch.snap index 1fce684f5..2bc94c1f9 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/after_processing_the_batch.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/documents.snap new file mode 100644 index 000000000..bef7fca61 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/documents.snap @@ -0,0 +1,10 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 3, + 
"doggo": "bork" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_first_task.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_first_task.snap index b1337b287..c1629dc02 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_second_task.snap index 60e2d22be..c8c117b2a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_addition_and_document_deletion/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap index ee42e932a..825d74562 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_last_successful_addition.snap similarity index 
97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_last_successful_addition.snap index 0e9e47574..4ffdf8958 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/after_last_successful_addition.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/documents.snap similarity index 60% rename from crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/documents.snap index 8204d059b..3619a50b5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_first_task.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_first_task.snap index 2e96a4614..6faba461a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_second_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_second_task.snap index d4f8b47b9..257c66390 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/document_deletion_and_document_addition/registered_the_second_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap index 5efac0653..2c29d9da7 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_processing_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap index cdc1f98b7..ce66dc4d1 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/documents.snap similarity index 82% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/documents.snap index 5a839838d..bf6495c9c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/documents.snap @@ -1,5 +1,6 @@ 
--- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap index 24ace66bf..68f4b6701 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/processed_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap index 230b5e195..03d4e5b16 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap index f937c6805..9c3711061 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: 
crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap index 28a2a65a5..ed9d02ae2 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap similarity index 82% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap index 5a839838d..bf6495c9c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap index 519646fcb..343c1f77d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/five_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap index f842a275e..9aa284128 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/processed_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap index 33cea7854..6f0f9c782 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/process_tasks_without_autobatching/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_with_index_without_autobatching/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap index aa27500a7..d87a73a81 100644 --- 
a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_processing_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap index 8b463b588..2fbcc3dc6 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap index 537980795..e8f8d85d3 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap index e342fb2f3..a5e55b95f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap index 9531dd0bf..231352493 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_cant_create_index_without_index_without_autobatching/five_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap index 3cdee6f23..aa6956c5f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap index a3609fb1b..e4b176513 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap similarity index 81% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap index cbd8d175a..ae77cfa9d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap index d73d749f6..4b737c1e6 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_right_without_index_starts_with_cant_create/only_first_task_failed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap index 00c911dae..79c0071ff 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap index 99922b9a0..95466395e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/documents.snap similarity index 82% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/documents.snap index 5a839838d..bf6495c9c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap similarity index 97% rename from 
crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap index 24ace66bf..68f4b6701 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/processed_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap similarity index 96% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap index 230b5e195..03d4e5b16 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_cant_create_index_with_index/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_mixed_rights_with_index/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap index e8ee841ae..9878d5283 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/after_registering_the_5_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/documents.snap similarity index 53% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/documents.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/documents.snap index dd1bbf8b0..d8f74d472 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap index 1713c0ac2..bd87c1981 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fifth_task_succeeds.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap index 96e83ac9b..ac50daec7 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/first_and_second_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap index f54713081..7785e0cb0 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/fourth_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap index 0f24a6715..73c8fcf17 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_bad_primary_key/third_task_succeeds.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap index a3a481855..86f301ba8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/after_registering_the_3_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/documents.snap new file mode 100644 index 000000000..5022049f1 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/documents.snap @@ -0,0 +1,10 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "jean bob" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap similarity index 97% rename from 
crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap index f12ac555b..4d5028d60 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/only_first_task_succeed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap index b49d3ea64..4350f68ae 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/second_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap index 35783d84f..226a1d509 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key/third_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap index 3eb5c7a4d..c744a7f18 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/after_registering_the_3_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap new file mode 100644 index 000000000..5022049f1 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap @@ -0,0 +1,10 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "jean bob" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap index d01799fc2..86ab07c3c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/only_first_task_succeed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap index 2c6d29a18..889d65ff9 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/second_and_third_tasks_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap index 3306009aa..4111cb60e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/after_registering_the_6_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap index 6d3fabe77..7ef550fd8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/all_other_tasks_succeeds.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/documents.snap similarity index 68% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/documents.snap index a73c52da5..0a8f2e4e8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/documents.snap @@ -1,5 
+1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap index 5b304aa24..8bda924d3 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/first_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap index b5e113599..f153e2d44 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/second_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap index 0f3730932..cdd51b199 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key/third_task_succeeds.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap index 2876c7681..f18858451 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/after_registering_the_6_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap index 46408e0a7..2fb5363e8 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/all_other_tasks_succeeds.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap similarity index 74% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap index 9c79853fa..1ff106b5b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: 
crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap index 4acc5342b..6a7d3617a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/first_task_succeed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap index 828c28298..7a98a0e37 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/second_task_fails.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap index 7e9a02288..7603decab 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_addition_with_set_and_null_primary_key_inference_works/third_task_succeeds.snap @@ -1,5 +1,5 @@ --- -source: 
crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/1.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/1.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/1.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/1.snap index a5fbc024c..85b137b45 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/1.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/1.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/2.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/2.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/2.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/2.snap index d2cfc793b..8bd563e6e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace/2.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/2.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/documents.snap similarity index 82% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/documents.snap index 5a839838d..bf6495c9c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_mixed_rights_with_index/documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace/documents.snap @@ -1,5 +1,6 @@ --- -source: index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text --- [ { diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap index ba16d64f1..122136e08 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/all_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/all_tasks_processed.snap index 5c6c711a0..7ecc0e7a9 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/documents.snap new file mode 100644 index 000000000..bf6495c9c --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/documents.snap @@ -0,0 +1,46 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "bob 0" + }, + { + "id": 1, + "doggo": "bob 1" + }, + { + "id": 2, + "doggo": "bob 2" + }, + { + "id": 3, + "doggo": "bob 3" + }, + { + "id": 4, + "doggo": "bob 4" + }, + { + "id": 5, + "doggo": "bob 5" + }, + { + "id": 6, + "doggo": "bob 6" + }, + { + "id": 7, + "doggo": "bob 7" + }, + { + "id": 8, + "doggo": "bob 8" + }, + { + "id": 9, + "doggo": "bob 9" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/five_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/five_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/five_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/five_tasks_processed.snap index e03da1332..7b3a9db02 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/five_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_replace_without_autobatching/five_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/1.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/1.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_update/1.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/1.snap index 5444d0a3e..bb4fb66df 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/1.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/1.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/2.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/2.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_update/2.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/2.snap index 9e742df7b..0911eb631 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/2.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/2.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/documents.snap new file mode 100644 index 000000000..bf6495c9c --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update/documents.snap @@ -0,0 +1,46 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "bob 0" + }, + { + "id": 1, + "doggo": "bob 1" + }, + { + "id": 2, + "doggo": "bob 2" + }, + { + "id": 3, + "doggo": "bob 3" + }, + { + "id": 4, + "doggo": "bob 4" + }, + { + "id": 5, + "doggo": "bob 5" + }, + { + "id": 6, + "doggo": "bob 6" + }, + { + "id": 7, + "doggo": "bob 7" + }, + { + "id": 8, + "doggo": "bob 8" + }, + { + "id": 9, + "doggo": "bob 9" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap index 35368e4b3..916b44f96 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### 
Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/all_tasks_processed.snap similarity index 99% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/all_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/all_tasks_processed.snap index ef6e2d0e1..4b005f38e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/all_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/all_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/documents.snap new file mode 100644 index 000000000..bf6495c9c --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/documents.snap @@ -0,0 +1,46 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "bob 0" + }, + { + "id": 1, + "doggo": "bob 1" + }, + { + "id": 2, + "doggo": "bob 2" + }, + { + "id": 3, + "doggo": "bob 3" + }, + { + "id": 4, + "doggo": "bob 4" + }, + { + "id": 5, + "doggo": "bob 5" + }, + { + "id": 6, + "doggo": "bob 6" + }, + { + "id": 7, + "doggo": "bob 7" + }, + { + "id": 8, + "doggo": "bob 8" + }, + { + "id": 9, + "doggo": "bob 9" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/five_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/five_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/five_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/five_tasks_processed.snap index bfc2e9f42..bf73f9593 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/five_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_document_update_without_autobatching/five_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = false diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap rename to 
crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap index 773f43c2c..7f08b5d0d 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/after_registering_the_10_tasks.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/all_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/all_tasks_processed.snap new file mode 100644 index 000000000..cdab2097c --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/all_tasks_processed.snap @@ -0,0 +1,82 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} +1 {uid: 1, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} +2 {uid: 2, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000002, documents_count: 1, allow_index_creation: true }} +3 {uid: 3, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000003, documents_count: 1, allow_index_creation: true }} +4 {uid: 4, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000004, documents_count: 1, allow_index_creation: true }} +5 {uid: 5, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000005, documents_count: 1, allow_index_creation: true }} +6 {uid: 6, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000006, documents_count: 1, allow_index_creation: true }} +7 {uid: 
7, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000007, documents_count: 1, allow_index_creation: true }} +8 {uid: 8, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000008, documents_count: 1, allow_index_creation: true }} +9 {uid: 9, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000009, documents_count: 1, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### Kind: +"documentAdditionOrUpdate" [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### Index Mapper: +doggos: { number_of_documents: 10, field_distribution: {"doggo": 10, "id": 10} } + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +[timestamp] [3,] +[timestamp] [4,] +[timestamp] [5,] +[timestamp] [6,] +[timestamp] [7,] +[timestamp] [8,] +[timestamp] [9,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### All Batches: +0 {uid: 0, details: {"receivedDocuments":10,"indexedDocuments":10}, stats: {"totalNbTasks":10,"status":{"succeeded":10},"types":{"documentAdditionOrUpdate":10},"indexUids":{"doggos":10}}, } +---------------------------------------------------------------------- +### Batch to tasks mapping: +0 [0,1,2,3,4,5,6,7,8,9,] +---------------------------------------------------------------------- +### Batches Status: +succeeded [0,] +---------------------------------------------------------------------- +### Batches Kind: +"documentAdditionOrUpdate" [0,] +---------------------------------------------------------------------- +### Batches Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Batches Enqueued At: +[timestamp] [0,] +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/documents.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/documents.snap new file mode 100644 index 000000000..bf6495c9c --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/documents.snap @@ -0,0 +1,46 @@ +--- +source: crates/index-scheduler/src/scheduler/test_document_addition.rs +snapshot_kind: text +--- +[ + { + "id": 0, + "doggo": "bob 0" + }, + { + "id": 1, + "doggo": "bob 1" + }, + { + "id": 2, + "doggo": "bob 2" + }, + { + "id": 3, + "doggo": "bob 3" + }, + { + "id": 4, + "doggo": "bob 4" + }, + { + "id": 5, + "doggo": "bob 5" + }, + { + "id": 6, + "doggo": "bob 6" + }, + { + "id": 7, + "doggo": "bob 7" + }, + { + "id": 8, + "doggo": "bob 8" + }, + { + "id": 9, + "doggo": "bob 9" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/five_tasks_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/five_tasks_processed.snap similarity index 98% rename from crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/five_tasks_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/five_tasks_processed.snap index 8aba4bd5c..ff492e75e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/five_tasks_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_document_addition.rs/test_mixed_document_addition/five_tasks_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_document_addition.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap similarity index 63% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index 11995b0bd..74cdb9bc1 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_embedders.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: 
Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, 
distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap similarity index 61% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 9c028d141..16858361e 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_embedders.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), 
searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: 
Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap similarity index 59% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index 
5c83f6cac..8daa10244 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -1,13 +1,12 @@ --- -source: crates/crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_embedders.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: 
WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after 
adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap similarity index 57% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index c8f174c74..87a9ec11c 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_embedders.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: 
NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { 
index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap new file mode 100644 index 000000000..35bd9dee9 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -0,0 +1,51 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), 
model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +---------------------------------------------------------------------- +### Status: +enqueued [0,] +---------------------------------------------------------------------- +### Kind: +"settingsUpdate" [0,] +---------------------------------------------------------------------- +### Index Tasks: +doggos [0,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### All Batches: +---------------------------------------------------------------------- +### Batch to tasks mapping: +---------------------------------------------------------------------- +### Batches Status: +---------------------------------------------------------------------- +### Batches Kind: +---------------------------------------------------------------------- +### Batches Index Tasks: +---------------------------------------------------------------------- +### Batches Enqueued At: +---------------------------------------------------------------------- +### Batches Started At: +---------------------------------------------------------------------- +### Batches Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap similarity index 54% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index 24d5fff27..40e8f63e9 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_embedders.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: 
WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors_first_and_embedder_later/documents after initial push.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors_first_and_embedder_later/documents after initial push.snap index e06d09464..3eaf58e17 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -1,4 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_embedders.rs +snapshot_kind: text --- 
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"regenerate":false,"embeddings":[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]},"unknown embedder":[4,5]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"regenerate":true,"embeddings":[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"regenerate":true,"embeddings":null}}}] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap similarity index 89% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap index f7eaa6df8..fcbaaace3 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap index 875ae06c6..5b38f28b5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/document_addition_failed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap index d8a689669..c5b21621f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap similarity index 70% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap index 8d175e388..ebacb5415 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_failure.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: 
PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap similarity index 68% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap index d1de7ec61..0fc0d7fb5 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_failure.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: 
"doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap similarity index 65% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 114df2852..d06a4e78a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -1,17 +1,16 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_failure.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} -3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} -4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} +3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} +4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. 
Available filterable attribute patterns are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }} 5 {uid: 5, batch_uid: 2, status: succeeded, details: { original_filter: "catto EXISTS", deleted_documents: Some(1) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("catto EXISTS") }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap new file mode 100644 index 000000000..18071608b --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap @@ -0,0 +1,10 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +snapshot_kind: text +--- +[ + { + "id": 3, + "doggo": "bork" + } +] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap similarity index 75% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap index b2b368be4..8b010498f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_failure.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, 
distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_document_ids: 1, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, status: enqueued, details: { original_filter: true, deleted_documents: None }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap similarity index 63% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap index 9e1995fee..0ba3ef598 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap @@ -1,13 +1,12 @@ --- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text +source: crates/index-scheduler/src/scheduler/test_failure.rs --- ### Autobatching Enabled = true ### Processing batch None: [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, 
status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/after_register.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/after_register.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/after_register.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/after_register.snap index 5129662eb..4ece15b13 100644 --- 
a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/after_register.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/after_register.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap index b24d0be1e..24589fc66 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_index_creation/index_creation_failed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap similarity index 90% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap index 0091af65b..f698eff0a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git 
a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap similarity index 90% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap index 0091af65b..f698eff0a 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap @@ -1,11 +1,11 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true ### Processing batch Some(0): [0,] -{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, } +{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ---------------------------------------------------------------------- ### All Tasks: 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap index d8a689669..c5b21621f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/document_addition_succeeded_but_index_scheduler_not_updated.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap index d8a689669..c5b21621f 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap index 2357a404f..1a678e46b 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_update_task_after_process_batch_success_for_document_addition/task_successfully_processed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap similarity index 87% rename from crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap index c776baab7..b0c450092 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/index_creation_failed.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true @@ -7,7 +7,7 @@ snapshot_kind: text [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "An unexpected crash occurred when processing the task.", error_code: "internal", error_type: 
"internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "An unexpected crash occurred when processing the task: simulated panic", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap similarity index 97% rename from crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap rename to crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap index 5129662eb..4ece15b13 100644 --- a/crates/index-scheduler/src/snapshots/lib.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/panic_in_process_batch_for_index_creation/registered_the_first_task.snap @@ -1,5 +1,5 @@ --- -source: crates/index-scheduler/src/lib.rs +source: crates/index-scheduler/src/scheduler/test_failure.rs snapshot_kind: text --- ### Autobatching Enabled = true diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap new file mode 100644 index 000000000..f98e763e1 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap @@ -0,0 +1,109 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} +3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} +4 {uid: 4, batch_uid: 4, status: succeeded, details: { primary_key: Some("leaves") }, kind: IndexCreation { index_uid: "girafo", primary_key: Some("leaves") }} +---------------------------------------------------------------------- +### Status: +enqueued [] +succeeded [0,1,2,4,] +failed [3,] 
+----------------------------------------------------------------------
+### Kind:
+"indexCreation" [1,2,3,4,]
+"upgradeDatabase" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+catto [1,]
+doggo [2,3,]
+girafo [4,]
+----------------------------------------------------------------------
+### Index Mapper:
+catto: { number_of_documents: 0, field_distribution: {} }
+doggo: { number_of_documents: 0, field_distribution: {} }
+girafo: { number_of_documents: 0, field_distribution: {} }
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### All Batches:
+0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.14.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
+1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, }
+2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
+3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
+4 {uid: 4, details: {"primaryKey":"leaves"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"girafo":1}}, }
+----------------------------------------------------------------------
+### Batch to tasks mapping:
+0 [0,]
+1 [1,]
+2 [2,]
+3 [3,]
+4 [4,]
+----------------------------------------------------------------------
+### Batches Status:
+succeeded [0,1,2,4,]
+failed [3,]
+----------------------------------------------------------------------
+### Batches Kind:
+"indexCreation" [1,2,3,4,]
+"upgradeDatabase" [0,]
+----------------------------------------------------------------------
+### Batches Index Tasks:
+catto [1,]
+doggo [2,3,]
+girafo [4,]
+----------------------------------------------------------------------
+### Batches Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### Batches Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### Batches Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap
new file mode 100644
index 000000000..4c828b71d
--- /dev/null
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_removing_the_upgrade_tasks.snap
@@ -0,0 +1,112 @@
+---
+source: crates/index-scheduler/src/scheduler/test_failure.rs
+snapshot_kind: text
+---
+### Autobatching Enabled = true
+### Processing batch None:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
+2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
+3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
+4 {uid: 4, batch_uid: 4, status: succeeded, details: { primary_key: Some("leaves") }, kind: IndexCreation { index_uid: "girafo", primary_key: Some("leaves") }}
+5 {uid: 5, batch_uid: 5, status: succeeded, details: { matched_tasks: 1, deleted_tasks: Some(1), original_filter: "types=upgradeDatabase" }, kind: TaskDeletion { query: "types=upgradeDatabase", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,2,4,5,]
+failed [3,]
+----------------------------------------------------------------------
+### Kind:
+"indexCreation" [1,2,3,4,]
+"taskDeletion" [5,]
+"upgradeDatabase" []
+----------------------------------------------------------------------
+### Index Tasks:
+catto [1,]
+doggo [2,3,]
+girafo [4,]
+----------------------------------------------------------------------
+### Index Mapper:
+catto: { number_of_documents: 0, field_distribution: {} }
+doggo: { number_of_documents: 0, field_distribution: {} }
+girafo: { number_of_documents: 0, field_distribution: {} }
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### All Batches:
+1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, }
+2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
+3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
+4 {uid: 4, details: {"primaryKey":"leaves"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"girafo":1}}, }
+5 {uid: 5, details: {"matchedTasks":1,"deletedTasks":1,"originalFilter":"types=upgradeDatabase"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"taskDeletion":1},"indexUids":{}}, }
+----------------------------------------------------------------------
+### Batch to tasks mapping:
+1 [1,]
+2 [2,]
+3 [3,]
+4 [4,]
+5 [5,]
+----------------------------------------------------------------------
+### Batches Status:
+succeeded [1,2,4,5,]
+failed [3,]
+----------------------------------------------------------------------
+### Batches Kind:
+"indexCreation" [1,2,3,4,]
+"taskDeletion" [5,]
+"upgradeDatabase" []
+----------------------------------------------------------------------
+### Batches Index Tasks:
+catto [1,]
+doggo [2,3,]
+girafo [4,]
+----------------------------------------------------------------------
+### Batches Enqueued At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### Batches Started At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### Batches Finished At:
+[timestamp] [1,]
+[timestamp] [2,]
+[timestamp] [3,]
+[timestamp] [4,]
+[timestamp] [5,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap
new file mode 100644
index 000000000..6d6276067
--- /dev/null
+++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap
@@ -0,0 +1,50 @@
+---
+source: crates/index-scheduler/src/scheduler/test_failure.rs
+---
+### Autobatching Enabled = true
+### Processing batch None:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"upgradeDatabase" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### All Batches:
+----------------------------------------------------------------------
+### Batch to tasks mapping:
+----------------------------------------------------------------------
+### Batches Status:
+----------------------------------------------------------------------
+### Batches Kind:
+----------------------------------------------------------------------
+### Batches Index Tasks:
+---------------------------------------------------------------------- +### Batches Enqueued At: +---------------------------------------------------------------------- +### Batches Started At: +---------------------------------------------------------------------- +### Batches Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap new file mode 100644 index 000000000..7ef9c42e1 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap @@ -0,0 +1,54 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +---------------------------------------------------------------------- +### Status: +enqueued [0,1,] +---------------------------------------------------------------------- +### Kind: +"indexCreation" [1,] +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Index Tasks: +catto [1,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +---------------------------------------------------------------------- +### Finished At: +---------------------------------------------------------------------- +### All Batches: +---------------------------------------------------------------------- +### Batch to tasks mapping: +---------------------------------------------------------------------- +### Batches Status: +---------------------------------------------------------------------- +### Batches Kind: +---------------------------------------------------------------------- +### Batches Index Tasks: +---------------------------------------------------------------------- +### Batches Enqueued At: +---------------------------------------------------------------------- +### Batches Started At: +---------------------------------------------------------------------- +### Batches Finished At: +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap new file mode 100644 index 000000000..932124295 --- /dev/null +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap @@ -0,0 +1,64 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +---------------------------------------------------------------------- +### Status: +enqueued [1,] +failed [0,] +---------------------------------------------------------------------- +### Kind: +"indexCreation" [1,] +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Index Tasks: +catto [1,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### All Batches: +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.14.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +---------------------------------------------------------------------- +### Batch to tasks mapping: +0 [0,] +---------------------------------------------------------------------- +### Batches Status: +failed [0,] +---------------------------------------------------------------------- +### Batches Kind: +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Batches Index Tasks: +---------------------------------------------------------------------- +### Batches Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap new file mode 100644 index 000000000..d3832e1f1 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap @@ -0,0 +1,67 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, batch_uid: 0, 
status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} +2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} +---------------------------------------------------------------------- +### Status: +enqueued [1,2,] +failed [0,] +---------------------------------------------------------------------- +### Kind: +"indexCreation" [1,2,] +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Index Tasks: +catto [1,] +doggo [2,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### All Batches: +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.14.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +---------------------------------------------------------------------- +### Batch to tasks mapping: +0 [0,] +---------------------------------------------------------------------- +### Batches Status: +failed [0,] +---------------------------------------------------------------------- +### Batches Kind: +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Batches Index Tasks: +---------------------------------------------------------------------- +### Batches Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap new file mode 100644 index 000000000..a03e920e3 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap @@ -0,0 +1,71 @@ +--- +source: crates/index-scheduler/src/scheduler/test_failure.rs +--- +### Autobatching Enabled = true +### Processing batch None: +[] +---------------------------------------------------------------------- +### All Tasks: +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 14, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { 
index_uid: "catto", primary_key: Some("mouse") }} +2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} +3 {uid: 3, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} +---------------------------------------------------------------------- +### Status: +enqueued [1,2,3,] +succeeded [0,] +failed [] +---------------------------------------------------------------------- +### Kind: +"indexCreation" [1,2,3,] +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Index Tasks: +catto [1,] +doggo [2,3,] +---------------------------------------------------------------------- +### Index Mapper: + +---------------------------------------------------------------------- +### Canceled By: + +---------------------------------------------------------------------- +### Enqueued At: +[timestamp] [0,] +[timestamp] [1,] +[timestamp] [2,] +[timestamp] [3,] +---------------------------------------------------------------------- +### Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### All Batches: +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.14.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, } +---------------------------------------------------------------------- +### Batch to tasks mapping: +0 [0,] +---------------------------------------------------------------------- +### Batches Status: +succeeded [0,] +failed [] +---------------------------------------------------------------------- +### Batches Kind: +"upgradeDatabase" [0,] +---------------------------------------------------------------------- +### Batches Index Tasks: +---------------------------------------------------------------------- +### Batches Enqueued At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Started At: +[timestamp] [0,] +---------------------------------------------------------------------- +### Batches Finished At: +[timestamp] [0,] +---------------------------------------------------------------------- +### File Store: + +---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs new file mode 100644 index 000000000..84112de08 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -0,0 +1,931 @@ +use std::collections::BTreeMap; + +use big_s::S; +use meili_snap::{json_string, snapshot}; +use meilisearch_auth::AuthFilter; +use meilisearch_types::milli::index::IndexEmbeddingConfig; +use meilisearch_types::milli::update::IndexDocumentsMethod::*; +use meilisearch_types::milli::{self}; +use meilisearch_types::settings::SettingEmbeddingSettings; +use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use roaring::RoaringBitmap; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::test_utils::Breakpoint::*; +use crate::test_utils::{ + index_creation_task, read_json, replace_document_import_task, sample_documents, +}; +use crate::IndexScheduler; + +#[test] +fn insert_task_while_another_task_is_processing() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + 
index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + handle.advance_till([Start, BatchCreated]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_creation"); + + // while the task is processing can we register another task? + index_scheduler.register(index_creation_task("index_b", "id"), None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + index_scheduler + .register(KindWithContent::IndexDeletion { index_uid: S("index_a") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); +} + +#[test] +fn test_task_is_processing() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + index_scheduler.register(index_creation_task("index_a", "id"), None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task"); + + handle.advance_till([Start, BatchCreated]); + assert!(index_scheduler.is_task_processing().unwrap()); +} + +/// We send a lot of tasks but notify the tasks scheduler only once as +/// we send them very fast, we must make sure that they are all processed. +#[test] +fn process_tasks_inserted_without_new_signal() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("cattos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + index_scheduler + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_second_task"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_third_task"); +} + +#[test] +fn process_tasks_without_autobatching() { + let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_fourth_task"); + + 
handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fourth"); +} + +#[test] +fn task_deletion_undeleteable() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); + file0.persist().unwrap(); + file1.persist().unwrap(); + + let to_enqueue = [ + index_creation_task("catto", "mouse"), + replace_document_import_task("catto", None, 0, documents_count0), + replace_document_import_task("doggo", Some("bone"), 1, documents_count1), + ]; + + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + + // here we have registered all the tasks, but the index scheduler + // has not progressed at all + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); + + index_scheduler + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1]), + }, + None, + false, + ) + .unwrap(); + // again, no progress made at all, but one more task is registered + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_enqueued"); + + // now we create the first batch + handle.advance_till([Start, BatchCreated]); + + // the task deletion should now be "processing" + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processing"); + + handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); + // after the task deletion is processed, no task should actually have been deleted, + // because the tasks with ids 0 and 1 were still "enqueued", and thus undeleteable + // the "task deletion" task should be marked as "succeeded" and, in its details, the + // number of deleted tasks should be 0 + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_done"); +} + +#[test] +fn task_deletion_deleteable() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); + file0.persist().unwrap(); + file1.persist().unwrap(); + + let to_enqueue = [ + replace_document_import_task("catto", None, 0, documents_count0), + replace_document_import_task("doggo", Some("bone"), 1, documents_count1), + ]; + + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); + + handle.advance_one_successful_batch(); + // first addition of documents should be successful + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_processed"); + + // Now we delete the first task + index_scheduler + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + false, + ) + .unwrap(); + 
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_task_deletion"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processed"); +} + +#[test] +fn task_deletion_delete_same_task_twice() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); + file0.persist().unwrap(); + file1.persist().unwrap(); + + let to_enqueue = [ + replace_document_import_task("catto", None, 0, documents_count0), + replace_document_import_task("doggo", Some("bone"), 1, documents_count1), + ]; + + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); + + handle.advance_one_successful_batch(); + // first addition of documents should be successful + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_processed"); + + // Now we delete the first task multiple times in a row + for _ in 0..2 { + index_scheduler + .register( + KindWithContent::TaskDeletion { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + handle.advance_one_successful_batch(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_deletion_processed"); +} + +#[test] +fn document_addition_and_index_deletion() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = r#" + { + "id": 1, + "doggo": "bob" + }"#; + + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + index_scheduler + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_third_task"); + + handle.advance_one_successful_batch(); // The index creation. + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "before_index_creation"); + handle.advance_one_successful_batch(); // // after the execution of the two tasks in a single batch. 
+ snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded"); +} + +#[test] +fn do_not_batch_task_of_different_indexes() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + let index_names = ["doggos", "cattos", "girafos"]; + + for name in index_names { + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: name.to_string(), primary_key: None }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + for name in index_names { + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: name.to_string() }, None, false) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + for _ in 0..(index_names.len() * 2) { + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); +} + +#[test] +fn swap_indexes() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let to_enqueue = [ + index_creation_task("a", "id"), + index_creation_task("b", "id"), + index_creation_task("c", "id"), + index_creation_task("d", "id"), + ]; + + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_a"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_b"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_c"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "create_d"); + + index_scheduler + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "d".to_owned()) }, + ], + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_registered"); + index_scheduler + .register( + KindWithContent::IndexSwap { + swaps: vec![IndexSwap { indexes: ("a".to_owned(), "c".to_owned()) }], + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "two_swaps_registered"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_processed"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_swap_processed"); + + index_scheduler.register(KindWithContent::IndexSwap { swaps: vec![] }, None, false).unwrap(); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_empty_swap_processed"); +} + +#[test] +fn swap_indexes_errors() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let to_enqueue = [ + index_creation_task("a", "id"), + index_creation_task("b", "id"), + index_creation_task("c", "id"), + index_creation_task("d", "id"), + ]; + + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + handle.advance_n_successful_batches(4); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_the_index_creation"); + + let first_snap = snapshot_index_scheduler(&index_scheduler); + snapshot!(first_snap, 
name: "initial_tasks_processed"); + + let err = index_scheduler + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("b".to_owned(), "a".to_owned()) }, + ], + }, + None, + false, + ) + .unwrap_err(); + snapshot!(format!("{err}"), @"Indexes must be declared only once during a swap. `a`, `b` were specified several times."); + + let second_snap = snapshot_index_scheduler(&index_scheduler); + assert_eq!(first_snap, second_snap); + + // Index `e` does not exist, but we don't check its existence yet + index_scheduler + .register( + KindWithContent::IndexSwap { + swaps: vec![ + IndexSwap { indexes: ("a".to_owned(), "b".to_owned()) }, + IndexSwap { indexes: ("c".to_owned(), "e".to_owned()) }, + IndexSwap { indexes: ("d".to_owned(), "f".to_owned()) }, + ], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_failed_batch(); + // Now the first swap should have an error message saying `e` and `f` do not exist + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_swap_failed"); +} + +#[test] +fn document_addition_and_index_deletion_on_unexisting_index() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = r#" + { + "id": 1, + "doggo": "bob" + }"#; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler + .register(KindWithContent::IndexDeletion { index_uid: S("doggos") }, None, false) + .unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler)); + + handle.advance_n_successful_batches(1); + + snapshot!(snapshot_index_scheduler(&index_scheduler)); +} + +#[test] +fn cancel_enqueued_task() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + file0.persist().unwrap(); + + let to_enqueue = [ + replace_document_import_task("catto", None, 0, documents_count0), + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + ]; + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_tasks_enqueued"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); +} + +#[test] +fn cancel_succeeded_task() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + file0.persist().unwrap(); + + let _ = index_scheduler + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processed"); + + index_scheduler + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: 
RoaringBitmap::from_iter([0]), + }, + None, + false, + ) + .unwrap(); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); +} + +#[test] +fn cancel_processing_task() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + file0.persist().unwrap(); + + let _ = index_scheduler + .register(replace_document_import_task("catto", None, 0, documents_count0), None, false) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + handle.advance_till([Start, BatchCreated, InsideProcessBatch]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "initial_task_processing"); + + index_scheduler + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }, + None, + false, + ) + .unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_task_registered"); + // Now we check that we can reach the AbortedIndexation error handling + handle.advance_till([AbortedIndexation]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "aborted_indexation"); + + // handle.advance_till([Start, BatchCreated, BeforeProcessing, AfterProcessing]); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); +} + +#[test] +fn cancel_mix_of_tasks() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let (file0, documents_count0) = sample_documents(&index_scheduler, 0, 0); + file0.persist().unwrap(); + let (file1, documents_count1) = sample_documents(&index_scheduler, 1, 1); + file1.persist().unwrap(); + let (file2, documents_count2) = sample_documents(&index_scheduler, 2, 2); + file2.persist().unwrap(); + + let to_enqueue = [ + replace_document_import_task("catto", None, 0, documents_count0), + replace_document_import_task("beavero", None, 1, documents_count1), + replace_document_import_task("wolfo", None, 2, documents_count2), + ]; + for task in to_enqueue { + let _ = index_scheduler.register(task, None, false).unwrap(); + index_scheduler.assert_internally_consistent(); + } + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_processed"); + + handle.advance_till([Start, BatchCreated, InsideProcessBatch]); + index_scheduler + .register( + KindWithContent::TaskCancelation { + query: "test_query".to_owned(), + tasks: RoaringBitmap::from_iter([0, 1, 2]), + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processing_second_task_cancel_enqueued"); + + handle.advance_till([AbortedIndexation]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "aborted_indexation"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); +} + +#[test] +fn test_settings_update() { + use meilisearch_types::settings::{Settings, Unchecked}; + use milli::update::Setting; + + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let mut new_settings: Box<Settings<Unchecked>> = Box::default(); + let mut embedders = BTreeMap::default(); + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), + api_key: Setting::Set(S("My super secret")), + url:
Setting::Set(S("http://localhost:7777")), + dimensions: Setting::Set(4), + request: Setting::Set(serde_json::json!("{{text}}")), + response: Setting::Set(serde_json::json!("{{embedding}}")), + ..Default::default() + }; + embedders + .insert(S("default"), SettingEmbeddingSettings { inner: Setting::Set(embedding_settings) }); + new_settings.embedders = Setting::Set(embedders); + + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings, + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.queue.tasks.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.queue.tasks.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + // has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let configs = index.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); +} + +#[test] +fn simple_new() { + crate::IndexScheduler::test(true, vec![]); +} + +#[test] +fn basic_get_stats() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("doggo", "sheep"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + let kind = index_creation_task("whalo", "fish"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + + snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r#" + { + "indexes": { + "catto": 1, + "doggo": 1, + "whalo": 1 + }, + "statuses": { + "canceled": 0, + "enqueued": 3, + "failed": 0, + "processing": 0, + "succeeded": 0 + }, + "types": { + "documentAdditionOrUpdate": 0, + "documentDeletion": 0, + "documentEdition": 0, + "dumpCreation": 0, + "indexCreation": 3, + "indexDeletion": 0, + "indexSwap": 0, + "indexUpdate": 0, + "settingsUpdate": 0, + "snapshotCreation": 0, + "taskCancelation": 0, + "taskDeletion": 0, + "upgradeDatabase": 0 + } + } + "#); + + handle.advance_till([Start, BatchCreated]); + snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r#" + { + "indexes": { + "catto": 1, + "doggo": 1, + "whalo": 1 + }, + "statuses": { + "canceled": 0, + "enqueued": 2, + "failed": 0, + "processing": 1, + "succeeded": 0 + }, + "types": { + "documentAdditionOrUpdate": 0, + "documentDeletion": 0, + "documentEdition": 0, + "dumpCreation": 0, + "indexCreation": 3, + "indexDeletion": 0, + "indexSwap": 0, + "indexUpdate": 0, + "settingsUpdate": 0, + "snapshotCreation": 0, + "taskCancelation": 0, + "taskDeletion": 0, + 
"upgradeDatabase": 0 + } + } + "#); + + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r#" + { + "indexes": { + "catto": 1, + "doggo": 1, + "whalo": 1 + }, + "statuses": { + "canceled": 0, + "enqueued": 1, + "failed": 0, + "processing": 1, + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 0, + "documentDeletion": 0, + "documentEdition": 0, + "dumpCreation": 0, + "indexCreation": 3, + "indexDeletion": 0, + "indexSwap": 0, + "indexUpdate": 0, + "settingsUpdate": 0, + "snapshotCreation": 0, + "taskCancelation": 0, + "taskDeletion": 0, + "upgradeDatabase": 0 + } + } + "#); + + // now we make one more batch, the started_at field of the new tasks will be past `second_start_time` + handle.advance_till([ + InsideProcessBatch, + InsideProcessBatch, + ProcessBatchSucceeded, + AfterProcessing, + Start, + BatchCreated, + ]); + snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r#" + { + "indexes": { + "catto": 1, + "doggo": 1, + "whalo": 1 + }, + "statuses": { + "canceled": 0, + "enqueued": 0, + "failed": 0, + "processing": 1, + "succeeded": 2 + }, + "types": { + "documentAdditionOrUpdate": 0, + "documentDeletion": 0, + "documentEdition": 0, + "dumpCreation": 0, + "indexCreation": 3, + "indexDeletion": 0, + "indexSwap": 0, + "indexUpdate": 0, + "settingsUpdate": 0, + "snapshotCreation": 0, + "taskCancelation": 0, + "taskDeletion": 0, + "upgradeDatabase": 0 + } + } + "#); +} + +#[test] +fn cancel_processing_dump() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None }; + let dump_cancellation = KindWithContent::TaskCancelation { + query: "cancel dump".to_owned(), + tasks: RoaringBitmap::from_iter([0]), + }; + let _ = index_scheduler.register(dump_creation, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register"); + handle.advance_till([Start, BatchCreated, InsideProcessBatch]); + + let _ = index_scheduler.register(dump_cancellation, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered"); + + snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed"); +} + +#[test] +fn create_and_list_index() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let index_creation = + KindWithContent::IndexCreation { index_uid: S("kefir"), primary_key: None }; + let _ = index_scheduler.register(index_creation, None, false).unwrap(); + handle.advance_till([Start, BatchCreated, InsideProcessBatch]); + // The index creation has not been started, the index should not exists + + let err = index_scheduler.index("kefir").map(|_| ()).unwrap_err(); + snapshot!(err, @"Index `kefir` not found."); + let empty = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap(); + snapshot!(format!("{empty:?}"), @"(0, [])"); + + // After advancing just once the index should've been created, the wtxn has been released and commited + // but the indexUpdate task has not been processed yet + handle.advance_till([InsideProcessBatch]); + + index_scheduler.index("kefir").unwrap(); + let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 
20).unwrap(); + snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###" + [ + 1, + [ + [ + "kefir", + { + "documents_database_stats": { + "numberOfEntries": 0, + "totalKeySize": 0, + "totalValueSize": 0 + }, + "database_size": "[bytes]", + "number_of_embeddings": 0, + "number_of_embedded_documents": 0, + "used_database_size": "[bytes]", + "primary_key": null, + "field_distribution": {}, + "created_at": "[date]", + "updated_at": "[date]" + } + ] + ] + ] + "###); +} diff --git a/crates/index-scheduler/src/scheduler/test_document_addition.rs b/crates/index-scheduler/src/scheduler/test_document_addition.rs new file mode 100644 index 000000000..3c0d89d54 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/test_document_addition.rs @@ -0,0 +1,1166 @@ +use big_s::S; +use meili_snap::snapshot; +use meilisearch_types::milli::obkv_to_json; +use meilisearch_types::milli::update::IndexDocumentsMethod::*; +use meilisearch_types::tasks::KindWithContent; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::test_utils::read_json; +use crate::test_utils::Breakpoint::*; +use crate::IndexScheduler; + +#[test] +fn document_addition() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = r#" + { + "id": 1, + "doggo": "bob" + }"#; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); + + handle.advance_till([Start, BatchCreated]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_the_batch_creation"); + + handle.advance_till([InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "once_everything_is_processed"); +} + +#[test] +fn document_addition_and_document_deletion() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = r#"[ + { "id": 1, "doggo": "jean bob" }, + { "id": 2, "catto": "jorts" }, + { "id": 3, "doggo": "bork" } + ]"#; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together + snapshot!(snapshot_index_scheduler(&index_scheduler), name: 
"after_processing_the_batch"); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn document_deletion_and_document_addition() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1"), S("2")], + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + let content = r#"[ + { "id": 1, "doggo": "jean bob" }, + { "id": 2, "catto": "jorts" }, + { "id": 3, "doggo": "bork" } + ]"#; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task"); + + // The deletion should have failed because it can't create an index + handle.advance_one_failed_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion"); + + // The addition should works + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition"); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_replace() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler)); + + // everything should be batched together. + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler)); + + // has everything being pushed successfully in milli? 
+ let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_update() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler)); + + // everything should be batched together. + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler)); + + // has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_mixed_document_addition() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for i in 0..10 { + let method = if i % 2 == 0 { UpdateDocuments } else { ReplaceDocuments }; + + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // All tasks should've been batched and processed together since any indexing task (updates with replacements) can be batched together + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // has everything being pushed successfully in milli? 
+ let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_replace_without_autobatching() { + let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Nothing should be batched thus half of the tasks are processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); + + // Everything is processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_update_without_autobatching() { + let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Nothing should be batched thus half of the tasks are processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); + + // Everything is processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // has everything being pushed successfully in milli? 
+ let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_cant_create_index_without_index() { + // We're going to autobatch multiple document additions that don't have + // the right to create an index while there is no index currently. + // Thus, everything should be batched together and an IndexDoesNotExists + // error should be thrown. + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Everything should be batched together. + handle.advance_till([ + Start, + BatchCreated, + InsideProcessBatch, + ProcessBatchFailed, + AfterProcessing, + ]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_10_tasks"); + + // The index should not exist. + snapshot!(matches!(index_scheduler.index_exists("doggos"), Ok(true)), @"false"); +} + +#[test] +fn test_document_addition_cant_create_index_without_index_without_autobatching() { + // We're going to execute multiple document additions that don't have + // the right to create an index while there is no index currently. + // Since the auto-batching is disabled, every task should be processed + // sequentially and fail with an IndexDoesNotExists error. + let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Nothing should be batched, thus half of the tasks are processed. + handle.advance_n_failed_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); + + // Everything is processed. + handle.advance_n_failed_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // The index should not exist.
+ snapshot!(matches!(index_scheduler.index_exists("doggos"), Ok(true)), @"false"); +} + +#[test] +fn test_document_addition_cant_create_index_with_index() { + // We're going to autobatch multiple document addition that don't have + // the right to create an index while there is already an index. + // Thus, everything should be batched together and no error should be + // throwed. + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + // Create the index. + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Everything should be batched together. + handle.advance_n_successful_batches(1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_10_tasks"); + + // Has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_cant_create_index_with_index_without_autobatching() { + // We're going to execute multiple document addition that don't have + // the right to create an index while there is no index currently. + // Since the autobatching is disabled, every tasks should be processed + // sequentially and throw an IndexDoesNotExists. + let (index_scheduler, mut handle) = IndexScheduler::test(false, vec![]); + + // Create the index. 
+ index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Nothing should be batched thus half of the tasks are processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "five_tasks_processed"); + + // Everything is processed. + handle.advance_n_successful_batches(5); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // Has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_mixed_rights_with_index() { + // We're going to autobatch multiple document addition. + // - The index already exists + // - The first document addition don't have the right to create an index + // can it batch with the other one? + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + // Create the index. + index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggos"), primary_key: None }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "processed_the_first_task"); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + let allow_index_creation = i % 2 != 0; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // Everything should be batched together. 
+ handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // Has everything being pushed successfully in milli? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_mixed_right_without_index_starts_with_cant_create() { + // We're going to autobatch multiple document addition. + // - The index does not exists + // - The first document addition don't have the right to create an index + // - The second do. They should not batch together. + // - The second should batch with everything else as it's going to create an index. + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for i in 0..10 { + let content = format!( + r#"{{ + "id": {}, + "doggo": "bob {}" + }}"#, + i, i + ); + let allow_index_creation = i % 2 != 0; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(i).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_10_tasks"); + + // A first batch should be processed with only the first documentAddition that's going to fail. + handle.advance_one_failed_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_failed"); + + // Everything else should be batched together. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_tasks_processed"); + + // Has everything being pushed successfully in milli? 
+ let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_with_multiple_primary_key() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for (id, primary_key) in ["id", "bork", "bloup"].iter().enumerate() { + let content = format!( + r#"{{ + "id": {id}, + "doggo": "jean bob" + }}"#, + ); + let (uuid, mut file) = + index_scheduler.queue.create_update_file_with_uuid(id as u128).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_3_tasks"); + + // A first batch should be processed with only the first documentAddition. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_fails"); + + // Is the primary key still what we expect? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"id"); + + // Is the document still the one we expect?. 
+ let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_with_multiple_primary_key_batch_wrong_key() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for (id, primary_key) in ["id", "bork", "bork"].iter().enumerate() { + let content = format!( + r#"{{ + "id": {id}, + "doggo": "jean bob" + }}"#, + ); + let (uuid, mut file) = + index_scheduler.queue.create_update_file_with_uuid(id as u128).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_3_tasks"); + + // A first batch should be processed with only the first documentAddition. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_and_third_tasks_fails"); + + // Is the primary key still what we expect? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"id"); + + // Is the document still the one we expect?. + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_with_bad_primary_key() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for (id, primary_key) in ["bork", "bork", "id", "bork", "id"].iter().enumerate() { + let content = format!( + r#"{{ + "id": {id}, + "doggo": "jean bob" + }}"#, + ); + let (uuid, mut file) = + index_scheduler.queue.create_update_file_with_uuid(id as u128).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S(primary_key)), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_5_tasks"); + + // A first batch should be processed with only the first two documentAddition. + // it should fails because the documents don't contains any `bork` field. 
+ // NOTE: it's marked as successful because the batch didn't fail; it's the individual tasks that failed. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_and_second_task_fails"); + + // The primary key should be set to none since we failed the batch. + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap(); + snapshot!(primary_key.is_none(), @"true"); + + // The second batch should succeed and only contain one task. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); + + // The primary key should be set to `id` since this batch succeeded. + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"id"); + + // We're trying `bork` again, but now there is already a primary key set for this index. + // NOTE: it's marked as successful because the batch didn't fail; it's the individual tasks that failed. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fourth_task_fails"); + + // Finally, the last task should succeed since its primary key is the same as the valid one. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "fifth_task_succeeds"); + + // Is the primary key still what we expect? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"id"); + + // Is the document still the one we expect? + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_with_set_and_null_primary_key() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for (id, primary_key) in + [None, Some("bork"), Some("paw"), None, None, Some("paw")].into_iter().enumerate() + { + let content = format!( + r#"{{ + "paw": {id}, + "doggo": "jean bob" + }}"#, + ); + let (uuid, mut file) = + index_scheduler.queue.create_update_file_with_uuid(id as u128).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_6_tasks"); + + // A first batch should contain only one task that fails because we can't infer the primary key. + // NOTE: it's marked as successful because the batch didn't fail; it's the individual tasks that failed.
+ handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_fails"); + + // The second batch should contains only one task that fails because we bork is not a valid primary key. + // NOTE: it's marked as successful because the batch didn't fails, it's the individual tasks that failed. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); + + // No primary key should be set at this point. + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap(); + snapshot!(primary_key.is_none(), @"true"); + + // The third batch should succeed and only contains one task. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); + + // The primary key should be set to `id` since this batch succeeded. + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"paw"); + + // We should be able to batch together the next two tasks that don't specify any primary key + // + the last task that matches the current primary-key. Everything should succeed. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_other_tasks_succeeds"); + + // Is the primary key still what we expect? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"paw"); + + // Is the document still the one we expect?. + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} + +#[test] +fn test_document_addition_with_set_and_null_primary_key_inference_works() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + for (id, primary_key) in + [None, Some("bork"), Some("doggoid"), None, None, Some("doggoid")].into_iter().enumerate() + { + let content = format!( + r#"{{ + "doggoid": {id}, + "doggo": "jean bob" + }}"#, + ); + let (uuid, mut file) = + index_scheduler.queue.create_update_file_with_uuid(id as u128).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: primary_key.map(|pk| pk.to_string()), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + } + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_the_6_tasks"); + + // A first batch should contains only one task that succeed and sets the primary key to `doggoid`. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "first_task_succeed"); + + // Checking the primary key. 
+ let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap(); + snapshot!(primary_key.is_none(), @"false"); + + // The second batch should contains only one task that fails because it tries to update the primary key to `bork`. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails"); + + // The third batch should succeed and only contains one task. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_succeeds"); + + // We should be able to batch together the next two tasks that don't specify any primary key + // + the last task that matches the current primary-key. Everything should succeed. + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "all_other_tasks_succeeds"); + + // Is the primary key still what we expect? + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let primary_key = index.primary_key(&rtxn).unwrap().unwrap(); + snapshot!(primary_key, @"doggoid"); + + // Is the document still the one we expect?. + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents"); +} diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs new file mode 100644 index 000000000..772aa1520 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -0,0 +1,840 @@ +use std::collections::BTreeMap; + +use big_s::S; +use insta::assert_json_snapshot; +use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::index::IndexEmbeddingConfig; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::milli::{self, obkv_to_json}; +use meilisearch_types::settings::{SettingEmbeddingSettings, Settings, Unchecked}; +use meilisearch_types::tasks::KindWithContent; +use milli::update::IndexDocumentsMethod::*; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::test_utils::read_json; +use crate::IndexScheduler; + +#[test] +fn import_vectors() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let mut new_settings: Box> = Box::default(); + let mut embedders = BTreeMap::default(); + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), + api_key: Setting::Set(S("My super secret")), + url: Setting::Set(S("http://localhost:7777")), + dimensions: Setting::Set(384), + request: Setting::Set(serde_json::json!("{{text}}")), + response: Setting::Set(serde_json::json!("{{embedding}}")), + ..Default::default() + }; + embedders.insert( + S("A_fakerest"), + SettingEmbeddingSettings { inner: Setting::Set(embedding_settings) }, + ); + + let embedding_settings = milli::vector::settings::EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + 
document_template: Setting::Set(S("{{doc.doggo}} the {{doc.breed}} best doggo")), + ..Default::default() + }; + embedders.insert( + S("B_small_hf"), + SettingEmbeddingSettings { inner: Setting::Set(embedding_settings) }, + ); + + new_settings.embedders = Setting::Set(embedders); + + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings, + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.queue.tasks.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); + + { + let rtxn = index_scheduler.read_txn().unwrap(); + let task = index_scheduler.queue.tasks.get_task(&rtxn, 0).unwrap().unwrap(); + let task = meilisearch_types::task_view::TaskView::from_task(&task); + insta::assert_json_snapshot!(task.details); + } + + let (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) = { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(fakerest_config.embedder_options); + let fakerest_name = name.clone(); + + let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(simple_hf_config.embedder_options); + let simple_hf_name = name.clone(); + + let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); + let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); + let beagle_embed = hf_embedder.embed_search("Intel the beagle best doggo", None).unwrap(); + let lab_embed = hf_embedder.embed_search("Max the lab best doggo", None).unwrap(); + let patou_embed = hf_embedder.embed_search("kefir the patou best doggo", None).unwrap(); + (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) + }; + + // add one doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "regenerate": false, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "regenerate": true, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: 
Some(S("id")), + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); + + handle.advance_one_successful_batch(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "adding Intel succeeds"); + + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + + // update the doc, specifying vectors + + let doc = serde_json::json!( + { + "id": 0, + "doggo": "kefir", + "breed": "patou", + } + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(1u128).unwrap(); + let documents_count = read_json(doc.to_string().as_bytes(), &mut file).unwrap(); + assert_eq!(documents_count, 1); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); + + { + // check embeddings + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + + let embeddings = index.embeddings(&rtxn, 0).unwrap(); + + // automatically changed to patou because set to regenerate + assert_json_snapshot!(embeddings[&simple_hf_name][0] == 
patou_embed, @"true"); + // remained beagle + assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + + let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let doc = obkv_to_json( + &[ + fields_ids_map.id("doggo").unwrap(), + fields_ids_map.id("breed").unwrap(), + fields_ids_map.id("_vectors").unwrap(), + ], + &fields_ids_map, + doc, + ) + .unwrap(); + assert_json_snapshot!(doc, {"._vectors.A_fakerest.embeddings" => "[vector]"}); + } + } +} + +#[test] +fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "regenerate": false, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file).unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("my_doggo_embedder") => SettingEmbeddingSettings { inner: Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) } + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + // the all the vectors linked to the new specified embedder have been removed + // Only the unknown embedders stays in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided since we explicitely marked it as NOT user provided. 
+ snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + pooling: UseModel, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + max_bytes: Some( + 400, + ), + }, + quantized: None, + }, + user_provided: RoaringBitmap<[1, 2]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embeddings = &embeddings["my_doggo_embedder"]; + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(1_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file).unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // the document with the id 3 should have its original embedding updated + let rtxn = index.read_txn().unwrap(); + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; + let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); + snapshot!(json_string!(doc), @r###" + { + "id": 3, + "doggo": "marvel" + } + "###); + + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); +} + +#[test] +fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. Push two documents containing a simple vector + // 3. Delete the first document + // 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore + // 5. Clear the index + // 6. The user defined roaring bitmap shouldn't contains the id of the second document + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => SettingEmbeddingSettings { inner: Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) } + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file).unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), + }, + quantized: None, + }, + user_provided: RoaringBitmap<[0]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + 
embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), + }, + quantized: None, + }, + user_provided: RoaringBitmap<[]>, + }, + ] + "###); +} + +#[test] +fn delete_embedder_with_user_provided_vectors() { + // 1. Add two embedders + // 2. Push two documents containing a simple vector + // 3. The documents must not contain the vectors after the update as they are in the vectors db + // 3. Delete the embedders + // 4. The documents contain the vectors again + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => SettingEmbeddingSettings { inner: Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) }, + S("my_doggo_embedder") => SettingEmbeddingSettings { inner: Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) }, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + "my_doggo_embedder": vec![1; 384], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file).unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => SettingEmbeddingSettings { inner: Setting::Reset }, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Reset, + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + + // FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); + } +} diff --git a/crates/index-scheduler/src/scheduler/test_failure.rs b/crates/index-scheduler/src/scheduler/test_failure.rs new file mode 100644 index 000000000..191910d38 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/test_failure.rs @@ -0,0 +1,327 @@ +use std::time::Instant; + +use big_s::S; +use meili_snap::snapshot; +use meilisearch_types::milli::obkv_to_json; +use meilisearch_types::milli::update::IndexDocumentsMethod::*; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::FilterableAttributesRule; +use meilisearch_types::tasks::{Kind, KindWithContent}; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::test_utils::Breakpoint::*; +use crate::test_utils::{index_creation_task, read_json, FailureLocation}; +use crate::IndexScheduler; + +#[test] +fn fail_in_process_batch_for_index_creation() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(1, FailureLocation::InsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_register"); + + handle.advance_one_failed_batch(); + + // Still in the first iteration + assert_eq!(*index_scheduler.run_loop_iteration.read().unwrap(), 1); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "index_creation_failed"); +} + +#[test] +fn fail_in_process_batch_for_document_addition() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(1, FailureLocation::InsideProcessBatch)]); + + let content = r#" + { + "id": 1, + "doggo": "bob" + }"#; + + let (uuid, mut file) = 
index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + handle.advance_till([Start, BatchCreated]); + + snapshot!( + snapshot_index_scheduler(&index_scheduler), + name: "document_addition_batch_created" + ); + + handle.advance_till([ProcessBatchFailed, AfterProcessing]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "document_addition_failed"); +} + +#[test] +fn fail_in_update_task_after_process_batch_success_for_document_addition() { + let (index_scheduler, mut handle) = IndexScheduler::test( + true, + vec![(1, FailureLocation::UpdatingTaskAfterProcessBatchSuccess { task_uid: 0 })], + ); + + let content = r#" + { + "id": 1, + "doggo": "bob" + }"#; + + let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap(); + let documents_count = read_json(content.as_bytes(), &mut file).unwrap(); + file.persist().unwrap(); + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: Some(S("id")), + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + handle.advance_till([Start]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "document_addition_succeeded_but_index_scheduler_not_updated"); + + handle.advance_till([BatchCreated, InsideProcessBatch, ProcessBatchSucceeded]); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_batch_succeeded"); + + // At this point the next time the scheduler will try to progress it should encounter + // a critical failure and have to wait for 1s before retrying anything. 
+
+    let before_failure = Instant::now();
+    handle.advance_till([Start]);
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_to_commit");
+    let failure_duration = before_failure.elapsed();
+    assert!(failure_duration.as_millis() >= 1000);
+
+    handle.advance_till([BatchCreated, InsideProcessBatch, ProcessBatchSucceeded, AfterProcessing]);
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "task_successfully_processed");
+}
+
+#[test]
+fn fail_in_process_batch_for_document_deletion() {
+    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+
+    use meilisearch_types::settings::{Settings, Unchecked};
+    let mut new_settings: Box<Settings<Unchecked>> = Box::default();
+    new_settings.filterable_attributes =
+        Setting::Set(vec![FilterableAttributesRule::Field(S("catto"))]);
+
+    index_scheduler
+        .register(
+            KindWithContent::SettingsUpdate {
+                index_uid: S("doggos"),
+                new_settings,
+                is_deletion: false,
+                allow_index_creation: true,
+            },
+            None,
+            false,
+        )
+        .unwrap();
+
+    let content = r#"[
+        { "id": 1, "doggo": "jean bob" },
+        { "id": 2, "catto": "jorts" },
+        { "id": 3, "doggo": "bork" }
+    ]"#;
+
+    let (uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(0).unwrap();
+    let documents_count = read_json(content.as_bytes(), &mut file).unwrap();
+    file.persist().unwrap();
+    index_scheduler
+        .register(
+            KindWithContent::DocumentAdditionOrUpdate {
+                index_uid: S("doggos"),
+                primary_key: Some(S("id")),
+                method: ReplaceDocuments,
+                content_file: uuid,
+                documents_count,
+                allow_index_creation: true,
+            },
+            None,
+            false,
+        )
+        .unwrap();
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_setting_and_document_addition");
+
+    handle.advance_one_successful_batch();
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_adding_the_settings");
+    handle.advance_one_successful_batch();
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_adding_the_documents");
+
+    index_scheduler
+        .register(
+            KindWithContent::DocumentDeletion {
+                index_uid: S("doggos"),
+                documents_ids: vec![S("1")],
+            },
+            None,
+            false,
+        )
+        .unwrap();
+    // This one should not be caught by Meilisearch, but it's still nice to handle it because if one day we break the filters it could happen.
+    index_scheduler
+        .register(
+            KindWithContent::DocumentDeletionByFilter {
+                index_uid: S("doggos"),
+                filter_expr: serde_json::json!(true),
+            },
+            None,
+            false,
+        )
+        .unwrap();
+    // Should fail because the ids are not filterable
+    index_scheduler
+        .register(
+            KindWithContent::DocumentDeletionByFilter {
+                index_uid: S("doggos"),
+                filter_expr: serde_json::json!("id = 2"),
+            },
+            None,
+            false,
+        )
+        .unwrap();
+    index_scheduler
+        .register(
+            KindWithContent::DocumentDeletionByFilter {
+                index_uid: S("doggos"),
+                filter_expr: serde_json::json!("catto EXISTS"),
+            },
+            None,
+            false,
+        )
+        .unwrap();
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_document_deletions");
+
+    // Everything should be batched together
+    handle.advance_one_successful_batch();
+    snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_removing_the_documents");
+
+    let index = index_scheduler.index("doggos").unwrap();
+    let rtxn = index.read_txn().unwrap();
+    let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let field_ids = field_ids_map.ids().collect::<Vec<_>>();
+    let documents = index
+        .all_documents(&rtxn)
+        .unwrap()
+        .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
+        .collect::<Vec<_>>();
+ snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork"); +} + +#[test] +fn panic_in_process_batch_for_index_creation() { + let (index_scheduler, mut handle) = + IndexScheduler::test(true, vec![(1, FailureLocation::PanicInsideProcessBatch)]); + + let kind = index_creation_task("catto", "mouse"); + + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task"); + + handle.advance_till([Start, BatchCreated, ProcessBatchFailed, AfterProcessing]); + + // Still in the first iteration + assert_eq!(*index_scheduler.run_loop_iteration.read().unwrap(), 1); + // No matter what happens in process_batch, the index_scheduler should be internally consistent + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "index_creation_failed"); +} + +#[test] +fn upgrade_failure() { + // By starting the index-scheduler at the v1.12.0 an upgrade task should be automatically enqueued + let (index_scheduler, mut handle) = + IndexScheduler::test_with_custom_config(vec![(1, FailureLocation::ProcessUpgrade)], |_| { + Some((1, 12, 0)) + }); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "register_automatic_upgrade_task"); + + let kind = index_creation_task("catto", "mouse"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_a_task_while_the_upgrade_task_is_enqueued"); + + handle.advance_one_failed_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "upgrade_task_failed"); + + // We can still register tasks + let kind = index_creation_task("doggo", "bone"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + + // But the scheduler is down and won't process anything ever again + handle.scheduler_is_down(); + + // =====> After a restart is it still working as expected? 
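+    // For reference, the auto-enqueue behaviour exercised here boils down to a version check
+    // at startup, which also runs again on every restart below (hypothetical sketch, the real
+    // logic lives in the scheduler's startup path):
+    //
+    //     if db_version < binary_version {
+    //         enqueue(KindWithContent::UpgradeDatabase { from: db_version });
+    //     }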
+ let (index_scheduler, mut handle) = + handle.restart(index_scheduler, true, vec![(1, FailureLocation::ProcessUpgrade)], |_| { + Some((1, 12, 0)) // the upgrade task should be rerun automatically and nothing else should be enqueued + }); + + handle.advance_one_failed_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "upgrade_task_failed_again"); + // We can still register tasks + let kind = index_creation_task("doggo", "bone"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + // And the scheduler is still down and won't process anything ever again + handle.scheduler_is_down(); + + // =====> After a rerestart and without failure can we upgrade the indexes and process the tasks + let (index_scheduler, mut handle) = + handle.restart(index_scheduler, true, vec![], |_| Some((1, 12, 0))); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "upgrade_task_succeeded"); + // We can still register tasks + let kind = index_creation_task("girafo", "leaves"); + let _task = index_scheduler.register(kind, None, false).unwrap(); + // The scheduler is up and running + handle.advance_one_successful_batch(); + handle.advance_one_successful_batch(); + handle.advance_one_failed_batch(); // doggo already exists + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_everything"); + + let (upgrade_tasks_ids, _) = index_scheduler + .get_task_ids_from_authorized_indexes( + &crate::Query { types: Some(vec![Kind::UpgradeDatabase]), ..Default::default() }, + &Default::default(), + ) + .unwrap(); + // When deleting the single upgrade task it should remove the associated batch + let _task = index_scheduler + .register( + KindWithContent::TaskDeletion { + query: String::from("types=upgradeDatabase"), + tasks: upgrade_tasks_ids, + }, + None, + false, + ) + .unwrap(); + + handle.advance_one_successful_batch(); + snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_removing_the_upgrade_tasks"); +} diff --git a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap deleted file mode 100644 index 2b56b71d1..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 3, - "doggo": "bork" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings_and_documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings_and_documents.snap deleted file mode 100644 index 45065d8b1..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings_and_documents.snap +++ /dev/null @@ -1,43 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -### Autobatching Enabled = true -### Processing Tasks: -[] ----------------------------------------------------------------------- -### All Tasks: -0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: 
NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} -1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ----------------------------------------------------------------------- -### Status: -enqueued [1,] -succeeded [0,] ----------------------------------------------------------------------- -### Kind: -"documentAdditionOrUpdate" [1,] -"settingsUpdate" [0,] ----------------------------------------------------------------------- -### Index Tasks: -doggos [0,1,] ----------------------------------------------------------------------- -### Index Mapper: -doggos: { number_of_documents: 0, field_distribution: {} } - ----------------------------------------------------------------------- -### Canceled By: - ----------------------------------------------------------------------- -### Enqueued At: -[timestamp] [0,] -[timestamp] [1,] ----------------------------------------------------------------------- -### Started At: -[timestamp] [0,] ----------------------------------------------------------------------- -### Finished At: -[timestamp] [0,] ----------------------------------------------------------------------- -### File Store: -00000000-0000-0000-0000-000000000000 - ----------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap deleted file mode 100644 index 2b56b71d1..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_deletion/documents_remaining_should_only_be_bork.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 3, - "doggo": "bork" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap deleted file mode 100644 index f9e6df03e..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/import_vectors/after_registering_settings_task_vectors.snap +++ /dev/null @@ -1,52 +0,0 @@ ---- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text ---- -### Autobatching Enabled = true -### Processing batch None: -[] 
----------------------------------------------------------------------- -### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ----------------------------------------------------------------------- -### Status: -enqueued [0,] ----------------------------------------------------------------------- -### Kind: -"settingsUpdate" [0,] ----------------------------------------------------------------------- -### Index Tasks: -doggos [0,] ----------------------------------------------------------------------- -### Index Mapper: - ----------------------------------------------------------------------- -### Canceled By: - ----------------------------------------------------------------------- -### Enqueued At: -[timestamp] [0,] 
----------------------------------------------------------------------- -### Started At: ----------------------------------------------------------------------- -### Finished At: ----------------------------------------------------------------------- -### All Batches: ----------------------------------------------------------------------- -### Batch to tasks mapping: ----------------------------------------------------------------------- -### Batches Status: ----------------------------------------------------------------------- -### Batches Kind: ----------------------------------------------------------------------- -### Batches Index Tasks: ----------------------------------------------------------------------- -### Batches Enqueued At: ----------------------------------------------------------------------- -### Batches Started At: ----------------------------------------------------------------------- -### Batches Finished At: ----------------------------------------------------------------------- -### File Store: - ----------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/documents.snap deleted file mode 100644 index 96f9d447f..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key/documents.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "jean bob" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap deleted file mode 100644 index 96f9d447f..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_addition_with_multiple_primary_key_batch_wrong_key/documents.snap +++ /dev/null @@ -1,9 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "jean bob" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/documents.snap deleted file mode 100644 index 5a839838d..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_replace_without_autobatching/documents.snap +++ /dev/null @@ -1,45 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "bob 0" - }, - { - "id": 1, - "doggo": "bob 1" - }, - { - "id": 2, - "doggo": "bob 2" - }, - { - "id": 3, - "doggo": "bob 3" - }, - { - "id": 4, - "doggo": "bob 4" - }, - { - "id": 5, - "doggo": "bob 5" - }, - { - "id": 6, - "doggo": "bob 6" - }, - { - "id": 7, - "doggo": "bob 7" - }, - { - "id": 8, - "doggo": "bob 8" - }, - { - "id": 9, - "doggo": "bob 9" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/documents.snap deleted file mode 100644 index 5a839838d..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update/documents.snap +++ /dev/null @@ -1,45 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "bob 0" - }, - { - "id": 1, - "doggo": "bob 1" - }, - { - "id": 2, - "doggo": "bob 2" - 
}, - { - "id": 3, - "doggo": "bob 3" - }, - { - "id": 4, - "doggo": "bob 4" - }, - { - "id": 5, - "doggo": "bob 5" - }, - { - "id": 6, - "doggo": "bob 6" - }, - { - "id": 7, - "doggo": "bob 7" - }, - { - "id": 8, - "doggo": "bob 8" - }, - { - "id": 9, - "doggo": "bob 9" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/documents.snap deleted file mode 100644 index 5a839838d..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_document_update_without_autobatching/documents.snap +++ /dev/null @@ -1,45 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "bob 0" - }, - { - "id": 1, - "doggo": "bob 1" - }, - { - "id": 2, - "doggo": "bob 2" - }, - { - "id": 3, - "doggo": "bob 3" - }, - { - "id": 4, - "doggo": "bob 4" - }, - { - "id": 5, - "doggo": "bob 5" - }, - { - "id": 6, - "doggo": "bob 6" - }, - { - "id": 7, - "doggo": "bob 7" - }, - { - "id": 8, - "doggo": "bob 8" - }, - { - "id": 9, - "doggo": "bob 9" - } -] diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/all_tasks_processed.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/all_tasks_processed.snap deleted file mode 100644 index a4649c1eb..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/all_tasks_processed.snap +++ /dev/null @@ -1,145 +0,0 @@ ---- -source: crates/index-scheduler/src/lib.rs -snapshot_kind: text ---- -### Autobatching Enabled = true -### Processing batch None: -[] ----------------------------------------------------------------------- -### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} -1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} -2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000002, documents_count: 1, allow_index_creation: true }} -3 {uid: 3, batch_uid: 3, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000003, documents_count: 1, allow_index_creation: true }} -4 {uid: 4, batch_uid: 4, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000004, documents_count: 1, allow_index_creation: true }} -5 {uid: 5, batch_uid: 5, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, 
content_file: 00000000-0000-0000-0000-000000000005, documents_count: 1, allow_index_creation: true }} -6 {uid: 6, batch_uid: 6, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000006, documents_count: 1, allow_index_creation: true }} -7 {uid: 7, batch_uid: 7, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000007, documents_count: 1, allow_index_creation: true }} -8 {uid: 8, batch_uid: 8, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000008, documents_count: 1, allow_index_creation: true }} -9 {uid: 9, batch_uid: 9, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000009, documents_count: 1, allow_index_creation: true }} ----------------------------------------------------------------------- -### Status: -enqueued [] -succeeded [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Kind: -"documentAdditionOrUpdate" [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Index Tasks: -doggos [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Index Mapper: -doggos: { number_of_documents: 10, field_distribution: {"doggo": 10, "id": 10} } - ----------------------------------------------------------------------- -### Canceled By: - ----------------------------------------------------------------------- -### Enqueued At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### Started At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### Finished At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### All Batches: -0 {uid: 0, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -1 {uid: 1, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -2 {uid: 2, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -3 {uid: 3, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: 
{"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -4 {uid: 4, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -5 {uid: 5, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -6 {uid: 6, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -7 {uid: 7, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -8 {uid: 8, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } -9 {uid: 9, details: {"receivedDocuments":1,"indexedDocuments":1}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, } ----------------------------------------------------------------------- -### Batch to tasks mapping: -0 [0,] -1 [1,] -2 [2,] -3 [3,] -4 [4,] -5 [5,] -6 [6,] -7 [7,] -8 [8,] -9 [9,] ----------------------------------------------------------------------- -### Batches Status: -succeeded [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Batches Kind: -"documentAdditionOrUpdate" [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Batches Index Tasks: -doggos [0,1,2,3,4,5,6,7,8,9,] ----------------------------------------------------------------------- -### Batches Enqueued At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### Batches Started At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### Batches Finished At: -[timestamp] [0,] -[timestamp] [1,] -[timestamp] [2,] -[timestamp] [3,] -[timestamp] [4,] -[timestamp] [5,] -[timestamp] [6,] -[timestamp] [7,] -[timestamp] [8,] -[timestamp] [9,] ----------------------------------------------------------------------- -### File Store: - ----------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/documents.snap b/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/documents.snap deleted file mode 100644 index 5a839838d..000000000 --- a/crates/index-scheduler/src/snapshots/lib.rs/test_mixed_document_addition/documents.snap +++ /dev/null @@ -1,45 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[ - { - "id": 0, - "doggo": "bob 0" - }, - { - "id": 1, - "doggo": "bob 1" - }, - { - "id": 2, - "doggo": "bob 2" - }, - { - "id": 3, - "doggo": "bob 3" - }, - { - "id": 4, - "doggo": "bob 4" - }, - { - "id": 5, - "doggo": "bob 5" - }, - { - "id": 6, - "doggo": "bob 6" - }, - { - "id": 7, - "doggo": "bob 7" - }, - { - "id": 8, - "doggo": 
"bob 8" - }, - { - "id": 9, - "doggo": "bob 9" - } -] diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs new file mode 100644 index 000000000..5c04a66ff --- /dev/null +++ b/crates/index-scheduler/src/test_utils.rs @@ -0,0 +1,428 @@ +use std::io::{BufWriter, Write}; +use std::sync::Arc; +use std::time::Duration; + +use big_s::S; +use crossbeam_channel::RecvTimeoutError; +use file_store::File; +use meilisearch_auth::open_auth_store_env; +use meilisearch_types::document_formats::DocumentFormatError; +use meilisearch_types::milli::update::IndexDocumentsMethod::ReplaceDocuments; +use meilisearch_types::milli::update::IndexerConfig; +use meilisearch_types::tasks::KindWithContent; +use meilisearch_types::{versioning, VERSION_FILE_NAME}; +use tempfile::{NamedTempFile, TempDir}; +use uuid::Uuid; +use Breakpoint::*; + +use crate::insta_snapshot::snapshot_index_scheduler; +use crate::{Error, IndexScheduler, IndexSchedulerOptions}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum Breakpoint { + // this state is only encountered while creating the scheduler in the test suite. + Init, + + Start, + BatchCreated, + AfterProcessing, + AbortedIndexation, + ProcessBatchSucceeded, + ProcessBatchFailed, + InsideProcessBatch, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum FailureLocation { + InsideCreateBatch, + InsideProcessBatch, + PanicInsideProcessBatch, + ProcessUpgrade, + AcquiringWtxn, + UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 }, + UpdatingTaskAfterProcessBatchFailure, + CommittingWtxn, +} + +impl IndexScheduler { + /// Blocks the thread until the test handle asks to progress to/through this breakpoint. + /// + /// Two messages are sent through the channel for each breakpoint. + /// The first message is `(b, false)` and the second message is `(b, true)`. + /// + /// Since the channel has a capacity of zero, the `send` and `recv` calls wait for each other. + /// So when the index scheduler calls `test_breakpoint_sdr.send(b, false)`, it blocks + /// the thread until the test catches up by calling `test_breakpoint_rcv.recv()` enough. + /// From the test side, we call `recv()` repeatedly until we find the message `(breakpoint, false)`. + /// As soon as we find it, the index scheduler is unblocked but then wait again on the call to + /// `test_breakpoint_sdr.send(b, true)`. This message will only be able to send once the + /// test asks to progress to the next `(b2, false)`. + pub(crate) fn breakpoint(&self, b: Breakpoint) { + // We send two messages. The first one will sync with the call + // to `handle.wait_until(b)`. The second one will block until the + // the next call to `handle.wait_until(..)`. + self.test_breakpoint_sdr.send((b, false)).unwrap(); + // This one will only be able to be sent if the test handle stays alive. + // If it fails, then it means that we have exited the test. + // By crashing with `unwrap`, we kill the run loop. 
+ self.test_breakpoint_sdr.send((b, true)).unwrap(); + } +} + +impl IndexScheduler { + pub(crate) fn test( + autobatching_enabled: bool, + planned_failures: Vec<(usize, FailureLocation)>, + ) -> (Self, IndexSchedulerHandle) { + Self::test_with_custom_config(planned_failures, |config| { + config.autobatching_enabled = autobatching_enabled; + None + }) + } + + pub(crate) fn test_with_custom_config( + planned_failures: Vec<(usize, FailureLocation)>, + configuration: impl Fn(&mut IndexSchedulerOptions) -> Option<(u32, u32, u32)>, + ) -> (Self, IndexSchedulerHandle) { + let tempdir = TempDir::new().unwrap(); + let (sender, receiver) = crossbeam_channel::bounded(0); + + let indexer_config = IndexerConfig { skip_index_budget: true, ..Default::default() }; + + let mut options = IndexSchedulerOptions { + version_file_path: tempdir.path().join(VERSION_FILE_NAME), + auth_path: tempdir.path().join("auth"), + tasks_path: tempdir.path().join("db_path"), + update_file_path: tempdir.path().join("file_store"), + indexes_path: tempdir.path().join("indexes"), + snapshots_path: tempdir.path().join("snapshots"), + dumps_path: tempdir.path().join("dumps"), + webhook_url: None, + webhook_authorization_header: None, + task_db_size: 1000 * 1000 * 10, // 10 MB, we don't use MiB on purpose. + index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose. + enable_mdb_writemap: false, + index_growth_amount: 1000 * 1000 * 1000 * 1000, // 1 TB + index_count: 5, + indexer_config: Arc::new(indexer_config), + autobatching_enabled: true, + cleanup_enabled: true, + max_number_of_tasks: 1_000_000, + max_number_of_batched_tasks: usize::MAX, + batched_tasks_size_limit: u64::MAX, + instance_features: Default::default(), + auto_upgrade: true, // Don't cost much and will ensure the happy path works + embedding_cache_cap: 10, + }; + let version = configuration(&mut options).unwrap_or_else(|| { + ( + versioning::VERSION_MAJOR.parse().unwrap(), + versioning::VERSION_MINOR.parse().unwrap(), + versioning::VERSION_PATCH.parse().unwrap(), + ) + }); + + std::fs::create_dir_all(&options.auth_path).unwrap(); + let auth_env = open_auth_store_env(&options.auth_path).unwrap(); + let index_scheduler = + Self::new(options, auth_env, version, sender, planned_failures).unwrap(); + + // To be 100% consistent between all test we're going to start the scheduler right now + // and ensure it's in the expected starting state. + let breakpoint = match receiver.recv_timeout(std::time::Duration::from_secs(10)) { + Ok(b) => b, + Err(RecvTimeoutError::Timeout) => { + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + } + Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), + }; + assert_eq!(breakpoint, (Init, false)); + let index_scheduler_handle = IndexSchedulerHandle { + _tempdir: tempdir, + index_scheduler: index_scheduler.private_clone(), + test_breakpoint_rcv: receiver, + last_breakpoint: breakpoint.0, + }; + + (index_scheduler, index_scheduler_handle) + } + + /// Return a [`PlannedFailure`](Error::PlannedFailure) error if a failure is planned + /// for the given location and current run loop iteration. 
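+    ///
+    /// A hypothetical call site (not the actual scheduler code) would look like
+    /// `self.maybe_fail(FailureLocation::CommittingWtxn)?;`, placed right before the
+    /// write transaction is committed, so tests can exercise that error path deterministically.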
+    pub(crate) fn maybe_fail(&self, location: FailureLocation) -> crate::Result<()> {
+        if self.planned_failures.contains(&(*self.run_loop_iteration.read().unwrap(), location)) {
+            match location {
+                FailureLocation::PanicInsideProcessBatch => {
+                    panic!("simulated panic")
+                }
+                _ => Err(Error::PlannedFailure),
+            }
+        } else {
+            Ok(())
+        }
+    }
+}
+
+/// Return a `KindWithContent::IndexCreation` task
+pub(crate) fn index_creation_task(
+    index: &'static str,
+    primary_key: &'static str,
+) -> KindWithContent {
+    KindWithContent::IndexCreation { index_uid: S(index), primary_key: Some(S(primary_key)) }
+}
+
+/// Create a `KindWithContent::DocumentAdditionOrUpdate` task that imports documents.
+///
+/// - `index_uid` is given as parameter
+/// - `primary_key` is given as parameter
+/// - `method` is set to `ReplaceDocuments`
+/// - `content_file` is given as parameter
+/// - `documents_count` is given as parameter
+/// - `allow_index_creation` is set to `true`
+pub(crate) fn replace_document_import_task(
+    index: &'static str,
+    primary_key: Option<&'static str>,
+    content_file_uuid: u128,
+    documents_count: u64,
+) -> KindWithContent {
+    KindWithContent::DocumentAdditionOrUpdate {
+        index_uid: S(index),
+        primary_key: primary_key.map(ToOwned::to_owned),
+        method: ReplaceDocuments,
+        content_file: Uuid::from_u128(content_file_uuid),
+        documents_count,
+        allow_index_creation: true,
+    }
+}
+
+/// Adapter for the new JSON reading interface
+pub(crate) fn read_json(
+    bytes: &[u8],
+    write: impl Write,
+) -> std::result::Result<u64, DocumentFormatError> {
+    let temp_file = NamedTempFile::new().unwrap();
+    let mut buffer = BufWriter::new(temp_file.reopen().unwrap());
+    buffer.write_all(bytes).unwrap();
+    buffer.flush().unwrap();
+    meilisearch_types::document_formats::read_json(temp_file.as_file(), write)
+}
+
+/// Create an update file with the given file uuid.
+///
+/// The update file contains just one simple document whose id is given by `document_id`.
+///
+/// The uuid of the file and its documents count is returned.
+pub(crate) fn sample_documents(
+    index_scheduler: &IndexScheduler,
+    file_uuid: u128,
+    document_id: usize,
+) -> (File, u64) {
+    let content = format!(
+        r#"
+        {{
+            "id" : "{document_id}"
+        }}"#
+    );
+
+    let (_uuid, mut file) = index_scheduler.queue.create_update_file_with_uuid(file_uuid).unwrap();
+    let documents_count = read_json(content.as_bytes(), &mut file).unwrap();
+    (file, documents_count)
+}
+
+pub struct IndexSchedulerHandle {
+    _tempdir: TempDir,
+    index_scheduler: IndexScheduler,
+    test_breakpoint_rcv: crossbeam_channel::Receiver<(Breakpoint, bool)>,
+    last_breakpoint: Breakpoint,
+}
+
+impl IndexSchedulerHandle {
+    /// Restarts the index-scheduler on the same database.
+    /// To use this function you must give back the index-scheduler that was given to you when
+    /// creating the handle the first time.
+    /// If the index-scheduler has been cloned in the meantime you must drop all copies, otherwise
+    /// the function will panic.
+ pub(crate) fn restart( + self, + index_scheduler: IndexScheduler, + autobatching_enabled: bool, + planned_failures: Vec<(usize, FailureLocation)>, + configuration: impl Fn(&mut IndexSchedulerOptions) -> Option<(u32, u32, u32)>, + ) -> (IndexScheduler, Self) { + drop(index_scheduler); + let Self { _tempdir: tempdir, index_scheduler, test_breakpoint_rcv, last_breakpoint: _ } = + self; + let env = index_scheduler.env.clone(); + drop(index_scheduler); + + // We must ensure that the `run` function has stopped running before restarting the index scheduler + loop { + match test_breakpoint_rcv.recv_timeout(Duration::from_secs(5)) { + Ok((_, true)) => continue, + Ok((b, false)) => { + panic!("Scheduler is not stopped and passed {b:?}") + } + Err(RecvTimeoutError::Timeout) => panic!("The indexing loop is stuck somewhere"), + Err(RecvTimeoutError::Disconnected) => break, + } + } + let closed = env.prepare_for_closing().wait_timeout(Duration::from_secs(5)); + assert!(closed, "The index scheduler couldn't close itself, it seems like someone else is holding the env somewhere"); + + let (scheduler, mut handle) = + IndexScheduler::test_with_custom_config(planned_failures, |config| { + let version = configuration(config); + config.autobatching_enabled = autobatching_enabled; + config.version_file_path = tempdir.path().join(VERSION_FILE_NAME); + config.auth_path = tempdir.path().join("auth"); + config.tasks_path = tempdir.path().join("db_path"); + config.update_file_path = tempdir.path().join("file_store"); + config.indexes_path = tempdir.path().join("indexes"); + config.snapshots_path = tempdir.path().join("snapshots"); + config.dumps_path = tempdir.path().join("dumps"); + version + }); + handle._tempdir = tempdir; + (scheduler, handle) + } + + /// Advance the scheduler to the next tick. + /// Panic + /// * If the scheduler is waiting for a task to be registered. + /// * If the breakpoint queue is in a bad state. + #[track_caller] + pub(crate) fn advance(&mut self) -> Breakpoint { + let (breakpoint_1, b) = match self + .test_breakpoint_rcv + .recv_timeout(std::time::Duration::from_secs(50)) + { + Ok(b) => b, + Err(RecvTimeoutError::Timeout) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") + } + }; + // if we've already encountered a breakpoint we're supposed to be stuck on the false + // and we expect the same variant with the true to come now. + assert_eq!( + (breakpoint_1, b), + (self.last_breakpoint, true), + "Internal error in the test suite. In the previous iteration I got `({:?}, false)` and now I got `({:?}, {:?})`.", + self.last_breakpoint, + breakpoint_1, + b, + ); + + let (breakpoint_2, b) = match self + .test_breakpoint_rcv + .recv_timeout(std::time::Duration::from_secs(50)) + { + Ok(b) => b, + Err(RecvTimeoutError::Timeout) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") + } + }; + assert!(!b, "Found the breakpoint handle in a bad state. 
Check your test suite");
+
+        self.last_breakpoint = breakpoint_2;
+
+        breakpoint_2
+    }
+
+    /// Advance the scheduler until all the provided breakpoints are reached in order.
+    #[track_caller]
+    pub(crate) fn advance_till(&mut self, breakpoints: impl IntoIterator<Item = Breakpoint>) {
+        for breakpoint in breakpoints {
+            let b = self.advance();
+            assert_eq!(
+                b,
+                breakpoint,
+                "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{}",
+                breakpoint,
+                b,
+                snapshot_index_scheduler(&self.index_scheduler)
+            );
+        }
+    }
+
+    /// Wait for `n` successful batches.
+    #[track_caller]
+    pub(crate) fn advance_n_successful_batches(&mut self, n: usize) {
+        for _ in 0..n {
+            self.advance_one_successful_batch();
+        }
+    }
+
+    /// Wait for `n` failed batches.
+    #[track_caller]
+    pub(crate) fn advance_n_failed_batches(&mut self, n: usize) {
+        for _ in 0..n {
+            self.advance_one_failed_batch();
+        }
+    }
+
+    // Wait for one successful batch.
+    #[track_caller]
+    pub(crate) fn advance_one_successful_batch(&mut self) {
+        self.advance_till([Start, BatchCreated]);
+        loop {
+            match self.advance() {
+                // the process_batch function can call itself recursively, thus we need to
+                // accept as many InsideProcessBatch as possible before moving to the next state.
+                InsideProcessBatch => (),
+                // the batch succeeded, we can stop the loop and go on with the next states.
+                ProcessBatchSucceeded => break,
+                AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
+                ProcessBatchFailed => {
+                    while self.advance() != Start {}
+                    panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler))
+                },
+                breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
+            }
+        }
+
+        self.advance_till([AfterProcessing]);
+    }
+
+    // Wait for one failed batch.
+    #[track_caller]
+    pub(crate) fn advance_one_failed_batch(&mut self) {
+        self.advance_till([Start, BatchCreated]);
+        loop {
+            match self.advance() {
+                // the process_batch function can call itself recursively, thus we need to
+                // accept as many InsideProcessBatch as possible before moving to the next state.
+                InsideProcessBatch => (),
+                // the batch failed, we can stop the loop and go on with the next states.
+                ProcessBatchFailed => break,
+                ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to, sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)),
+                AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)),
+                breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint),
+            }
+        }
+        self.advance_till([AfterProcessing]);
+    }
+
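+    // Taken together, a test typically drives the scheduler like this (hypothetical sketch,
+    // mirroring the tests in `test_failure.rs` above):
+    //
+    //     let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+    //     let _task = index_scheduler
+    //         .register(index_creation_task("catto", "mouse"), None, false)
+    //         .unwrap();
+    //     handle.advance_one_successful_batch();
+
+    // Assert that the scheduler is down: it must not emit any new breakpoint.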
+ #[track_caller] + pub(crate) fn scheduler_is_down(&mut self) { + loop { + match self + .test_breakpoint_rcv + .recv_timeout(std::time::Duration::from_secs(1)) { + Ok((_, true)) => continue, + Ok((b, false)) => panic!("The scheduler was supposed to be down but successfully moved to the next breakpoint: {b:?}"), + Err(RecvTimeoutError::Timeout | RecvTimeoutError::Disconnected) => break, + } + } + } +} diff --git a/crates/index-scheduler/src/upgrade/mod.rs b/crates/index-scheduler/src/upgrade/mod.rs new file mode 100644 index 000000000..4a3cb2f75 --- /dev/null +++ b/crates/index-scheduler/src/upgrade/mod.rs @@ -0,0 +1,113 @@ +use anyhow::bail; +use meilisearch_types::heed::{Env, RwTxn, WithoutTls}; +use meilisearch_types::tasks::{Details, KindWithContent, Status, Task}; +use meilisearch_types::versioning::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use time::OffsetDateTime; +use tracing::info; + +use crate::queue::TaskQueue; +use crate::versioning::Versioning; + +trait UpgradeIndexScheduler { + fn upgrade( + &self, + env: &Env, + wtxn: &mut RwTxn, + original: (u32, u32, u32), + ) -> anyhow::Result<()>; + fn target_version(&self) -> (u32, u32, u32); +} + +pub fn upgrade_index_scheduler( + env: &Env, + versioning: &Versioning, + from: (u32, u32, u32), + to: (u32, u32, u32), +) -> anyhow::Result<()> { + let current_major = to.0; + let current_minor = to.1; + let current_patch = to.2; + + let upgrade_functions: &[&dyn UpgradeIndexScheduler] = &[&ToCurrentNoOp {}]; + + let start = match from { + (1, 12, _) => 0, + (1, 13, _) => 0, + (1, 14, _) => 0, + (major, minor, patch) => { + if major > current_major + || (major == current_major && minor > current_minor) + || (major == current_major && minor == current_minor && patch > current_patch) + { + bail!( + "Database version {major}.{minor}.{patch} is higher than the Meilisearch version {current_major}.{current_minor}.{current_patch}. Downgrade is not supported", + ); + } else if major < 1 || (major == current_major && minor < 12) { + bail!( + "Database version {major}.{minor}.{patch} is too old for the experimental dumpless upgrade feature. 
Please generate a dump using the v{major}.{minor}.{patch} and import it in the v{current_major}.{current_minor}.{current_patch}", + ); + } else { + bail!("Unknown database version: v{major}.{minor}.{patch}"); + } + } + }; + + info!("Upgrading the task queue"); + let mut local_from = from; + for upgrade in upgrade_functions[start..].iter() { + let target = upgrade.target_version(); + info!( + "Upgrading from v{}.{}.{} to v{}.{}.{}", + local_from.0, local_from.1, local_from.2, target.0, target.1, target.2 + ); + let mut wtxn = env.write_txn()?; + upgrade.upgrade(env, &mut wtxn, local_from)?; + versioning.set_version(&mut wtxn, target)?; + wtxn.commit()?; + local_from = target; + } + + let mut wtxn = env.write_txn()?; + let queue = TaskQueue::new(env, &mut wtxn)?; + let uid = queue.next_task_id(&wtxn)?; + queue.register( + &mut wtxn, + &Task { + uid, + batch_uid: None, + enqueued_at: OffsetDateTime::now_utc(), + started_at: None, + finished_at: None, + error: None, + canceled_by: None, + details: Some(Details::UpgradeDatabase { from, to }), + status: Status::Enqueued, + kind: KindWithContent::UpgradeDatabase { from }, + }, + )?; + wtxn.commit()?; + + Ok(()) +} + +#[allow(non_camel_case_types)] +struct ToCurrentNoOp {} + +impl UpgradeIndexScheduler for ToCurrentNoOp { + fn upgrade( + &self, + _env: &Env, + _wtxn: &mut RwTxn, + _original: (u32, u32, u32), + ) -> anyhow::Result<()> { + Ok(()) + } + + fn target_version(&self) -> (u32, u32, u32) { + ( + VERSION_MAJOR.parse().unwrap(), + VERSION_MINOR.parse().unwrap(), + VERSION_PATCH.parse().unwrap(), + ) + } +} diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 1ca782f8c..42bf253ad 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -3,16 +3,15 @@ use std::collections::{BTreeSet, HashSet}; use std::ops::Bound; -use meilisearch_types::batches::{Batch, BatchId, BatchStats}; -use meilisearch_types::heed::types::DecodeIgnore; +use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats}; use meilisearch_types::heed::{Database, RoTxn, RwTxn}; use meilisearch_types::milli::CboRoaringBitmapCodec; use meilisearch_types::task_view::DetailsView; use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status}; -use roaring::{MultiOps, RoaringBitmap}; +use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{Error, IndexScheduler, ProcessingTasks, Result, Task, TaskId, BEI128}; +use crate::{Error, Result, Task, TaskId, BEI128}; /// This structure contains all the information required to write a batch in the database without reading the tasks. /// It'll stay in RAM so it must be small. @@ -22,7 +21,7 @@ use crate::{Error, IndexScheduler, ProcessingTasks, Result, Task, TaskId, BEI128 /// 3. Call `finished` once the batch has been processed. /// 4. Call `update` on all the tasks. 
#[derive(Debug, Clone)] -pub(crate) struct ProcessingBatch { +pub struct ProcessingBatch { pub uid: BatchId, pub details: DetailsView, pub stats: BatchStats, @@ -31,8 +30,7 @@ pub(crate) struct ProcessingBatch { pub kinds: HashSet, pub indexes: HashSet, pub canceled_by: HashSet, - pub oldest_enqueued_at: Option, - pub earliest_enqueued_at: Option, + pub enqueued_at: Option, pub started_at: OffsetDateTime, pub finished_at: Option, } @@ -52,8 +50,7 @@ impl ProcessingBatch { kinds: HashSet::default(), indexes: HashSet::default(), canceled_by: HashSet::default(), - oldest_enqueued_at: None, - earliest_enqueued_at: None, + enqueued_at: None, started_at: OffsetDateTime::now_utc(), finished_at: None, } @@ -67,7 +64,7 @@ impl ProcessingBatch { task.batch_uid = Some(self.uid); // We don't store the statuses in the map since they're all enqueued but we must // still store them in the stats since that can be displayed. - *self.stats.status.entry(task.status).or_default() += 1; + *self.stats.status.entry(Status::Processing).or_default() += 1; self.kinds.insert(task.kind.as_kind()); *self.stats.types.entry(task.kind.as_kind()).or_default() += 1; @@ -81,14 +78,18 @@ impl ProcessingBatch { if let Some(canceled_by) = task.canceled_by { self.canceled_by.insert(canceled_by); } - self.oldest_enqueued_at = - Some(self.oldest_enqueued_at.map_or(task.enqueued_at, |oldest_enqueued_at| { - task.enqueued_at.min(oldest_enqueued_at) - })); - self.earliest_enqueued_at = - Some(self.earliest_enqueued_at.map_or(task.enqueued_at, |earliest_enqueued_at| { - task.enqueued_at.max(earliest_enqueued_at) - })); + match self.enqueued_at.as_mut() { + Some(BatchEnqueuedAt { earliest, oldest }) => { + *oldest = task.enqueued_at.min(*oldest); + *earliest = task.enqueued_at.max(*earliest); + } + None => { + self.enqueued_at = Some(BatchEnqueuedAt { + earliest: task.enqueued_at, + oldest: task.enqueued_at, + }); + } + } } } @@ -106,7 +107,7 @@ impl ProcessingBatch { self.stats.total_nb_tasks = 0; } - /// Update the timestamp of the tasks and the inner structure of this sturcture. + /// Update the timestamp of the tasks and the inner structure of this structure. pub fn update(&mut self, task: &mut Task) { // We must re-set this value in case we're dealing with a task that has been added between // the `processing` and `finished` state @@ -134,354 +135,16 @@ impl ProcessingBatch { pub fn to_batch(&self) -> Batch { Batch { uid: self.uid, + progress: None, details: self.details.clone(), stats: self.stats.clone(), started_at: self.started_at, finished_at: self.finished_at, + enqueued_at: self.enqueued_at, } } } -impl IndexScheduler { - pub(crate) fn all_task_ids(&self, rtxn: &RoTxn) -> Result { - enum_iterator::all().map(|s| self.get_status(rtxn, s)).union() - } - - pub(crate) fn all_batch_ids(&self, rtxn: &RoTxn) -> Result { - enum_iterator::all().map(|s| self.get_batch_status(rtxn, s)).union() - } - - pub(crate) fn last_task_id(&self, rtxn: &RoTxn) -> Result> { - Ok(self.all_tasks.remap_data_type::().last(rtxn)?.map(|(k, _)| k + 1)) - } - - pub(crate) fn next_task_id(&self, rtxn: &RoTxn) -> Result { - Ok(self.last_task_id(rtxn)?.unwrap_or_default()) - } - - pub(crate) fn next_batch_id(&self, rtxn: &RoTxn) -> Result { - Ok(self - .all_batches - .remap_data_type::() - .last(rtxn)? - .map(|(k, _)| k + 1) - .unwrap_or_default()) - } - - pub(crate) fn get_task(&self, rtxn: &RoTxn, task_id: TaskId) -> Result> { - Ok(self.all_tasks.get(rtxn, &task_id)?) 
- } - - pub(crate) fn get_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result> { - Ok(self.all_batches.get(rtxn, &batch_id)?) - } - - pub(crate) fn write_batch( - &self, - wtxn: &mut RwTxn, - batch: ProcessingBatch, - tasks: &RoaringBitmap, - ) -> Result<()> { - self.all_batches.put( - wtxn, - &batch.uid, - &Batch { - uid: batch.uid, - details: batch.details, - stats: batch.stats, - started_at: batch.started_at, - finished_at: batch.finished_at, - }, - )?; - self.batch_to_tasks_mapping.put(wtxn, &batch.uid, tasks)?; - - for status in batch.statuses { - self.update_batch_status(wtxn, status, |bitmap| { - bitmap.insert(batch.uid); - })?; - } - - for kind in batch.kinds { - self.update_batch_kind(wtxn, kind, |bitmap| { - bitmap.insert(batch.uid); - })?; - } - - for index in batch.indexes { - self.update_batch_index(wtxn, &index, |bitmap| { - bitmap.insert(batch.uid); - })?; - } - - if let Some(enqueued_at) = batch.oldest_enqueued_at { - insert_task_datetime(wtxn, self.batch_enqueued_at, enqueued_at, batch.uid)?; - } - if let Some(enqueued_at) = batch.earliest_enqueued_at { - insert_task_datetime(wtxn, self.batch_enqueued_at, enqueued_at, batch.uid)?; - } - insert_task_datetime(wtxn, self.batch_started_at, batch.started_at, batch.uid)?; - insert_task_datetime(wtxn, self.batch_finished_at, batch.finished_at.unwrap(), batch.uid)?; - - Ok(()) - } - - /// Convert an iterator to a `Vec` of tasks and edit the `ProcessingBatch` to add the given tasks. - /// - /// The tasks MUST exist, or a `CorruptedTaskQueue` error will be thrown. - pub(crate) fn get_existing_tasks_for_processing_batch( - &self, - rtxn: &RoTxn, - processing_batch: &mut ProcessingBatch, - tasks: impl IntoIterator, - ) -> Result> { - tasks - .into_iter() - .map(|task_id| { - let mut task = self - .get_task(rtxn, task_id) - .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)); - processing_batch.processing(&mut task); - task - }) - .collect::>() - } - - /// Convert an iterator to a `Vec` of tasks. The tasks MUST exist or a - /// `CorruptedTaskQueue` error will be thrown. - pub(crate) fn get_existing_tasks( - &self, - rtxn: &RoTxn, - tasks: impl IntoIterator, - ) -> Result> { - tasks - .into_iter() - .map(|task_id| { - self.get_task(rtxn, task_id).and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) - }) - .collect::>() - } - - /// Convert an iterator to a `Vec` of batches. The batches MUST exist or a - /// `CorruptedTaskQueue` error will be thrown. 
- pub(crate) fn get_existing_batches( - &self, - rtxn: &RoTxn, - processing: &ProcessingTasks, - tasks: impl IntoIterator, - ) -> Result> { - tasks - .into_iter() - .map(|batch_id| { - if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) { - Ok(processing.batch.as_ref().unwrap().to_batch()) - } else { - self.get_batch(rtxn, batch_id) - .and_then(|task| task.ok_or(Error::CorruptedTaskQueue)) - } - }) - .collect::>() - } - - pub(crate) fn update_task(&self, wtxn: &mut RwTxn, task: &Task) -> Result<()> { - let old_task = self.get_task(wtxn, task.uid)?.ok_or(Error::CorruptedTaskQueue)?; - - debug_assert!(old_task != *task); - debug_assert_eq!(old_task.uid, task.uid); - debug_assert!(old_task.batch_uid.is_none() && task.batch_uid.is_some()); - - if old_task.status != task.status { - self.update_status(wtxn, old_task.status, |bitmap| { - bitmap.remove(task.uid); - })?; - self.update_status(wtxn, task.status, |bitmap| { - bitmap.insert(task.uid); - })?; - } - - if old_task.kind.as_kind() != task.kind.as_kind() { - self.update_kind(wtxn, old_task.kind.as_kind(), |bitmap| { - bitmap.remove(task.uid); - })?; - self.update_kind(wtxn, task.kind.as_kind(), |bitmap| { - bitmap.insert(task.uid); - })?; - } - - assert_eq!( - old_task.enqueued_at, task.enqueued_at, - "Cannot update a task's enqueued_at time" - ); - if old_task.started_at != task.started_at { - assert!(old_task.started_at.is_none(), "Cannot update a task's started_at time"); - if let Some(started_at) = task.started_at { - insert_task_datetime(wtxn, self.started_at, started_at, task.uid)?; - } - } - if old_task.finished_at != task.finished_at { - assert!(old_task.finished_at.is_none(), "Cannot update a task's finished_at time"); - if let Some(finished_at) = task.finished_at { - insert_task_datetime(wtxn, self.finished_at, finished_at, task.uid)?; - } - } - - self.all_tasks.put(wtxn, &task.uid, task)?; - Ok(()) - } - - /// Returns the whole set of tasks that belongs to this batch. - pub(crate) fn tasks_in_batch(&self, rtxn: &RoTxn, batch_id: BatchId) -> Result { - Ok(self.batch_to_tasks_mapping.get(rtxn, &batch_id)?.unwrap_or_default()) - } - - /// Returns the whole set of tasks that belongs to this index. - pub(crate) fn index_tasks(&self, rtxn: &RoTxn, index: &str) -> Result { - Ok(self.index_tasks.get(rtxn, index)?.unwrap_or_default()) - } - - pub(crate) fn update_index( - &self, - wtxn: &mut RwTxn, - index: &str, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut tasks = self.index_tasks(wtxn, index)?; - f(&mut tasks); - if tasks.is_empty() { - self.index_tasks.delete(wtxn, index)?; - } else { - self.index_tasks.put(wtxn, index, &tasks)?; - } - - Ok(()) - } - - /// Returns the whole set of batches that belongs to this index. 
- pub(crate) fn index_batches(&self, rtxn: &RoTxn, index: &str) -> Result { - Ok(self.batch_index_tasks.get(rtxn, index)?.unwrap_or_default()) - } - - pub(crate) fn update_batch_index( - &self, - wtxn: &mut RwTxn, - index: &str, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut batches = self.index_batches(wtxn, index)?; - f(&mut batches); - if batches.is_empty() { - self.batch_index_tasks.delete(wtxn, index)?; - } else { - self.batch_index_tasks.put(wtxn, index, &batches)?; - } - - Ok(()) - } - - pub(crate) fn get_status(&self, rtxn: &RoTxn, status: Status) -> Result { - Ok(self.status.get(rtxn, &status)?.unwrap_or_default()) - } - - pub(crate) fn put_status( - &self, - wtxn: &mut RwTxn, - status: Status, - bitmap: &RoaringBitmap, - ) -> Result<()> { - Ok(self.status.put(wtxn, &status, bitmap)?) - } - - pub(crate) fn update_status( - &self, - wtxn: &mut RwTxn, - status: Status, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut tasks = self.get_status(wtxn, status)?; - f(&mut tasks); - self.put_status(wtxn, status, &tasks)?; - - Ok(()) - } - - pub(crate) fn get_batch_status(&self, rtxn: &RoTxn, status: Status) -> Result { - Ok(self.batch_status.get(rtxn, &status)?.unwrap_or_default()) - } - - pub(crate) fn put_batch_status( - &self, - wtxn: &mut RwTxn, - status: Status, - bitmap: &RoaringBitmap, - ) -> Result<()> { - Ok(self.batch_status.put(wtxn, &status, bitmap)?) - } - - pub(crate) fn update_batch_status( - &self, - wtxn: &mut RwTxn, - status: Status, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut tasks = self.get_batch_status(wtxn, status)?; - f(&mut tasks); - self.put_batch_status(wtxn, status, &tasks)?; - - Ok(()) - } - - pub(crate) fn get_kind(&self, rtxn: &RoTxn, kind: Kind) -> Result { - Ok(self.kind.get(rtxn, &kind)?.unwrap_or_default()) - } - - pub(crate) fn put_kind( - &self, - wtxn: &mut RwTxn, - kind: Kind, - bitmap: &RoaringBitmap, - ) -> Result<()> { - Ok(self.kind.put(wtxn, &kind, bitmap)?) - } - - pub(crate) fn update_kind( - &self, - wtxn: &mut RwTxn, - kind: Kind, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut tasks = self.get_kind(wtxn, kind)?; - f(&mut tasks); - self.put_kind(wtxn, kind, &tasks)?; - - Ok(()) - } - - pub(crate) fn get_batch_kind(&self, rtxn: &RoTxn, kind: Kind) -> Result { - Ok(self.batch_kind.get(rtxn, &kind)?.unwrap_or_default()) - } - - pub(crate) fn put_batch_kind( - &self, - wtxn: &mut RwTxn, - kind: Kind, - bitmap: &RoaringBitmap, - ) -> Result<()> { - Ok(self.batch_kind.put(wtxn, &kind, bitmap)?) - } - - pub(crate) fn update_batch_kind( - &self, - wtxn: &mut RwTxn, - kind: Kind, - f: impl Fn(&mut RoaringBitmap), - ) -> Result<()> { - let mut tasks = self.get_batch_kind(wtxn, kind)?; - f(&mut tasks); - self.put_batch_kind(wtxn, kind, &tasks)?; - - Ok(()) - } -} - pub(crate) fn insert_task_datetime( wtxn: &mut RwTxn, database: Database, @@ -514,6 +177,33 @@ pub(crate) fn remove_task_datetime( Ok(()) } +pub(crate) fn remove_n_tasks_datetime_earlier_than( + wtxn: &mut RwTxn, + database: Database, + earlier_than: OffsetDateTime, + mut count: usize, + task_id: TaskId, +) -> Result<()> { + let earlier_than = earlier_than.unix_timestamp_nanos(); + let mut iter = database.rev_range_mut(wtxn, &(..earlier_than))?; + while let Some((current, mut existing)) = iter.next().transpose()? { + count -= existing.remove(task_id) as usize; + + if existing.is_empty() { + // safety: We don't keep references to the database + unsafe { iter.del_current()? 
}; + } else { + // safety: We don't keep references to the database + unsafe { iter.put_current(¤t, &existing)? }; + } + if count == 0 { + break; + } + } + + Ok(()) +} + pub(crate) fn keep_ids_within_datetimes( rtxn: &RoTxn, ids: &mut RoaringBitmap, @@ -574,6 +264,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } + | K::UpgradeDatabase { .. } | K::SnapshotCreation => (), }; if let Some(Details::IndexSwap { swaps }) = &mut task.details { @@ -647,11 +338,11 @@ pub fn clamp_to_page_size(size: usize) -> usize { } #[cfg(test)] -impl IndexScheduler { +impl crate::IndexScheduler { /// Asserts that the index scheduler's content is internally consistent. pub fn assert_internally_consistent(&self) { let rtxn = self.env.read_txn().unwrap(); - for task in self.all_tasks.iter(&rtxn).unwrap() { + for task in self.queue.tasks.all_tasks.iter(&rtxn).unwrap() { let (task_id, task) = task.unwrap(); let task_index_uid = task.index_uid().map(ToOwned::to_owned); @@ -668,27 +359,50 @@ impl IndexScheduler { kind, } = task; assert_eq!(uid, task.uid); - if let Some(ref batch) = batch_uid { + if task.status != Status::Enqueued { + let batch_uid = batch_uid.expect("All non enqueued tasks must be part of a batch"); assert!(self + .queue .batch_to_tasks_mapping - .get(&rtxn, batch) + .get(&rtxn, &batch_uid) .unwrap() .unwrap() .contains(uid)); + let batch = self.queue.batches.get_batch(&rtxn, batch_uid).unwrap().unwrap(); + assert_eq!(batch.uid, batch_uid); + if task.status == Status::Processing { + assert!(batch.progress.is_some()); + } else { + assert!(batch.progress.is_none()); + } + assert_eq!(batch.started_at, task.started_at.unwrap()); + assert_eq!(batch.finished_at, task.finished_at); + let enqueued_at = batch.enqueued_at.unwrap(); + assert!(task.enqueued_at >= enqueued_at.oldest); + assert!(task.enqueued_at <= enqueued_at.earliest); } if let Some(task_index_uid) = &task_index_uid { assert!(self + .queue + .tasks .index_tasks .get(&rtxn, task_index_uid.as_str()) .unwrap() .unwrap() .contains(task.uid)); } - let db_enqueued_at = - self.enqueued_at.get(&rtxn, &enqueued_at.unix_timestamp_nanos()).unwrap().unwrap(); + let db_enqueued_at = self + .queue + .tasks + .enqueued_at + .get(&rtxn, &enqueued_at.unix_timestamp_nanos()) + .unwrap() + .unwrap(); assert!(db_enqueued_at.contains(task_id)); if let Some(started_at) = started_at { let db_started_at = self + .queue + .tasks .started_at .get(&rtxn, &started_at.unix_timestamp_nanos()) .unwrap() @@ -697,6 +411,8 @@ impl IndexScheduler { } if let Some(finished_at) = finished_at { let db_finished_at = self + .queue + .tasks .finished_at .get(&rtxn, &finished_at.unix_timestamp_nanos()) .unwrap() @@ -704,9 +420,11 @@ impl IndexScheduler { assert!(db_finished_at.contains(task_id)); } if let Some(canceled_by) = canceled_by { - let db_canceled_tasks = self.get_status(&rtxn, Status::Canceled).unwrap(); + let db_canceled_tasks = + self.queue.tasks.get_status(&rtxn, Status::Canceled).unwrap(); assert!(db_canceled_tasks.contains(uid)); - let db_canceling_task = self.get_task(&rtxn, canceled_by).unwrap().unwrap(); + let db_canceling_task = + self.queue.tasks.get_task(&rtxn, canceled_by).unwrap().unwrap(); assert_eq!(db_canceling_task.status, Status::Succeeded); match db_canceling_task.kind { KindWithContent::TaskCancelation { query: _, tasks } => { @@ -766,7 +484,9 @@ impl IndexScheduler { Details::IndexInfo { primary_key: pk1 } => match &kind { KindWithContent::IndexCreation { 
index_uid, primary_key: pk2 } | KindWithContent::IndexUpdate { index_uid, primary_key: pk2 } => { - self.index_tasks + self.queue + .tasks + .index_tasks .get(&rtxn, index_uid.as_str()) .unwrap() .unwrap() @@ -871,26 +591,30 @@ impl IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } + Details::UpgradeDatabase { from: _, to: _ } => { + assert_eq!(kind.as_kind(), Kind::UpgradeDatabase); + } } } - assert!(self.get_status(&rtxn, status).unwrap().contains(uid)); - assert!(self.get_kind(&rtxn, kind.as_kind()).unwrap().contains(uid)); + assert!(self.queue.tasks.get_status(&rtxn, status).unwrap().contains(uid)); + assert!(self.queue.tasks.get_kind(&rtxn, kind.as_kind()).unwrap().contains(uid)); if let KindWithContent::DocumentAdditionOrUpdate { content_file, .. } = kind { match status { Status::Enqueued | Status::Processing => { assert!(self - .file_store + .queue.file_store .all_uuids() .unwrap() .any(|uuid| uuid.as_ref().unwrap() == &content_file), "Could not find uuid `{content_file}` in the file_store. Available uuids are {:?}.", - self.file_store.all_uuids().unwrap().collect::, file_store::Error>>().unwrap(), + self.queue.file_store.all_uuids().unwrap().collect::, file_store::Error>>().unwrap(), ); } Status::Succeeded | Status::Failed | Status::Canceled => { assert!(self + .queue .file_store .all_uuids() .unwrap() diff --git a/crates/index-scheduler/src/versioning.rs b/crates/index-scheduler/src/versioning.rs new file mode 100644 index 000000000..107b8e0ba --- /dev/null +++ b/crates/index-scheduler/src/versioning.rs @@ -0,0 +1,84 @@ +use meilisearch_types::heed::types::Str; +use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn, WithoutTls}; +use meilisearch_types::milli::heed_codec::version::VersionCodec; +use meilisearch_types::versioning; + +use crate::upgrade::upgrade_index_scheduler; +use crate::Result; + +/// The number of database used by queue itself +const NUMBER_OF_DATABASES: u32 = 1; +/// Database const names for the `IndexScheduler`. +mod db_name { + pub const VERSION: &str = "version"; +} +mod entry_name { + pub const MAIN: &str = "main"; +} + +#[derive(Clone)] +pub struct Versioning { + pub version: Database, +} + +impl Versioning { + pub const fn nb_db() -> u32 { + NUMBER_OF_DATABASES + } + + pub fn get_version(&self, rtxn: &RoTxn) -> Result, heed::Error> { + self.version.get(rtxn, entry_name::MAIN) + } + + pub fn set_version( + &self, + wtxn: &mut RwTxn, + version: (u32, u32, u32), + ) -> Result<(), heed::Error> { + self.version.put(wtxn, entry_name::MAIN, &version) + } + + pub fn set_current_version(&self, wtxn: &mut RwTxn) -> Result<(), heed::Error> { + let major = versioning::VERSION_MAJOR.parse().unwrap(); + let minor = versioning::VERSION_MINOR.parse().unwrap(); + let patch = versioning::VERSION_PATCH.parse().unwrap(); + self.set_version(wtxn, (major, minor, patch)) + } + + /// Return `Self` without checking anything about the version + pub fn raw_new(env: &Env, wtxn: &mut RwTxn) -> Result { + let version = env.create_database(wtxn, Some(db_name::VERSION))?; + Ok(Self { version }) + } + + pub(crate) fn new(env: &Env, db_version: (u32, u32, u32)) -> Result { + let mut wtxn = env.write_txn()?; + let this = Self::raw_new(env, &mut wtxn)?; + let from = match this.get_version(&wtxn)? 
{ + Some(version) => version, + // fresh DB: use the db version + None => { + this.set_version(&mut wtxn, db_version)?; + db_version + } + }; + wtxn.commit()?; + + let bin_major: u32 = versioning::VERSION_MAJOR.parse().unwrap(); + let bin_minor: u32 = versioning::VERSION_MINOR.parse().unwrap(); + let bin_patch: u32 = versioning::VERSION_PATCH.parse().unwrap(); + let to = (bin_major, bin_minor, bin_patch); + + if from != to { + upgrade_index_scheduler(env, &this, from, to)?; + } + + // Once we reach this point it means the upgrade process, if there was one is entirely finished + // we can safely say we reached the latest version of the index scheduler + let mut wtxn = env.write_txn()?; + this.set_current_version(&mut wtxn)?; + wtxn.commit()?; + + Ok(this) + } +} diff --git a/crates/meili-snap/Cargo.toml b/crates/meili-snap/Cargo.toml index aee6b497f..0c48ff824 100644 --- a/crates/meili-snap/Cargo.toml +++ b/crates/meili-snap/Cargo.toml @@ -14,4 +14,4 @@ license.workspace = true # fixed version due to format breakages in v1.40 insta = { version = "=1.39.0", features = ["json", "redactions"] } md5 = "0.7.0" -once_cell = "1.19" +once_cell = "1.20" diff --git a/crates/meilisearch-auth/Cargo.toml b/crates/meilisearch-auth/Cargo.toml index ae0095ab4..d31effd6e 100644 --- a/crates/meilisearch-auth/Cargo.toml +++ b/crates/meilisearch-auth/Cargo.toml @@ -17,10 +17,10 @@ hmac = "0.12.1" maplit = "1.0.2" meilisearch-types = { path = "../meilisearch-types" } rand = "0.8.5" -roaring = { version = "0.10.6", features = ["serde"] } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } +roaring = { version = "0.10.10", features = ["serde"] } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order"] } sha2 = "0.10.8" -thiserror = "1.0.61" -time = { version = "0.3.36", features = ["serde-well-known", "formatting", "parsing", "macros"] } -uuid = { version = "1.10.0", features = ["serde", "v4"] } +thiserror = "2.0.9" +time = { version = "0.3.37", features = ["serde-well-known", "formatting", "parsing", "macros"] } +uuid = { version = "1.11.0", features = ["serde", "v4"] } diff --git a/crates/meilisearch-auth/src/dump.rs b/crates/meilisearch-auth/src/dump.rs index 8a215d8ae..6a71b636e 100644 --- a/crates/meilisearch-auth/src/dump.rs +++ b/crates/meilisearch-auth/src/dump.rs @@ -2,6 +2,7 @@ use std::fs::File; use std::io::{BufReader, Write}; use std::path::Path; +use meilisearch_types::heed::{Env, WithoutTls}; use serde_json::Deserializer; use crate::{AuthController, HeedAuthStore, Result}; @@ -9,11 +10,8 @@ use crate::{AuthController, HeedAuthStore, Result}; const KEYS_PATH: &str = "keys"; impl AuthController { - pub fn dump(src: impl AsRef, dst: impl AsRef) -> Result<()> { - let mut store = HeedAuthStore::new(&src)?; - - // do not attempt to close the database on drop! 
- store.set_drop_on_close(false); + pub fn dump(auth_env: Env, dst: impl AsRef) -> Result<()> { + let store = HeedAuthStore::new(auth_env)?; let keys_file_path = dst.as_ref().join(KEYS_PATH); @@ -27,8 +25,8 @@ impl AuthController { Ok(()) } - pub fn load_dump(src: impl AsRef, dst: impl AsRef) -> Result<()> { - let store = HeedAuthStore::new(&dst)?; + pub fn load_dump(src: impl AsRef, auth_env: Env) -> Result<()> { + let store = HeedAuthStore::new(auth_env)?; let keys_file_path = src.as_ref().join(KEYS_PATH); diff --git a/crates/meilisearch-auth/src/lib.rs b/crates/meilisearch-auth/src/lib.rs index 4dbf1bf6f..01c986d9f 100644 --- a/crates/meilisearch-auth/src/lib.rs +++ b/crates/meilisearch-auth/src/lib.rs @@ -3,11 +3,10 @@ pub mod error; mod store; use std::collections::{HashMap, HashSet}; -use std::path::Path; -use std::sync::Arc; use error::{AuthControllerError, Result}; use maplit::hashset; +use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::{Action, CreateApiKey, Key, PatchApiKey}; use meilisearch_types::milli::update::Setting; @@ -19,19 +18,19 @@ use uuid::Uuid; #[derive(Clone)] pub struct AuthController { - store: Arc, + store: HeedAuthStore, master_key: Option, } impl AuthController { - pub fn new(db_path: impl AsRef, master_key: &Option) -> Result { - let store = HeedAuthStore::new(db_path)?; + pub fn new(auth_env: Env, master_key: &Option) -> Result { + let store = HeedAuthStore::new(auth_env)?; if store.is_empty()? { generate_default_keys(&store)?; } - Ok(Self { store: Arc::new(store), master_key: master_key.clone() }) + Ok(Self { store, master_key: master_key.clone() }) } /// Return `Ok(())` if the auth controller is able to access one of its database. diff --git a/crates/meilisearch-auth/src/store.rs b/crates/meilisearch-auth/src/store.rs index 62aa445fa..167b3b503 100644 --- a/crates/meilisearch-auth/src/store.rs +++ b/crates/meilisearch-auth/src/store.rs @@ -1,18 +1,16 @@ use std::borrow::Cow; use std::cmp::Reverse; use std::collections::HashSet; -use std::fs::create_dir_all; use std::path::Path; use std::result::Result as StdResult; use std::str; use std::str::FromStr; -use std::sync::Arc; use hmac::{Hmac, Mac}; -use meilisearch_types::heed::BoxedError; +use meilisearch_types::heed::{BoxedError, WithoutTls}; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::KeyId; -use meilisearch_types::milli; +use meilisearch_types::milli::heed; use meilisearch_types::milli::heed::types::{Bytes, DecodeIgnore, SerdeJson}; use meilisearch_types::milli::heed::{Database, Env, EnvOpenOptions, RwTxn}; use sha2::Sha256; @@ -25,44 +23,32 @@ use super::error::{AuthControllerError, Result}; use super::{Action, Key}; const AUTH_STORE_SIZE: usize = 1_073_741_824; //1GiB -const AUTH_DB_PATH: &str = "auth"; const KEY_DB_NAME: &str = "api-keys"; const KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME: &str = "keyid-action-index-expiration"; #[derive(Clone)] pub struct HeedAuthStore { - env: Arc, + env: Env, keys: Database>, action_keyid_index_expiration: Database>>, - should_close_on_drop: bool, } -impl Drop for HeedAuthStore { - fn drop(&mut self) { - if self.should_close_on_drop && Arc::strong_count(&self.env) == 1 { - self.env.as_ref().clone().prepare_for_closing(); - } - } -} - -pub fn open_auth_store_env(path: &Path) -> milli::heed::Result { - let mut options = EnvOpenOptions::new(); +pub fn open_auth_store_env(path: &Path) -> heed::Result> { + let options = EnvOpenOptions::new(); + let 
mut options = options.read_txn_without_tls(); options.map_size(AUTH_STORE_SIZE); // 1GB options.max_dbs(2); unsafe { options.open(path) } } impl HeedAuthStore { - pub fn new(path: impl AsRef) -> Result { - let path = path.as_ref().join(AUTH_DB_PATH); - create_dir_all(&path)?; - let env = Arc::new(open_auth_store_env(path.as_ref())?); + pub fn new(env: Env) -> Result { let mut wtxn = env.write_txn()?; let keys = env.create_database(&mut wtxn, Some(KEY_DB_NAME))?; let action_keyid_index_expiration = env.create_database(&mut wtxn, Some(KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME))?; wtxn.commit()?; - Ok(Self { env, keys, action_keyid_index_expiration, should_close_on_drop: true }) + Ok(Self { env, keys, action_keyid_index_expiration }) } /// Return `Ok(())` if the auth store is able to access one of its database. @@ -82,10 +68,6 @@ impl HeedAuthStore { Ok(self.env.non_free_pages_size()?) } - pub fn set_drop_on_close(&mut self, v: bool) { - self.should_close_on_drop = v; - } - pub fn is_empty(&self) -> Result { let rtxn = self.env.read_txn()?; @@ -287,7 +269,7 @@ impl KeyIdActionCodec { } } -impl<'a> milli::heed::BytesDecode<'a> for KeyIdActionCodec { +impl<'a> heed::BytesDecode<'a> for KeyIdActionCodec { type DItem = (KeyId, Action, Option<&'a [u8]>); fn bytes_decode(bytes: &'a [u8]) -> StdResult { @@ -304,7 +286,7 @@ impl<'a> milli::heed::BytesDecode<'a> for KeyIdActionCodec { } } -impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec { +impl<'a> heed::BytesEncode<'a> for KeyIdActionCodec { type EItem = (&'a KeyId, &'a Action, Option<&'a [u8]>); fn bytes_encode((key_id, action, index): &Self::EItem) -> StdResult, BoxedError> { diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index b071daac6..694757973 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -11,39 +11,42 @@ edition.workspace = true license.workspace = true [dependencies] -actix-web = { version = "4.8.0", default-features = false } -anyhow = "1.0.86" +actix-web = { version = "4.9.0", default-features = false } +anyhow = "1.0.95" bumpalo = "3.16.0" +bumparaw-collections = "0.1.4" convert_case = "0.6.0" -csv = "1.3.0" -deserr = { version = "0.6.2", features = ["actix-web"] } +csv = "1.3.1" +deserr = { version = "0.6.3", features = ["actix-web"] } either = { version = "1.13.0", features = ["serde"] } enum-iterator = "2.1.0" file-store = { path = "../file-store" } -flate2 = "1.0.30" +flate2 = "1.0.35" fst = "0.4.7" -memmap2 = "0.9.4" +memmap2 = "0.9.5" milli = { path = "../milli" } -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } -roaring = { version = "0.10.6", features = ["serde"] } -serde = { version = "1.0.204", features = ["derive"] } +roaring = { version = "0.10.10", features = ["serde"] } +rustc-hash = "2.1.0" +serde = { version = "1.0.217", features = ["derive"] } serde-cs = "0.2.4" -serde_json = "1.0.120" -tar = "0.4.41" -tempfile = "3.10.1" -thiserror = "1.0.61" -time = { version = "0.3.36", features = [ +serde_json = { version = "1.0.135", features = ["preserve_order"] } +tar = "0.4.43" +tempfile = "3.15.0" +thiserror = "2.0.9" +time = { version = "0.3.37", features = [ "serde-well-known", "formatting", "parsing", "macros", ] } -tokio = "1.38" -uuid = { version = "1.10.0", features = ["serde", "v4"] } +tokio = "1.42" +utoipa = { version = "5.3.1", features = ["macros"] } +uuid = { version = "1.11.0", features = ["serde", "v4"] } bitflags = "2.6.0" [dev-dependencies] -insta = "1.39.0" +# fixed 
version due to format breakages in v1.40 +insta = "=1.39.0" meili-snap = { path = "../meili-snap" } [features] diff --git a/crates/meilisearch-types/src/batch_view.rs b/crates/meilisearch-types/src/batch_view.rs index 5d800d897..112abd1dd 100644 --- a/crates/meilisearch-types/src/batch_view.rs +++ b/crates/meilisearch-types/src/batch_view.rs @@ -1,16 +1,18 @@ +use milli::progress::ProgressView; use serde::Serialize; use time::{Duration, OffsetDateTime}; +use utoipa::ToSchema; -use crate::{ - batches::{Batch, BatchId, BatchStats}, - task_view::DetailsView, - tasks::serialize_duration, -}; +use crate::batches::{Batch, BatchId, BatchStats}; +use crate::task_view::DetailsView; +use crate::tasks::serialize_duration; -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct BatchView { pub uid: BatchId, + pub progress: Option, pub details: DetailsView, pub stats: BatchStats, #[serde(serialize_with = "serialize_duration", default)] @@ -25,6 +27,7 @@ impl BatchView { pub fn from_batch(batch: &Batch) -> Self { Self { uid: batch.uid, + progress: batch.progress.clone(), details: batch.details.clone(), stats: batch.stats.clone(), duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at), diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index a60386e52..904682585 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -1,12 +1,12 @@ use std::collections::BTreeMap; +use milli::progress::ProgressView; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; +use utoipa::ToSchema; -use crate::{ - task_view::DetailsView, - tasks::{Kind, Status}, -}; +use crate::task_view::DetailsView; +use crate::tasks::{Kind, Status}; pub type BatchId = u32; @@ -15,6 +15,8 @@ pub type BatchId = u32; pub struct Batch { pub uid: BatchId, + #[serde(skip)] + pub progress: Option, pub details: DetailsView, pub stats: BatchStats, @@ -22,13 +24,44 @@ pub struct Batch { pub started_at: OffsetDateTime, #[serde(with = "time::serde::rfc3339::option")] pub finished_at: Option, + + // Enqueued at is never displayed and is only required when removing a batch. 
+ // It's always some except when upgrading from a database pre v1.12 + pub enqueued_at: Option, } -#[derive(Default, Debug, Clone, Serialize, Deserialize)] +impl PartialEq for Batch { + fn eq(&self, other: &Self) -> bool { + let Self { uid, progress, details, stats, started_at, finished_at, enqueued_at } = self; + + *uid == other.uid + && progress.is_none() == other.progress.is_none() + && details == &other.details + && stats == &other.stats + && started_at == &other.started_at + && finished_at == &other.finished_at + && enqueued_at == &other.enqueued_at + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct BatchEnqueuedAt { + #[serde(with = "time::serde::rfc3339")] + pub earliest: OffsetDateTime, + #[serde(with = "time::serde::rfc3339")] + pub oldest: OffsetDateTime, +} + +#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct BatchStats { pub total_nb_tasks: BatchId, pub status: BTreeMap, pub types: BTreeMap, pub index_uids: BTreeMap, + #[serde(default, skip_serializing_if = "serde_json::Map::is_empty")] + pub progress_trace: serde_json::Map, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub write_channel_congestion: Option>, } diff --git a/crates/meilisearch-types/src/deserr/mod.rs b/crates/meilisearch-types/src/deserr/mod.rs index 3c5e0fcf8..f5ad18d5c 100644 --- a/crates/meilisearch-types/src/deserr/mod.rs +++ b/crates/meilisearch-types/src/deserr/mod.rs @@ -193,6 +193,8 @@ merge_with_error_impl_take_error_message!(ParseTaskKindError); merge_with_error_impl_take_error_message!(ParseTaskStatusError); merge_with_error_impl_take_error_message!(IndexUidFormatError); merge_with_error_impl_take_error_message!(InvalidMultiSearchWeight); +merge_with_error_impl_take_error_message!(InvalidNetworkUrl); +merge_with_error_impl_take_error_message!(InvalidNetworkSearchApiKey); merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio); merge_with_error_impl_take_error_message!(InvalidSearchRankingScoreThreshold); merge_with_error_impl_take_error_message!(InvalidSimilarRankingScoreThreshold); diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 096349448..70a0e6204 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -4,10 +4,11 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use bumpalo::Bump; +use bumparaw_collections::RawMap; use memmap2::Mmap; use milli::documents::Error; use milli::Object; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde::de::{SeqAccess, Visitor}; use serde::{Deserialize, Deserializer}; use serde_json::error::Category; @@ -214,13 +215,13 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? 
}; - let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB + let mut doc_alloc = Bump::with_capacity(1024 * 1024); // 1MiB let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); let res = array_each(&mut deserializer, |obj: &RawValue| { doc_alloc.reset(); - let map = RawMap::from_raw_value(obj, &doc_alloc)?; + let map = RawMap::from_raw_value_and_hasher(obj, FxBuildHasher, &doc_alloc)?; to_writer(&mut out, &map) }); let count = match res { @@ -250,26 +251,25 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { } } -/// Reads NDJSON from file and write it in NDJSON in a file checking it along the way. -pub fn read_ndjson(input: &File, output: impl io::Write) -> Result { +/// Reads NDJSON from file and checks it. +pub fn read_ndjson(input: &File) -> Result { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; - let mut output = BufWriter::new(output); - let mut bump = Bump::with_capacity(1024 * 1024); let mut count = 0; for result in serde_json::Deserializer::from_slice(&input).into_iter() { bump.reset(); - count += 1; - result - .and_then(|raw: &RawValue| { + match result { + Ok(raw) => { // try to deserialize as a map - let map = RawMap::from_raw_value(raw, &bump)?; - to_writer(&mut output, &map) - }) - .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, &bump) + .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?; + count += 1; + } + Err(e) => return Err(DocumentFormatError::from((PayloadType::Ndjson, e))), + } } Ok(count) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 00f88b7b4..859563d8a 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -7,17 +7,25 @@ use aweb::rt::task::JoinError; use convert_case::Casing; use milli::heed::{Error as HeedError, MdbError}; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct ResponseError { #[serde(skip)] pub code: StatusCode, + /// The error message. pub message: String, + /// The error code. + #[schema(value_type = Code)] #[serde(rename = "code")] error_code: String, + /// The error type. + #[schema(value_type = ErrorType)] #[serde(rename = "type")] error_type: String, + /// A link to the documentation about this specific error. #[serde(rename = "link")] error_link: String, } @@ -97,7 +105,9 @@ pub trait ErrorCode { } #[allow(clippy::enum_variant_names)] -enum ErrorType { +#[derive(ToSchema)] +#[schema(rename_all = "snake_case")] +pub enum ErrorType { Internal, InvalidRequest, Auth, @@ -129,7 +139,8 @@ impl fmt::Display for ErrorType { /// `MyErrorCode::default().error_code()`. macro_rules! 
make_error_codes { ($($code_ident:ident, $err_type:ident, $status:ident);*) => { - #[derive(Debug, Clone, Copy, PartialEq, Eq)] + #[derive(Debug, Clone, Copy, PartialEq, Eq, ToSchema)] + #[schema(rename_all = "snake_case")] pub enum Code { $($code_ident),* } @@ -230,10 +241,12 @@ InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ; InvalidVectorsType , InvalidRequest , BAD_REQUEST ; InvalidDocumentId , InvalidRequest , BAD_REQUEST ; +InvalidDocumentIds , InvalidRequest , BAD_REQUEST ; InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ; InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ; -InvalidEmbedder , InvalidRequest , BAD_REQUEST ; -InvalidHybridQuery , InvalidRequest , BAD_REQUEST ; +InvalidSearchEmbedder , InvalidRequest , BAD_REQUEST ; +InvalidSimilarEmbedder , InvalidRequest , BAD_REQUEST ; +InvalidSearchHybridQuery , InvalidRequest , BAD_REQUEST ; InvalidIndexLimit , InvalidRequest , BAD_REQUEST ; InvalidIndexOffset , InvalidRequest , BAD_REQUEST ; InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ; @@ -248,7 +261,13 @@ InvalidMultiSearchMergeFacets , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryFacets , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryPagination , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchQueryRankingRules , InvalidRequest , BAD_REQUEST ; +InvalidMultiSearchQueryPosition , InvalidRequest , BAD_REQUEST ; +InvalidMultiSearchRemote , InvalidRequest , BAD_REQUEST ; InvalidMultiSearchWeight , InvalidRequest , BAD_REQUEST ; +InvalidNetworkRemotes , InvalidRequest , BAD_REQUEST ; +InvalidNetworkSelf , InvalidRequest , BAD_REQUEST ; +InvalidNetworkSearchApiKey , InvalidRequest , BAD_REQUEST ; +InvalidNetworkUrl , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; @@ -263,6 +282,7 @@ InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; InvalidSearchSemanticRatio , InvalidRequest , BAD_REQUEST ; InvalidSearchLocales , InvalidRequest , BAD_REQUEST ; +InvalidFacetSearchExhaustiveFacetCount, InvalidRequest , BAD_REQUEST ; InvalidFacetSearchFacetName , InvalidRequest , BAD_REQUEST ; InvalidSimilarId , InvalidRequest , BAD_REQUEST ; InvalidSearchFilter , InvalidRequest , BAD_REQUEST ; @@ -279,6 +299,7 @@ InvalidSearchPage , InvalidRequest , BAD_REQUEST ; InvalidSearchQ , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchQuery , InvalidRequest , BAD_REQUEST ; InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; +FacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; @@ -290,6 +311,8 @@ InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; +InvalidSettingsFacetSearch , InvalidRequest , BAD_REQUEST ; +InvalidSettingsPrefixSearch , InvalidRequest , BAD_REQUEST ; InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ; InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ; @@ -336,14 +359,22 @@ MissingDocumentId , 
InvalidRequest , BAD_REQUEST ; MissingFacetSearchFacetName , InvalidRequest , BAD_REQUEST ; MissingIndexUid , InvalidRequest , BAD_REQUEST ; MissingMasterKey , Auth , UNAUTHORIZED ; +MissingNetworkUrl , InvalidRequest , BAD_REQUEST ; MissingPayload , InvalidRequest , BAD_REQUEST ; MissingSearchHybrid , InvalidRequest , BAD_REQUEST ; MissingSwapIndexes , InvalidRequest , BAD_REQUEST ; MissingTaskFilters , InvalidRequest , BAD_REQUEST ; NoSpaceLeftOnDevice , System , UNPROCESSABLE_ENTITY; PayloadTooLarge , InvalidRequest , PAYLOAD_TOO_LARGE ; +RemoteBadResponse , System , BAD_GATEWAY ; +RemoteBadRequest , InvalidRequest , BAD_REQUEST ; +RemoteCouldNotSendRequest , System , BAD_GATEWAY ; +RemoteInvalidApiKey , Auth , FORBIDDEN ; +RemoteRemoteError , System , BAD_GATEWAY ; +RemoteTimeout , System , BAD_GATEWAY ; TooManySearchRequests , System , SERVICE_UNAVAILABLE ; TaskNotFound , InvalidRequest , NOT_FOUND ; +TaskFileNotFound , InvalidRequest , NOT_FOUND ; BatchNotFound , InvalidRequest , NOT_FOUND ; TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ; TooManyVectors , InvalidRequest , BAD_REQUEST ; @@ -376,7 +407,7 @@ impl ErrorCode for milli::Error { match error { // TODO: wait for spec for new error codes. UserError::SerdeJson(_) - | UserError::InvalidLmdbOpenOptions + | UserError::EnvAlreadyOpened | UserError::DocumentLimitReached | UserError::UnknownInternalDocumentId { .. } => Code::Internal, UserError::InvalidStoreFile => Code::InvalidStoreFile, @@ -385,6 +416,7 @@ impl ErrorCode for milli::Error { UserError::AttributeLimitReached => Code::MaxFieldsLimitExceeded, UserError::InvalidFilter(_) => Code::InvalidSearchFilter, UserError::InvalidFilterExpression(..) => Code::InvalidSearchFilter, + UserError::FilterOperatorNotAllowed { .. } => Code::InvalidSearchFilter, UserError::MissingDocumentId { .. } => Code::MissingDocumentId, UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => { Code::InvalidDocumentId @@ -399,9 +431,10 @@ impl ErrorCode for milli::Error { | UserError::InvalidUrl { .. } | UserError::InvalidSettingsDocumentTemplateMaxBytes { .. } | UserError::InvalidPrompt(_) - | UserError::InvalidDisableBinaryQuantization { .. } => { - Code::InvalidSettingsEmbedders - } + | UserError::InvalidDisableBinaryQuantization { .. } + | UserError::InvalidSourceForNested { .. } + | UserError::MissingSourceForNested { .. } + | UserError::InvalidSettingsEmbedder { .. } => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, @@ -429,7 +462,8 @@ impl ErrorCode for milli::Error { UserError::InvalidMinTypoWordLenSetting(_, _) => { Code::InvalidSettingsTypoTolerance } - UserError::InvalidEmbedder(_) => Code::InvalidEmbedder, + UserError::InvalidSearchEmbedder(_) => Code::InvalidSearchEmbedder, + UserError::InvalidSimilarEmbedder(_) => Code::InvalidSimilarEmbedder, UserError::VectorEmbeddingError(_) | UserError::DocumentEmbeddingError(_) => { Code::VectorEmbeddingError } @@ -470,8 +504,7 @@ impl ErrorCode for HeedError { HeedError::Mdb(_) | HeedError::Encoding(_) | HeedError::Decoding(_) - | HeedError::DatabaseClosing - | HeedError::BadOpenOptions { .. } => Code::Internal, + | HeedError::EnvAlreadyOpened => Code::Internal, } } } @@ -547,7 +580,7 @@ impl fmt::Display for deserr_codes::InvalidSimilarId { "the value of `id` is invalid. 
\ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ - and can not be more than 512 bytes." + and can not be more than 511 bytes." ) } } @@ -567,6 +600,18 @@ impl fmt::Display for deserr_codes::InvalidSimilarRankingScoreThreshold { } } +impl fmt::Display for deserr_codes::InvalidNetworkUrl { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "the value of `url` is invalid, expected a string.") + } +} + +impl fmt::Display for deserr_codes::InvalidNetworkSearchApiKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "the value of `searchApiKey` is invalid, expected a string.") + } +} + #[macro_export] macro_rules! internal_error { ($target:ty : $($other:path), *) => { diff --git a/crates/meilisearch-types/src/facet_values_sort.rs b/crates/meilisearch-types/src/facet_values_sort.rs index 278061f19..8e0dd2ca4 100644 --- a/crates/meilisearch-types/src/facet_values_sort.rs +++ b/crates/meilisearch-types/src/facet_values_sort.rs @@ -1,8 +1,9 @@ use deserr::Deserr; use milli::OrderBy; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; -#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr)] +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Deserr, ToSchema)] #[serde(rename_all = "camelCase")] #[deserr(rename_all = camelCase)] pub enum FacetValuesSort { diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index b9805a124..5db8775b6 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -1,13 +1,17 @@ +use std::collections::BTreeMap; + use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq)] #[serde(rename_all = "camelCase", default)] pub struct RuntimeTogglableFeatures { - pub vector_store: bool, pub metrics: bool, pub logs_route: bool, pub edit_documents_by_function: bool, pub contains_filter: bool, + pub network: bool, + pub get_task_documents_route: bool, + pub composite_embedders: bool, } #[derive(Default, Debug, Clone, Copy)] @@ -16,3 +20,20 @@ pub struct InstanceTogglableFeatures { pub logs_route: bool, pub contains_filter: bool, } + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Remote { + pub url: String, + #[serde(default)] + pub search_api_key: Option, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub struct Network { + #[serde(default, rename = "self")] + pub local: Option, + #[serde(default)] + pub remotes: BTreeMap, +} diff --git a/crates/meilisearch-types/src/index_uid.rs b/crates/meilisearch-types/src/index_uid.rs index 03a31a82f..87efd261c 100644 --- a/crates/meilisearch-types/src/index_uid.rs +++ b/crates/meilisearch-types/src/index_uid.rs @@ -4,13 +4,16 @@ use std::fmt; use std::str::FromStr; use deserr::Deserr; +use serde::Serialize; +use utoipa::ToSchema; use crate::error::{Code, ErrorCode}; /// An index uid is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// bytes long -#[derive(Debug, Clone, PartialEq, Eq, Deserr, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, Deserr, PartialOrd, Ord, Serialize, ToSchema)] #[deserr(try_from(String) = IndexUid::try_from -> IndexUidFormatError)] +#[schema(value_type = String, example = "movies")] pub struct 
IndexUid(String); impl IndexUid { diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index 436854d7d..681320525 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -10,6 +10,7 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::format_description::well_known::Rfc3339; use time::macros::{format_description, time}; use time::{Date, OffsetDateTime, PrimitiveDateTime}; +use utoipa::ToSchema; use uuid::Uuid; use crate::deserr::{immutable_field_error, DeserrError, DeserrJsonError}; @@ -33,19 +34,31 @@ impl MergeWithError for Dese } } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] pub struct CreateApiKey { + /// A description for the key. `null` if empty. + #[schema(example = json!(null))] #[deserr(default, error = DeserrJsonError)] pub description: Option, + /// A human-readable name for the key. `null` if empty. + #[schema(example = "Indexing Products API key")] #[deserr(default, error = DeserrJsonError)] pub name: Option, + /// A uuid v4 to identify the API Key. If not specified, it's generated by Meilisearch. + #[schema(value_type = Uuid, example = json!(null))] #[deserr(default = Uuid::new_v4(), error = DeserrJsonError, try_from(&String) = Uuid::from_str -> uuid::Error)] pub uid: KeyId, + /// A list of actions permitted for the key. `["*"]` for all actions. The `*` character can be used as a wildcard when located at the last position. e.g. `documents.*` to authorize access on all documents endpoints. + #[schema(example = json!(["documents.add"]))] #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_api_key_actions)] pub actions: Vec, + /// A list of accesible indexes permitted for the key. `["*"]` for all indexes. The `*` character can be used as a wildcard when located at the last position. e.g. `products_*` to allow access to all indexes whose names start with `products_`. #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_api_key_indexes)] + #[schema(value_type = Vec, example = json!(["products"]))] pub indexes: Vec, + /// Represent the expiration date and time as RFC 3339 format. `null` equals to no expiration time. #[deserr(error = DeserrJsonError, try_from(Option) = parse_expiration_date -> ParseOffsetDateTimeError, missing_field_error = DeserrJsonError::missing_api_key_expires_at)] pub expires_at: Option, } @@ -87,12 +100,15 @@ fn deny_immutable_fields_api_key( } } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields = deny_immutable_fields_api_key)] +#[schema(rename_all = "camelCase")] pub struct PatchApiKey { #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = "This key is used to update documents in the products index")] pub description: Setting, #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = "Indexing Products API key")] pub name: Setting, } @@ -180,11 +196,13 @@ fn parse_expiration_date( } } +#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord, ToSchema)] +#[repr(transparent)] +pub struct Action(u32); + bitflags! 
{ - #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)] - #[repr(transparent)] // NOTE: For `Sequence` impl to work, the values of these must be in ascending order - pub struct Action: u32 { + impl Action: u32 { const Search = 1; // Documents const DocumentsAdd = 1 << 1; @@ -225,15 +243,19 @@ bitflags! { const KeysGet = 1 << 20; const KeysUpdate = 1 << 21; const KeysDelete = 1 << 22; + // Experimental Features const ExperimentalFeaturesGet = 1 << 23; const ExperimentalFeaturesUpdate = 1 << 24; + // Network + const NetworkGet = 1 << 25; + const NetworkUpdate = 1 << 26; // All - const All = 0xFFFFFFFF >> (32 - 1 - 24); + const All = 0xFFFFFFFF >> (32 - 1 - 26); } } impl Action { - const SERDE_MAP_ARR: [(&'static str, Self); 34] = [ + const SERDE_MAP_ARR: [(&'static str, Self); 36] = [ ("search", Self::Search), ("documents.add", Self::DocumentsAdd), ("documents.get", Self::DocumentsGet), @@ -267,6 +289,8 @@ impl Action { ("keys.delete", Self::KeysDelete), ("experimental.get", Self::ExperimentalFeaturesGet), ("experimental.update", Self::ExperimentalFeaturesUpdate), + ("network.get", Self::NetworkGet), + ("network.update", Self::NetworkUpdate), ("*", Self::All), ]; @@ -335,6 +359,8 @@ pub mod actions { pub const KEYS_DELETE: u32 = A::KeysDelete.bits(); pub const EXPERIMENTAL_FEATURES_GET: u32 = A::ExperimentalFeaturesGet.bits(); pub const EXPERIMENTAL_FEATURES_UPDATE: u32 = A::ExperimentalFeaturesUpdate.bits(); + pub const NETWORK_GET: u32 = A::NetworkGet.bits(); + pub const NETWORK_UPDATE: u32 = A::NetworkUpdate.bits(); pub const ALL: u32 = A::All.bits(); } @@ -382,7 +408,7 @@ impl<'de> Deserialize<'de> for Action { D: Deserializer<'de>, { struct Visitor; - impl<'de> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = Action; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { diff --git a/crates/meilisearch-types/src/locales.rs b/crates/meilisearch-types/src/locales.rs index 8d746779e..b3fb90493 100644 --- a/crates/meilisearch-types/src/locales.rs +++ b/crates/meilisearch-types/src/locales.rs @@ -1,12 +1,13 @@ use deserr::Deserr; -use milli::LocalizedAttributesRule; +use milli::{AttributePatterns, LocalizedAttributesRule}; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; -#[derive(Debug, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize, ToSchema)] #[deserr(rename_all = camelCase)] #[serde(rename_all = "camelCase")] pub struct LocalizedAttributesRuleView { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, pub locales: Vec, } @@ -33,7 +34,7 @@ impl From for LocalizedAttributesRule { /// this enum implements `Deserr` in order to be used in the API. macro_rules! 
make_locale { ($(($iso_639_1:ident, $iso_639_1_str:expr) => ($iso_639_3:ident, $iso_639_3_str:expr),)+) => { - #[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize, Ord, PartialOrd)] + #[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize, Ord, PartialOrd, ToSchema)] #[deserr(rename_all = camelCase)] #[serde(rename_all = "camelCase")] pub enum Locale { diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index e3803fa28..6ace0f4ee 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -8,11 +8,12 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; -use milli::index::IndexEmbeddingConfig; +use milli::index::{IndexEmbeddingConfig, PrefixSearch}; use milli::proximity::ProximityPrecision; use milli::update::Setting; -use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; +use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; +use utoipa::ToSchema; use crate::deserr::DeserrJsonError; use crate::error::deserr_codes::*; @@ -39,10 +40,10 @@ where .serialize(s) } -#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq)] +#[derive(Clone, Default, Debug, Serialize, PartialEq, Eq, ToSchema)] pub struct Checked; -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, ToSchema)] pub struct Unchecked; impl Deserr for Unchecked @@ -69,54 +70,63 @@ fn validate_min_word_size_for_typo_setting( Ok(s) } -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(deny_unknown_fields, rename_all = camelCase, validate = validate_min_word_size_for_typo_setting -> DeserrJsonError)] pub struct MinWordSizeTyposSetting { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option, example = json!(5))] pub one_typo: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option, example = json!(9))] pub two_typos: Setting, } -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(deny_unknown_fields, rename_all = camelCase, where_predicate = __Deserr_E: deserr::MergeWithError>)] pub struct TypoSettings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option, example = json!(true))] pub enabled: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!({ "oneTypo": 5, "twoTypo": 9 }))] pub min_word_size_for_typos: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option>, example = json!(["iPhone", "phone"]))] pub disable_on_words: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option>, example = json!(["uuid", "url"]))] pub disable_on_attributes: 
Setting>, } -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct FacetingSettings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option, example = json!(10))] pub max_values_per_facet: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option>, example = json!({ "genre": FacetValuesSort::Count }))] pub sort_facet_values_by: Setting>, } -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct PaginationSettings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option, example = json!(250))] pub max_total_hits: Setting, } @@ -134,74 +144,150 @@ impl MergeWithError for DeserrJsonError)] + pub inner: Setting, +} + +impl fmt::Debug for SettingEmbeddingSettings { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) + } +} + +impl Deserr for SettingEmbeddingSettings { + fn deserialize_from_value( + value: deserr::Value, + location: ValuePointerRef, + ) -> Result { + Setting::::deserialize_from_value( + value, location, + ) + .map(|inner| Self { inner }) + } +} + /// Holds all the settings for an index. `T` can either be `Checked` if they represents settings /// whose validity is guaranteed, or `Unchecked` if they need to be validated. In the later case, a /// call to `check` will return a `Settings` from a `Settings`. -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde( deny_unknown_fields, rename_all = "camelCase", bound(serialize = "T: Serialize", deserialize = "T: Deserialize<'static>") )] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] pub struct Settings { + /// Fields displayed in the returned documents. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["id", "title", "description", "url"]))] pub displayed_attributes: WildcardSetting, - + /// Fields in which to search for matching query words sorted by order of importance. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["title", "description"]))] pub searchable_attributes: WildcardSetting, - + /// Attributes to use for faceting and filtering. See [Filtering and Faceted Search](https://www.meilisearch.com/docs/learn/filtering_and_sorting/search_with_facet_filters). #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - pub filterable_attributes: Setting>, + #[schema(value_type = Option>, example = json!(["release_date", "genre"]))] + pub filterable_attributes: Setting>, + /// Attributes to use when sorting search results. 
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["release_date"]))] pub sortable_attributes: Setting>, + /// List of ranking rules sorted by order of importance. The order is customizable. + /// [A list of ordered built-in ranking rules](https://www.meilisearch.com/docs/learn/relevancy/relevancy). #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!([RankingRuleView::Words, RankingRuleView::Typo, RankingRuleView::Proximity, RankingRuleView::Attribute, RankingRuleView::Exactness]))] pub ranking_rules: Setting>, + /// List of words ignored when present in search queries. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["the", "a", "them", "their"]))] pub stop_words: Setting>, + /// List of characters not delimiting where one term begins and ends. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!([" ", "\n"]))] pub non_separator_tokens: Setting>, + /// List of characters delimiting where one term begins and ends. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["S"]))] pub separator_tokens: Setting>, + /// List of strings Meilisearch should parse as a single term. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(["iPhone pro"]))] pub dictionary: Setting>, + /// List of associated words treated similarly. A word associated to an array of word as synonyms. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>>, example = json!({ "he": ["she", "they", "them"], "phone": ["iPhone", "android"]}))] pub synonyms: Setting>>, + /// Search returns documents with distinct (different) values of the given field. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!("sku"))] pub distinct_attribute: Setting, + /// Precision level when calculating the proximity ranking rule. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!(ProximityPrecisionView::ByAttribute))] pub proximity_precision: Setting, + /// Customize typo tolerance feature. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!({ "enabled": true, "disableOnAttributes": ["title"]}))] pub typo_tolerance: Setting, + /// Faceting settings. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!({ "maxValuesPerFacet": 10, "sortFacetValuesBy": { "genre": FacetValuesSort::Count }}))] pub faceting: Setting, + /// Pagination settings. 
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!({ "maxValuesPerFacet": 10, "sortFacetValuesBy": { "genre": FacetValuesSort::Count }}))] pub pagination: Setting, + /// Embedder required for performing semantic search queries. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] - pub embedders: Setting>>, + #[schema(value_type = Option>)] + pub embedders: Setting>, + /// Maximum duration of a search query. #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!(50))] pub search_cutoff_ms: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option>, example = json!(50))] pub localized_attributes: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!(true))] + pub facet_search: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!("Hemlo"))] + pub prefix_search: Setting, #[serde(skip)] #[deserr(skip)] @@ -215,7 +301,7 @@ impl Settings { }; for mut embedder in embedders.values_mut() { - let Setting::Set(embedder) = &mut embedder else { + let SettingEmbeddingSettings { inner: Setting::Set(embedder) } = &mut embedder else { continue; }; @@ -266,6 +352,8 @@ impl Settings { embedders: Setting::Reset, search_cutoff_ms: Setting::Reset, localized_attributes: Setting::Reset, + facet_search: Setting::Reset, + prefix_search: Setting::Reset, _kind: PhantomData, } } @@ -290,6 +378,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = self; @@ -312,6 +402,8 @@ impl Settings { embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind: PhantomData, } } @@ -360,6 +452,8 @@ impl Settings { embedders: self.embedders, search_cutoff_ms: self.search_cutoff_ms, localized_attributes: self.localized_attributes, + facet_search: self.facet_search, + prefix_search: self.prefix_search, _kind: PhantomData, } } @@ -372,8 +466,9 @@ impl Settings { let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; for (name, config) in configs.iter_mut() { let config_to_check = std::mem::take(config); - let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?; - *config = checked_config + let checked_config = + milli::update::validate_embedding_settings(config_to_check.inner, name)?; + *config = SettingEmbeddingSettings { inner: checked_config }; } self.embedders = Setting::Set(configs); Ok(self) @@ -433,6 +528,8 @@ impl Settings { Setting::Set(this) } }, + prefix_search: other.prefix_search.or(self.prefix_search), + facet_search: other.facet_search.or(self.facet_search), _kind: PhantomData, } } @@ -469,6 +566,8 @@ pub fn apply_settings_to_builder( embedders, search_cutoff_ms, localized_attributes: localized_attributes_rules, + facet_search, + prefix_search, _kind, } = settings; @@ -647,7 +746,9 @@ pub fn apply_settings_to_builder( } match embedders { - Setting::Set(value) => builder.set_embedder_settings(value.clone()), + Setting::Set(value) => builder.set_embedder_settings( 
+ value.iter().map(|(k, v)| (k.clone(), v.inner.clone())).collect(), + ), Setting::Reset => builder.reset_embedder_settings(), Setting::NotSet => (), } @@ -657,6 +758,20 @@ pub fn apply_settings_to_builder( Setting::Reset => builder.reset_search_cutoff(), Setting::NotSet => (), } + + match prefix_search { + Setting::Set(prefix_search) => { + builder.set_prefix_search(PrefixSearch::from(*prefix_search)) + } + Setting::Reset => builder.reset_prefix_search(), + Setting::NotSet => (), + } + + match facet_search { + Setting::Set(facet_search) => builder.set_facet_search(*facet_search), + Setting::Reset => builder.reset_facet_search(), + Setting::NotSet => (), + } } pub enum SecretPolicy { @@ -676,7 +791,7 @@ pub fn settings( .user_defined_searchable_fields(rtxn)? .map(|fields| fields.into_iter().map(String::from).collect()); - let filterable_attributes = index.filterable_fields(rtxn)?.into_iter().collect(); + let filterable_attributes = index.filterable_attributes_rules(rtxn)?.into_iter().collect(); let sortable_attributes = index.sortable_fields(rtxn)?.into_iter().collect(); @@ -747,14 +862,20 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. }| { + (name, SettingEmbeddingSettings { inner: Setting::Set(config.into()) }) + }) .collect(); - let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; + let embedders = Setting::Set(embedders); let search_cutoff_ms = index.search_cutoff(rtxn)?; let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; + let prefix_search = index.prefix_search(rtxn)?.map(PrefixSearchSettings::from); + + let facet_search = index.facet_search(rtxn)?; + let mut settings = Settings { displayed_attributes: match displayed_attributes { Some(attrs) => Setting::Set(attrs), @@ -791,17 +912,18 @@ pub fn settings( Some(rules) => Setting::Set(rules.into_iter().map(|r| r.into()).collect()), None => Setting::Reset, }, + prefix_search: Setting::Set(prefix_search.unwrap_or_default()), + facet_search: Setting::Set(facet_search), _kind: PhantomData, }; if let SecretPolicy::HideSecrets = secret_policy { settings.hide_secrets() } - Ok(settings) } -#[derive(Debug, Clone, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, PartialEq, Eq, Deserr, ToSchema)] #[deserr(try_from(&String) = FromStr::from_str -> CriterionError)] pub enum RankingRuleView { /// Sorted by decreasing number of matched query terms. 
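Note on the settings hunks above: the new `prefixSearch` and `facetSearch` fields are merged with `other.prefix_search.or(self.prefix_search)` and `other.facet_search.or(self.facet_search)`, exactly like the existing fields. The toy below illustrates that merge rule in isolation; the `Setting` enum is a simplified stand-in for the real milli type, and the `or` semantics are inferred from how the patch uses it, not copied from the source.

```rust
// Toy model of the merge rule used above: a value from the incoming payload
// wins whenever it is Set or Reset; only NotSet falls back to the stored value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

impl<T> Setting<T> {
    fn or(self, previous: Self) -> Self {
        match self {
            Setting::NotSet => previous,
            set_or_reset => set_or_reset,
        }
    }
}

fn main() {
    // The new payload does not mention prefixSearch, so the stored value survives.
    let stored = Setting::Set("indexingTime");
    let incoming = Setting::NotSet;
    assert_eq!(incoming.or(stored), Setting::Set("indexingTime"));

    // An explicit Reset in the payload overrides whatever was stored.
    let incoming = Setting::Reset;
    assert_eq!(incoming.or(stored), Setting::<&str>::Reset);
}
```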
@@ -838,7 +960,7 @@ impl<'de> Deserialize<'de> for RankingRuleView { D: serde::Deserializer<'de>, { struct Visitor; - impl<'de> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = RankingRuleView; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { write!(formatter, "the name of a valid ranking rule (string)") @@ -897,7 +1019,7 @@ impl From<RankingRuleView> for Criterion { } } -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)] +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub enum ProximityPrecisionView { @@ -964,6 +1086,33 @@ impl std::ops::Deref for WildcardSetting { } } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +pub enum PrefixSearchSettings { + #[default] + IndexingTime, + Disabled, +} + +impl From<PrefixSearch> for PrefixSearchSettings { + fn from(value: PrefixSearch) -> Self { + match value { + PrefixSearch::IndexingTime => PrefixSearchSettings::IndexingTime, + PrefixSearch::Disabled => PrefixSearchSettings::Disabled, + } + } +} +impl From<PrefixSearchSettings> for PrefixSearch { + fn from(value: PrefixSearchSettings) -> Self { + match value { + PrefixSearchSettings::IndexingTime => PrefixSearch::IndexingTime, + PrefixSearchSettings::Disabled => PrefixSearch::Disabled, + } + } +} + #[cfg(test)] pub(crate) mod test { use super::*; @@ -990,6 +1139,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::<Unchecked>, }; @@ -1019,6 +1170,8 @@ pub(crate) mod test { embedders: Setting::NotSet, localized_attributes: Setting::NotSet, search_cutoff_ms: Setting::NotSet, + facet_search: Setting::NotSet, + prefix_search: Setting::NotSet, _kind: PhantomData::<Unchecked>, }; diff --git a/crates/meilisearch-types/src/star_or.rs b/crates/meilisearch-types/src/star_or.rs index cd26a1fb0..52804ccfa 100644 --- a/crates/meilisearch-types/src/star_or.rs +++ b/crates/meilisearch-types/src/star_or.rs @@ -6,6 +6,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, MergeWithError, ValueKind}; use serde::de::Visitor; use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use utoipa::PartialSchema; use crate::deserr::query_params::FromQueryParameter; @@ -65,7 +66,7 @@ where /// not supported on untagged enums. 
struct StarOrVisitor(PhantomData); - impl<'de, T, FE> Visitor<'de> for StarOrVisitor + impl Visitor<'_> for StarOrVisitor where T: FromStr, FE: fmt::Display, @@ -229,7 +230,7 @@ pub enum OptionStarOrList { List(Vec), } -impl OptionStarOrList { +impl OptionStarOrList { pub fn is_some(&self) -> bool { match self { Self::None => false, diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 64dbd58f7..7a6faee39 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,32 +1,49 @@ use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; +use utoipa::ToSchema; use crate::batches::BatchId; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; -#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct TaskView { + /// The unique sequential identifier of the task. + #[schema(value_type = u32, example = 4312)] pub uid: TaskId, + /// The unique identifier of the index where this task is operated. + #[schema(value_type = Option, example = json!("movies"))] pub batch_uid: Option, #[serde(default)] pub index_uid: Option, pub status: Status, + /// The type of the task. #[serde(rename = "type")] pub kind: Kind, + /// The uid of the task that performed the taskCancelation if the task has been canceled. + #[schema(value_type = Option, example = json!(4326))] pub canceled_by: Option, #[serde(skip_serializing_if = "Option::is_none")] pub details: Option, pub error: Option, + /// Total elapsed time the engine was in processing state, expressed as an `ISO-8601` duration format. + #[schema(value_type = Option, example = json!(null))] #[serde(serialize_with = "serialize_duration", default)] pub duration: Option, + /// An `RFC 3339` format for date/time/duration. + #[schema(value_type = String, example = json!("2024-08-08T14:12:09.393Z"))] #[serde(with = "time::serde::rfc3339")] pub enqueued_at: OffsetDateTime, + /// An `RFC 3339` format for date/time/duration. + #[schema(value_type = String, example = json!("2024-08-08T14:12:09.393Z"))] #[serde(with = "time::serde::rfc3339::option", default)] pub started_at: Option, + /// An `RFC 3339` format for date/time/duration. + #[schema(value_type = String, example = json!("2024-08-08T14:12:09.393Z"))] #[serde(with = "time::serde::rfc3339::option", default)] pub finished_at: Option, } @@ -50,40 +67,57 @@ impl TaskView { } } -#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] +#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct DetailsView { + /// Number of documents received for documentAdditionOrUpdate task. #[serde(skip_serializing_if = "Option::is_none")] pub received_documents: Option, + /// Number of documents finally indexed for documentAdditionOrUpdate task or a documentAdditionOrUpdate batch of tasks. #[serde(skip_serializing_if = "Option::is_none")] pub indexed_documents: Option>, + /// Number of documents edited for editDocumentByFunction task. #[serde(skip_serializing_if = "Option::is_none")] pub edited_documents: Option>, + /// Value for the primaryKey field encountered if any for indexCreation or indexUpdate task. 
#[serde(skip_serializing_if = "Option::is_none")] pub primary_key: Option>, + /// Number of provided document ids for the documentDeletion task. #[serde(skip_serializing_if = "Option::is_none")] pub provided_ids: Option, + /// Number of documents finally deleted for documentDeletion and indexDeletion tasks. #[serde(skip_serializing_if = "Option::is_none")] pub deleted_documents: Option>, + /// Number of tasks that match the request for taskCancelation or taskDeletion tasks. #[serde(skip_serializing_if = "Option::is_none")] pub matched_tasks: Option, + /// Number of tasks canceled for taskCancelation. #[serde(skip_serializing_if = "Option::is_none")] pub canceled_tasks: Option>, + /// Number of tasks deleted for taskDeletion. #[serde(skip_serializing_if = "Option::is_none")] pub deleted_tasks: Option>, + /// Original filter query for taskCancelation or taskDeletion tasks. #[serde(skip_serializing_if = "Option::is_none")] pub original_filter: Option>, + /// Identifier generated for the dump for dumpCreation task. #[serde(skip_serializing_if = "Option::is_none")] pub dump_uid: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub context: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub function: Option, + /// [Learn more about the settings in this guide](https://www.meilisearch.com/docs/reference/api/settings). #[serde(skip_serializing_if = "Option::is_none")] #[serde(flatten)] pub settings: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub swaps: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub upgrade_from: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub upgrade_to: Option, } impl DetailsView { @@ -204,6 +238,18 @@ impl DetailsView { Some(left) } }, + // We want the earliest version + upgrade_from: match (self.upgrade_from.clone(), other.upgrade_from.clone()) { + (None, None) => None, + (None, Some(from)) | (Some(from), None) => Some(from), + (Some(from), Some(_)) => Some(from), + }, + // And the latest + upgrade_to: match (self.upgrade_to.clone(), other.upgrade_to.clone()) { + (None, None) => None, + (None, Some(to)) | (Some(to), None) => Some(to), + (Some(_), Some(to)) => Some(to), + }, } } } @@ -281,6 +327,11 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } + Details::UpgradeDatabase { from, to } => DetailsView { + upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), + upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), + ..Default::default() + }, } } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index ebd28f526..6b237ee1f 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -4,19 +4,19 @@ use std::fmt::{Display, Write}; use std::str::FromStr; use enum_iterator::Sequence; -use milli::update::new::indexer::document_changes::Progress; use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; use time::{Duration, OffsetDateTime}; +use utoipa::ToSchema; use uuid::Uuid; use crate::batches::BatchId; use crate::error::ResponseError; use crate::keys::Key; use crate::settings::{Settings, Unchecked}; -use crate::InstanceUid; +use crate::{versioning, InstanceUid}; pub type TaskId = u32; @@ -41,62 +41,6 @@ pub struct Task { pub kind: KindWithContent, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "camelCase")] -pub struct TaskProgress { - pub current_step: &'static str, - pub finished_steps: u16, - pub total_steps: u16, - pub finished_substeps: Option, - pub total_substeps: Option, -} - -impl Default for TaskProgress { - fn default() -> Self { - Self::new() - } -} - -impl TaskProgress { - pub fn new() -> Self { - Self { - current_step: "start", - finished_steps: 0, - total_steps: 1, - finished_substeps: None, - total_substeps: None, - } - } - - pub fn update(&mut self, progress: Progress) -> TaskProgress { - if self.finished_steps > progress.finished_steps { - return *self; - } - - if self.current_step != progress.step_name { - self.current_step = progress.step_name - } - - self.total_steps = progress.total_steps; - - if self.finished_steps < progress.finished_steps { - self.finished_substeps = None; - self.total_substeps = None; - } - self.finished_steps = progress.finished_steps; - if let Some((finished_substeps, total_substeps)) = progress.finished_total_substep { - if let Some(task_finished_substeps) = self.finished_substeps { - if task_finished_substeps > finished_substeps { - return *self; - } - } - self.finished_substeps = Some(finished_substeps); - self.total_substeps = Some(total_substeps); - } - *self - } -} - impl Task { pub fn index_uid(&self) -> Option<&str> { use KindWithContent::*; @@ -106,6 +50,7 @@ impl Task { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | UpgradeDatabase { .. } | IndexSwap { .. } => None, DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } @@ -140,7 +85,8 @@ impl Task { | KindWithContent::TaskCancelation { .. } | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } - | KindWithContent::SnapshotCreation => None, + | KindWithContent::SnapshotCreation + | KindWithContent::UpgradeDatabase { .. 
} => None, } } } @@ -206,9 +152,12 @@ pub enum KindWithContent { instance_uid: Option, }, SnapshotCreation, + UpgradeDatabase { + from: (u32, u32, u32), + }, } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct IndexSwap { pub indexes: (String, String), @@ -231,6 +180,7 @@ impl KindWithContent { KindWithContent::TaskDeletion { .. } => Kind::TaskDeletion, KindWithContent::DumpCreation { .. } => Kind::DumpCreation, KindWithContent::SnapshotCreation => Kind::SnapshotCreation, + KindWithContent::UpgradeDatabase { .. } => Kind::UpgradeDatabase, } } @@ -241,7 +191,8 @@ impl KindWithContent { DumpCreation { .. } | SnapshotCreation | TaskCancelation { .. } - | TaskDeletion { .. } => vec![], + | TaskDeletion { .. } + | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } | DocumentDeletion { index_uid, .. } @@ -318,6 +269,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { + from: (from.0, from.1, from.2), + to: ( + versioning::VERSION_MAJOR.parse().unwrap(), + versioning::VERSION_MINOR.parse().unwrap(), + versioning::VERSION_PATCH.parse().unwrap(), + ), + }), } } @@ -376,6 +335,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { + from: *from, + to: ( + versioning::VERSION_MAJOR.parse().unwrap(), + versioning::VERSION_MINOR.parse().unwrap(), + versioning::VERSION_PATCH.parse().unwrap(), + ), + }), } } } @@ -416,13 +383,34 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { + from: *from, + to: ( + versioning::VERSION_MAJOR.parse().unwrap(), + versioning::VERSION_MINOR.parse().unwrap(), + versioning::VERSION_PATCH.parse().unwrap(), + ), + }), } } } +/// The status of a task. #[derive( - Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Sequence, PartialOrd, Ord, + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + Serialize, + Deserialize, + Sequence, + PartialOrd, + Ord, + ToSchema, )] +#[schema(example = json!(Status::Processing))] #[serde(rename_all = "camelCase")] pub enum Status { Enqueued, @@ -481,10 +469,23 @@ impl fmt::Display for ParseTaskStatusError { } impl std::error::Error for ParseTaskStatusError {} +/// The type of the task. #[derive( - Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Sequence, PartialOrd, Ord, + Debug, + Clone, + Copy, + PartialEq, + Eq, + Hash, + Serialize, + Deserialize, + Sequence, + PartialOrd, + Ord, + ToSchema, )] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase", example = json!(enum_iterator::all::().collect::>()))] pub enum Kind { DocumentAdditionOrUpdate, DocumentEdition, @@ -498,6 +499,7 @@ pub enum Kind { TaskDeletion, DumpCreation, SnapshotCreation, + UpgradeDatabase, } impl Kind { @@ -514,6 +516,7 @@ impl Kind { | Kind::TaskCancelation | Kind::TaskDeletion | Kind::DumpCreation + | Kind::UpgradeDatabase | Kind::SnapshotCreation => false, } } @@ -533,6 +536,7 @@ impl Display for Kind { Kind::TaskDeletion => write!(f, "taskDeletion"), Kind::DumpCreation => write!(f, "dumpCreation"), Kind::SnapshotCreation => write!(f, "snapshotCreation"), + Kind::UpgradeDatabase => write!(f, "upgradeDatabase"), } } } @@ -564,6 +568,8 @@ impl FromStr for Kind { Ok(Kind::DumpCreation) } else if kind.eq_ignore_ascii_case("snapshotCreation") { Ok(Kind::SnapshotCreation) + } else if kind.eq_ignore_ascii_case("upgradeDatabase") { + Ok(Kind::UpgradeDatabase) } else { Err(ParseTaskKindError(kind.to_owned())) } @@ -637,6 +643,10 @@ pub enum Details { IndexSwap { swaps: Vec, }, + UpgradeDatabase { + from: (u32, u32, u32), + to: (u32, u32, u32), + }, } impl Details { @@ -657,6 +667,7 @@ impl Details { Self::SettingsUpdate { .. } | Self::IndexInfo { .. } | Self::Dump { .. } + | Self::UpgradeDatabase { .. } | Self::IndexSwap { .. 
} => (), } @@ -717,7 +728,9 @@ pub fn serialize_duration( #[cfg(test)] mod tests { - use super::Details; + use std::str::FromStr; + + use super::{Details, Kind}; use crate::heed::types::SerdeJson; use crate::heed::{BytesDecode, BytesEncode}; @@ -733,4 +746,13 @@ mod tests { meili_snap::snapshot!(format!("{:?}", details), @r###"TaskDeletion { matched_tasks: 1, deleted_tasks: None, original_filter: "hello" }"###); meili_snap::snapshot!(format!("{:?}", deserialised), @r###"TaskDeletion { matched_tasks: 1, deleted_tasks: None, original_filter: "hello" }"###); } + + #[test] + fn all_kind_can_be_from_str() { + for kind in enum_iterator::all::() { + let s = kind.to_string(); + let k = Kind::from_str(&s).map_err(|e| format!("Could not from_str {s}: {e}")).unwrap(); + assert_eq!(kind, k, "{kind}.to_string() returned {s} which was parsed as {k}"); + } + } } diff --git a/crates/meilisearch-types/src/versioning.rs b/crates/meilisearch-types/src/versioning.rs index 2ec9d9b0c..07e42c2ce 100644 --- a/crates/meilisearch-types/src/versioning.rs +++ b/crates/meilisearch-types/src/versioning.rs @@ -1,16 +1,19 @@ use std::fs; -use std::io::{self, ErrorKind}; +use std::io::{ErrorKind, Write}; use std::path::Path; +use milli::heed; +use tempfile::NamedTempFile; + /// The name of the file that contains the version of the database. pub const VERSION_FILE_NAME: &str = "VERSION"; -static VERSION_MAJOR: &str = env!("CARGO_PKG_VERSION_MAJOR"); -static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR"); -static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH"); +pub static VERSION_MAJOR: &str = env!("CARGO_PKG_VERSION_MAJOR"); +pub static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR"); +pub static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH"); /// Persists the version of the current Meilisearch binary to a VERSION file -pub fn create_current_version_file(db_path: &Path) -> io::Result<()> { +pub fn create_current_version_file(db_path: &Path) -> anyhow::Result<()> { create_version_file(db_path, VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) } @@ -19,39 +22,50 @@ pub fn create_version_file( major: &str, minor: &str, patch: &str, -) -> io::Result<()> { +) -> anyhow::Result<()> { let version_path = db_path.join(VERSION_FILE_NAME); - fs::write(version_path, format!("{}.{}.{}", major, minor, patch)) -} - -/// Ensures Meilisearch version is compatible with the database, returns an error versions mismatch. 
-pub fn check_version_file(db_path: &Path) -> anyhow::Result<()> { - let (major, minor, patch) = get_version(db_path)?; - - if major != VERSION_MAJOR || minor != VERSION_MINOR { - return Err(VersionFileError::VersionMismatch { major, minor, patch }.into()); - } - + // In order to persist the file later we must create it in the `data.ms` and not in `/tmp` + let mut file = NamedTempFile::new_in(db_path)?; + file.write_all(format!("{}.{}.{}", major, minor, patch).as_bytes())?; + file.flush()?; + file.persist(version_path)?; Ok(()) } -pub fn get_version(db_path: &Path) -> Result<(String, String, String), VersionFileError> { +pub fn get_version(db_path: &Path) -> Result<(u32, u32, u32), VersionFileError> { let version_path = db_path.join(VERSION_FILE_NAME); match fs::read_to_string(version_path) { Ok(version) => parse_version(&version), Err(error) => match error.kind() { ErrorKind::NotFound => Err(VersionFileError::MissingVersionFile), - _ => Err(error.into()), + _ => Err(anyhow::Error::from(error).into()), }, } } -pub fn parse_version(version: &str) -> Result<(String, String, String), VersionFileError> { - let version_components = version.split('.').collect::>(); +pub fn parse_version(version: &str) -> Result<(u32, u32, u32), VersionFileError> { + let version_components = version.trim().split('.').collect::>(); let (major, minor, patch) = match &version_components[..] { - [major, minor, patch] => (major.to_string(), minor.to_string(), patch.to_string()), - _ => return Err(VersionFileError::MalformedVersionFile), + [major, minor, patch] => ( + major.parse().map_err(|e| VersionFileError::MalformedVersionFile { + context: format!("Could not parse the major: {e}"), + })?, + minor.parse().map_err(|e| VersionFileError::MalformedVersionFile { + context: format!("Could not parse the minor: {e}"), + })?, + patch.parse().map_err(|e| VersionFileError::MalformedVersionFile { + context: format!("Could not parse the patch: {e}"), + })?, + ), + _ => { + return Err(VersionFileError::MalformedVersionFile { + context: format!( + "The version contains {} parts instead of 3 (major, minor and patch)", + version_components.len() + ), + }) + } }; Ok((major, minor, patch)) } @@ -64,15 +78,21 @@ pub enum VersionFileError { env!("CARGO_PKG_VERSION").to_string() )] MissingVersionFile, - #[error("Version file is corrupted and thus Meilisearch is unable to determine the version of the database.")] - MalformedVersionFile, + #[error("Version file is corrupted and thus Meilisearch is unable to determine the version of the database. {context}")] + MalformedVersionFile { context: String }, #[error( "Your database version ({major}.{minor}.{patch}) is incompatible with your current engine version ({}).\n\ To migrate data between Meilisearch versions, please follow our guide on https://www.meilisearch.com/docs/learn/update_and_migration/updating.", env!("CARGO_PKG_VERSION").to_string() )] - VersionMismatch { major: String, minor: String, patch: String }, + VersionMismatch { major: u32, minor: u32, patch: u32 }, + #[error("Database version {major}.{minor}.{patch} is higher than the Meilisearch version {VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_PATCH}. Downgrade is not supported")] + DowngradeNotSupported { major: u32, minor: u32, patch: u32 }, + #[error("Database version {major}.{minor}.{patch} is too old for the experimental dumpless upgrade feature. 
Please generate a dump using the v{major}.{minor}.{patch} and import it in the v{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_PATCH}")] + TooOldForAutomaticUpgrade { major: u32, minor: u32, patch: u32 }, + #[error("Error while modifying the database: {0}")] + ErrorWhileModifyingTheDatabase(#[from] heed::Error), #[error(transparent)] - IoError(#[from] std::io::Error), + AnyhowError(#[from] anyhow::Error), } diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index b11d90151..e25fd9400 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -14,42 +14,42 @@ default-run = "meilisearch" [dependencies] actix-cors = "0.7.0" -actix-http = { version = "3.8.0", default-features = false, features = [ +actix-http = { version = "3.9.0", default-features = false, features = [ "compress-brotli", "compress-gzip", "rustls-0_23", ] } actix-utils = "3.0.1" -actix-web = { version = "4.8.0", default-features = false, features = [ +actix-web = { version = "4.9.0", default-features = false, features = [ "macros", "compress-brotli", "compress-gzip", "cookies", "rustls-0_23", ] } -anyhow = { version = "1.0.86", features = ["backtrace"] } -async-trait = "0.1.81" -bstr = "1.9.1" -byte-unit = { version = "5.1.4", default-features = false, features = [ +anyhow = { version = "1.0.95", features = ["backtrace"] } +async-trait = "0.1.85" +bstr = "1.11.3" +byte-unit = { version = "5.1.6", default-features = false, features = [ "std", "byte", "serde", ] } -bytes = "1.6.0" -clap = { version = "4.5.9", features = ["derive", "env"] } -crossbeam-channel = "0.5.13" -deserr = { version = "0.6.2", features = ["actix-web"] } +bytes = "1.9.0" +clap = { version = "4.5.24", features = ["derive", "env"] } +crossbeam-channel = "0.5.14" +deserr = { version = "0.6.3", features = ["actix-web"] } dump = { path = "../dump" } either = "1.13.0" file-store = { path = "../file-store" } -flate2 = "1.0.30" +flate2 = "1.0.35" fst = "0.4.7" -futures = "0.3.30" -futures-util = "0.3.30" +futures = "0.3.31" +futures-util = "0.3.31" index-scheduler = { path = "../index-scheduler" } -indexmap = { version = "2.2.6", features = ["serde"] } -is-terminal = "0.4.12" -itertools = "0.13.0" +indexmap = { version = "2.7.0", features = ["serde"] } +is-terminal = "0.4.13" +itertools = "0.14.0" jsonwebtoken = "9.3.0" lazy_static = "1.5.0" meilisearch-auth = { path = "../meilisearch-auth" } @@ -57,82 +57,95 @@ meilisearch-types = { path = "../meilisearch-types" } mimalloc = { version = "0.1.43", default-features = false } mime = "0.3.17" num_cpus = "1.16.0" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } -once_cell = "1.19.0" -ordered-float = "4.2.1" +obkv = "0.3.0" +once_cell = "1.20.2" +ordered-float = "4.6.0" parking_lot = "0.12.3" permissive-json-pointer = { path = "../permissive-json-pointer" } -pin-project-lite = "0.2.14" +pin-project-lite = "0.2.16" platform-dirs = "0.3.0" prometheus = { version = "0.13.4", features = ["process"] } rand = "0.8.5" rayon = "1.10.0" -regex = "1.10.5" -reqwest = { version = "0.12.5", features = [ +regex = "1.11.1" +reqwest = { version = "0.12.12", features = [ "rustls-tls", "json", ], default-features = false } -rustls = { version = "0.23.11", features = ["ring"], default-features = false } -rustls-pki-types = { version = "1.7.0", features = ["alloc"] } -rustls-pemfile = "2.1.2" -segment = { version = "0.2.4" } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order"] } +rustls = { 
version = "0.23.20", features = ["ring"], default-features = false } +rustls-pki-types = { version = "1.10.1", features = ["alloc"] } +rustls-pemfile = "2.2.0" +segment = { version = "0.2.5" } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order"] } sha2 = "0.10.8" siphasher = "1.0.1" slice-group-by = "0.3.1" static-files = { version = "0.2.4", optional = true } -sysinfo = "0.30.13" -tar = "0.4.41" -tempfile = "3.10.1" -thiserror = "1.0.61" -time = { version = "0.3.36", features = [ +sysinfo = "0.33.1" +tar = "0.4.43" +tempfile = "3.15.0" +thiserror = "2.0.9" +time = { version = "0.3.37", features = [ "serde-well-known", "formatting", "parsing", "macros", ] } -tokio = { version = "1.38.0", features = ["full"] } -toml = "0.8.14" -uuid = { version = "1.10.0", features = ["serde", "v4"] } +tokio = { version = "1.42.0", features = ["full"] } +toml = "0.8.19" +uuid = { version = "1.11.0", features = ["serde", "v4"] } serde_urlencoded = "0.7.1" termcolor = "1.4.1" -url = { version = "2.5.2", features = ["serde"] } -tracing = "0.1.40" -tracing-subscriber = { version = "0.3.18", features = ["json"] } +url = { version = "2.5.4", features = ["serde"] } +tracing = "0.1.41" +tracing-subscriber = { version = "0.3.19", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } -tracing-actix-web = "0.7.11" +tracing-actix-web = "0.7.15" build-info = { version = "1.7.0", path = "../build-info" } -roaring = "0.10.2" +roaring = "0.10.10" mopa-maintained = "0.2.3" +utoipa = { version = "5.3.1", features = [ + "actix_extras", + "macros", + "non_strict_integers", + "preserve_order", + "uuid", + "time", + "openapi_extensions", +] } +utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] } [dev-dependencies] actix-rt = "2.10.0" brotli = "6.0.0" -insta = "1.39.0" +# fixed version due to format breakages in v1.40 +insta = "=1.39.0" manifest-dir-macros = "0.1.18" maplit = "1.0.2" meili-snap = { path = "../meili-snap" } temp-env = "0.3.6" urlencoding = "2.1.3" -wiremock = "0.6.0" +wiremock = "0.6.2" yaup = "0.3.1" [build-dependencies] -anyhow = { version = "1.0.86", optional = true } -cargo_toml = { version = "0.20.3", optional = true } +anyhow = { version = "1.0.95", optional = true } +cargo_toml = { version = "0.21.0", optional = true } hex = { version = "0.4.3", optional = true } -reqwest = { version = "0.12.5", features = [ +reqwest = { version = "0.12.12", features = [ "blocking", "rustls-tls", ], default-features = false, optional = true } sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.4", optional = true } -tempfile = { version = "3.10.1", optional = true } -zip = { version = "2.1.3", optional = true } +tempfile = { version = "3.15.0", optional = true } +zip = { version = "2.2.2", optional = true } [features] default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] +swagger = ["utoipa-scalar"] +test-ollama = [] mini-dashboard = [ "static-files", "anyhow", @@ -157,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" -sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.18/build.zip" +sha1 = "b408a30dcb6e20cddb0c153c23385bcac4c8e912" diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs 
b/crates/meilisearch/src/analytics/segment_analytics.rs index 7dc746b14..504701739 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -31,6 +31,7 @@ use crate::routes::{create_all_stats, Stats}; use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; +const MEILI_SERVER_PROVIDER: &str = "MEILI_SERVER_PROVIDER"; /// Write the instance-uid in the `data.ms` and in `~/.config/MeiliSearch/path-to-db-instance-uid`. Ignore the errors. fn write_user_id(db_path: &Path, user_id: &InstanceUid) { @@ -177,23 +178,28 @@ impl SegmentAnalytics { /// This structure represent the `infos` field we send in the analytics. /// It's quite close to the `Opt` structure except all sensitive informations /// have been simplified to a boolean. -/// It's send as-is in amplitude thus you should never update a name of the +/// It's sent as-is in amplitude thus you should never update a name of the /// struct without the approval of the PM. #[derive(Debug, Clone, Serialize)] struct Infos { env: String, experimental_contains_filter: bool, - experimental_vector_store: bool, experimental_enable_metrics: bool, experimental_edit_documents_by_function: bool, experimental_search_queue_size: usize, experimental_drop_search_after: usize, experimental_nb_searches_per_core: usize, experimental_logs_mode: LogMode, + experimental_dumpless_upgrade: bool, experimental_replication_parameters: bool, experimental_enable_logs_route: bool, experimental_reduce_indexing_memory_usage: bool, experimental_max_number_of_batched_tasks: usize, + experimental_limit_batched_tasks_total_size: u64, + experimental_network: bool, + experimental_get_task_documents_route: bool, + experimental_composite_embedders: bool, + experimental_embedding_cache_entries: usize, gpu_enabled: bool, db_path: bool, import_dump: bool, @@ -235,10 +241,13 @@ impl Infos { experimental_drop_search_after, experimental_nb_searches_per_core, experimental_logs_mode, + experimental_dumpless_upgrade, experimental_replication_parameters, experimental_enable_logs_route, experimental_reduce_indexing_memory_usage, experimental_max_number_of_batched_tasks, + experimental_limit_batched_tasks_total_size, + experimental_embedding_cache_entries, http_addr, master_key: _, env, @@ -278,11 +287,13 @@ impl Infos { indexer_options; let RuntimeTogglableFeatures { - vector_store, metrics, logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, + composite_embedders, } = features; // We're going to override every sensible information. 
@@ -290,16 +301,20 @@ impl Infos { Self { env, experimental_contains_filter: experimental_contains_filter | contains_filter, - experimental_vector_store: vector_store, experimental_edit_documents_by_function: edit_documents_by_function, experimental_enable_metrics: experimental_enable_metrics | metrics, experimental_search_queue_size, experimental_drop_search_after: experimental_drop_search_after.into(), experimental_nb_searches_per_core: experimental_nb_searches_per_core.into(), experimental_logs_mode, + experimental_dumpless_upgrade, experimental_replication_parameters, experimental_enable_logs_route: experimental_enable_logs_route | logs_route, experimental_reduce_indexing_memory_usage, + experimental_network: network, + experimental_get_task_documents_route: get_task_documents_route, + experimental_composite_embedders: composite_embedders, + experimental_embedding_cache_entries, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), @@ -314,6 +329,7 @@ impl Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, + experimental_limit_batched_tasks_total_size, task_queue_webhook: task_webhook_url.is_some(), task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), @@ -354,7 +370,7 @@ impl Segment { "cores": sys.cpus().len(), "ram_size": sys.total_memory(), "disk_size": disks.iter().map(|disk| disk.total_space()).max(), - "server_provider": std::env::var("MEILI_SERVER_PROVIDER").ok(), + "server_provider": std::env::var(MEILI_SERVER_PROVIDER).ok(), }) }); let number_of_documents = @@ -377,10 +393,18 @@ impl Segment { index_scheduler: Arc, auth_controller: Arc, ) { - const INTERVAL: Duration = Duration::from_secs(60 * 60); // one hour - // The first batch must be sent after one hour. + let interval: Duration = match std::env::var(MEILI_SERVER_PROVIDER) { + Ok(provider) if provider.starts_with("meili_cloud:") => { + Duration::from_secs(60 * 60) // one hour + } + _ => { + // We're an open source instance + Duration::from_secs(60 * 60 * 24) // one day + } + }; + let mut interval = - tokio::time::interval_at(tokio::time::Instant::now() + INTERVAL, INTERVAL); + tokio::time::interval_at(tokio::time::Instant::now() + interval, interval); loop { select! { @@ -426,13 +450,9 @@ impl Segment { &AuthFilter::default(), ) { // Replace the version number with the prototype name if any. - let version = if let Some(prototype) = build_info::DescribeResult::from_build() + let version = build_info::DescribeResult::from_build() .and_then(|describe| describe.as_prototype()) - { - prototype - } else { - env!("CARGO_PKG_VERSION") - }; + .unwrap_or(env!("CARGO_PKG_VERSION")); let _ = self .batcher diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index 5c4ce171f..b13eb8d7c 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -4,6 +4,7 @@ use byte_unit::{Byte, UnitType}; use meilisearch_types::document_formats::{DocumentFormatError, PayloadType}; use meilisearch_types::error::{Code, ErrorCode, ResponseError}; use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError}; +use meilisearch_types::milli; use meilisearch_types::milli::OrderBy; use serde_json::Value; use tokio::task::JoinError; @@ -18,15 +19,15 @@ pub enum MeilisearchHttpError { #[error("The Content-Type `{0}` does not support the use of a csv delimiter. 
The csv delimiter can only be used with the Content-Type `text/csv`.")] CsvDelimiterWithWrongContentType(String), #[error( - "The Content-Type `{0}` is invalid. Accepted values for the Content-Type header are: {}", - .1.iter().map(|s| format!("`{}`", s)).collect::>().join(", ") + "The Content-Type `{}` is invalid. Accepted values for the Content-Type header are: {}", + .0, .1.iter().map(|s| format!("`{}`", s)).collect::>().join(", ") )] InvalidContentType(String, Vec), #[error("Document `{0}` not found.")] DocumentNotFound(String), #[error("Sending an empty filter is forbidden.")] EmptyFilter, - #[error("Invalid syntax for the filter parameter: `expected {}, found: {1}`.", .0.join(", "))] + #[error("Invalid syntax for the filter parameter: `expected {}, found: {}`.", .0.join(", "), .1)] InvalidExpression(&'static [&'static str], Value), #[error("Using `federationOptions` is not allowed in a non-federated search.\n - Hint: remove `federationOptions` from query #{0} or add `federation` to the request.")] FederationOptionsInNonFederatedRequest(usize), @@ -62,8 +63,11 @@ pub enum MeilisearchHttpError { HeedError(#[from] meilisearch_types::heed::Error), #[error(transparent)] IndexScheduler(#[from] index_scheduler::Error), - #[error(transparent)] - Milli(#[from] meilisearch_types::milli::Error), + #[error("{}", match .index_name { + Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name), + _ => format!("{error}") + })] + Milli { error: milli::Error, index_name: Option }, #[error(transparent)] Payload(#[from] PayloadError), #[error(transparent)] @@ -76,6 +80,12 @@ pub enum MeilisearchHttpError { MissingSearchHybrid, } +impl MeilisearchHttpError { + pub(crate) fn from_milli(error: milli::Error, index_name: Option) -> Self { + Self::Milli { error, index_name } + } +} + impl ErrorCode for MeilisearchHttpError { fn error_code(&self) -> Code { match self { @@ -95,7 +105,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::SerdeJson(_) => Code::Internal, MeilisearchHttpError::HeedError(_) => Code::Internal, MeilisearchHttpError::IndexScheduler(e) => e.error_code(), - MeilisearchHttpError::Milli(e) => e.error_code(), + MeilisearchHttpError::Milli { error, .. 
} => error.error_code(), MeilisearchHttpError::Payload(e) => e.error_code(), MeilisearchHttpError::FileStore(_) => Code::Internal, MeilisearchHttpError::DocumentFormat(e) => e.error_code(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 633ad2776..2a32a6be8 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -7,6 +7,8 @@ pub mod extractors; pub mod metrics; pub mod middleware; pub mod option; +#[cfg(test)] +mod option_test; pub mod routes; pub mod search; pub mod search_queue; @@ -30,14 +32,18 @@ use analytics::Analytics; use anyhow::bail; use error::PayloadError; use extractors::payload::PayloadConfig; +use index_scheduler::versioning::Versioning; use index_scheduler::{IndexScheduler, IndexSchedulerOptions}; -use meilisearch_auth::AuthController; +use meilisearch_auth::{open_auth_store_env, AuthController}; +use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use meilisearch_types::milli::update::{IndexDocumentsConfig, IndexDocumentsMethod}; use meilisearch_types::settings::apply_settings_to_builder; use meilisearch_types::tasks::KindWithContent; -use meilisearch_types::versioning::{check_version_file, create_current_version_file}; -use meilisearch_types::{compression, milli, VERSION_FILE_NAME}; +use meilisearch_types::versioning::{ + create_current_version_file, get_version, VersionFileError, VERSION_MINOR, VERSION_PATCH, +}; +use meilisearch_types::{compression, heed, milli, VERSION_FILE_NAME}; pub use option::Opt; use option::ScheduleSnapshot; use search_queue::SearchQueue; @@ -186,13 +192,13 @@ impl tracing_actix_web::RootSpanBuilder for AwebTracingLogger { if let Some(error) = response.response().error() { // use the status code already constructed for the outgoing HTTP response - span.record("error", &tracing::field::display(error.as_response_error())); + span.record("error", tracing::field::display(error.as_response_error())); } } Err(error) => { let code: i32 = error.error_response().status().as_u16().into(); span.record("status_code", code); - span.record("error", &tracing::field::display(error.as_response_error())); + span.record("error", tracing::field::display(error.as_response_error())); } }; } @@ -204,13 +210,48 @@ enum OnFailure { } pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc)> { + let index_scheduler_opt = IndexSchedulerOptions { + version_file_path: opt.db_path.join(VERSION_FILE_NAME), + auth_path: opt.db_path.join("auth"), + tasks_path: opt.db_path.join("tasks"), + update_file_path: opt.db_path.join("update_files"), + indexes_path: opt.db_path.join("indexes"), + snapshots_path: opt.snapshot_dir.clone(), + dumps_path: opt.dump_dir.clone(), + webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), + webhook_authorization_header: opt.task_webhook_authorization_header.clone(), + task_db_size: opt.max_task_db_size.as_u64() as usize, + index_base_map_size: opt.max_index_size.as_u64() as usize, + enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, + indexer_config: Arc::new((&opt.indexer_options).try_into()?), + autobatching_enabled: true, + cleanup_enabled: !opt.experimental_replication_parameters, + max_number_of_tasks: 1_000_000, + max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, + batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size, + index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize, + 
index_count: DEFAULT_INDEX_COUNT, + instance_features: opt.to_instance_features(), + auto_upgrade: opt.experimental_dumpless_upgrade, + embedding_cache_cap: opt.experimental_embedding_cache_entries, + }; + let bin_major: u32 = VERSION_MAJOR.parse().unwrap(); + let bin_minor: u32 = VERSION_MINOR.parse().unwrap(); + let bin_patch: u32 = VERSION_PATCH.parse().unwrap(); + let binary_version = (bin_major, bin_minor, bin_patch); + let empty_db = is_empty_db(&opt.db_path); let (index_scheduler, auth_controller) = if let Some(ref snapshot_path) = opt.import_snapshot { let snapshot_path_exists = snapshot_path.exists(); // the db is empty and the snapshot exists, import it if empty_db && snapshot_path_exists { match compression::from_tar_gz(snapshot_path, &opt.db_path) { - Ok(()) => open_or_create_database_unchecked(opt, OnFailure::RemoveDb)?, + Ok(()) => open_or_create_database_unchecked( + opt, + index_scheduler_opt, + OnFailure::RemoveDb, + binary_version, // the db is empty + )?, Err(e) => { std::fs::remove_dir_all(&opt.db_path)?; return Err(e); @@ -227,14 +268,18 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< bail!("snapshot doesn't exist at {}", snapshot_path.display()) // the snapshot and the db exist, and we can ignore the snapshot because of the ignore_snapshot_if_db_exists flag } else { - open_or_create_database(opt, empty_db)? + open_or_create_database(opt, index_scheduler_opt, empty_db, binary_version)? } } else if let Some(ref path) = opt.import_dump { let src_path_exists = path.exists(); // the db is empty and the dump exists, import it if empty_db && src_path_exists { - let (mut index_scheduler, mut auth_controller) = - open_or_create_database_unchecked(opt, OnFailure::RemoveDb)?; + let (mut index_scheduler, mut auth_controller) = open_or_create_database_unchecked( + opt, + index_scheduler_opt, + OnFailure::RemoveDb, + binary_version, // the db is empty + )?; match import_dump(&opt.db_path, path, &mut index_scheduler, &mut auth_controller) { Ok(()) => (index_scheduler, auth_controller), Err(e) => { @@ -254,10 +299,10 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< // the dump and the db exist and we can ignore the dump because of the ignore_dump_if_db_exists flag // or, the dump is missing but we can ignore that because of the ignore_missing_dump flag } else { - open_or_create_database(opt, empty_db)? + open_or_create_database(opt, index_scheduler_opt, empty_db, binary_version)? } } else { - open_or_create_database(opt, empty_db)? + open_or_create_database(opt, index_scheduler_opt, empty_db, binary_version)? }; // We create a loop in a thread that registers snapshotCreation tasks @@ -285,41 +330,23 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< /// Try to start the IndexScheduler and AuthController without checking the VERSION file or anything. fn open_or_create_database_unchecked( opt: &Opt, + index_scheduler_opt: IndexSchedulerOptions, on_failure: OnFailure, + version: (u32, u32, u32), ) -> anyhow::Result<(IndexScheduler, AuthController)> { // we don't want to create anything in the data.ms yet, thus we // wrap our two builders in a closure that'll be executed later. 
- let auth_controller = AuthController::new(&opt.db_path, &opt.master_key); - let instance_features = opt.to_instance_features(); + std::fs::create_dir_all(&index_scheduler_opt.auth_path)?; + let auth_env = open_auth_store_env(&index_scheduler_opt.auth_path).unwrap(); + let auth_controller = AuthController::new(auth_env.clone(), &opt.master_key); let index_scheduler_builder = || -> anyhow::Result<_> { - Ok(IndexScheduler::new(IndexSchedulerOptions { - version_file_path: opt.db_path.join(VERSION_FILE_NAME), - auth_path: opt.db_path.join("auth"), - tasks_path: opt.db_path.join("tasks"), - update_file_path: opt.db_path.join("update_files"), - indexes_path: opt.db_path.join("indexes"), - snapshots_path: opt.snapshot_dir.clone(), - dumps_path: opt.dump_dir.clone(), - webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()), - webhook_authorization_header: opt.task_webhook_authorization_header.clone(), - task_db_size: opt.max_task_db_size.as_u64() as usize, - index_base_map_size: opt.max_index_size.as_u64() as usize, - enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage, - indexer_config: (&opt.indexer_options).try_into()?, - autobatching_enabled: true, - cleanup_enabled: !opt.experimental_replication_parameters, - max_number_of_tasks: 1_000_000, - max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, - index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize, - index_count: DEFAULT_INDEX_COUNT, - instance_features, - })?) + Ok(IndexScheduler::new(index_scheduler_opt, auth_env, version)?) }; match ( index_scheduler_builder(), auth_controller.map_err(anyhow::Error::from), - create_current_version_file(&opt.db_path).map_err(anyhow::Error::from), + create_current_version_file(&opt.db_path), ) { (Ok(i), Ok(a), Ok(())) => Ok((i, a)), (Err(e), _, _) | (_, Err(e), _) | (_, _, Err(e)) => { @@ -331,16 +358,103 @@ fn open_or_create_database_unchecked( } } +/// Ensures Meilisearch version is compatible with the database, returns an error in case of version mismatch. 
+/// Returns the version that was contained in the version file +fn check_version( + opt: &Opt, + index_scheduler_opt: &IndexSchedulerOptions, + binary_version: (u32, u32, u32), +) -> anyhow::Result<(u32, u32, u32)> { + let (bin_major, bin_minor, bin_patch) = binary_version; + let (db_major, db_minor, db_patch) = get_version(&opt.db_path)?; + + if db_major != bin_major || db_minor != bin_minor || db_patch != bin_patch { + if opt.experimental_dumpless_upgrade { + update_version_file_for_dumpless_upgrade( + opt, + index_scheduler_opt, + (db_major, db_minor, db_patch), + (bin_major, bin_minor, bin_patch), + )?; + } else { + return Err(VersionFileError::VersionMismatch { + major: db_major, + minor: db_minor, + patch: db_patch, + } + .into()); + } + } + + Ok((db_major, db_minor, db_patch)) +} + +/// Checks that the requested dumpless upgrade is supported, then persists the version of the current Meilisearch binary to the VERSION file +pub fn update_version_file_for_dumpless_upgrade( + opt: &Opt, + index_scheduler_opt: &IndexSchedulerOptions, + from: (u32, u32, u32), + to: (u32, u32, u32), +) -> Result<(), VersionFileError> { + let (from_major, from_minor, from_patch) = from; + let (to_major, to_minor, to_patch) = to; + + // Early exit in case of error + if from_major > to_major + || (from_major == to_major && from_minor > to_minor) + || (from_major == to_major && from_minor == to_minor && from_patch > to_patch) + { + return Err(VersionFileError::DowngradeNotSupported { + major: from_major, + minor: from_minor, + patch: from_patch, + }); + } else if from_major < 1 || (from_major == to_major && from_minor < 12) { + return Err(VersionFileError::TooOldForAutomaticUpgrade { + major: from_major, + minor: from_minor, + patch: from_patch, + }); + } + + // In the case of v1.12, the index-scheduler didn't store its internal version at the time. + // => We must write it immediately **in the index-scheduler**: if we only updated the version file, + // there would be a risk of DB corruption if a restart happened after writing the version file but before + // writing the version in the index-scheduler. See + if from_major == 1 && from_minor == 12 { + let env = unsafe { + heed::EnvOpenOptions::new() + .read_txn_without_tls() + .max_dbs(Versioning::nb_db()) + .map_size(index_scheduler_opt.task_db_size) + .open(&index_scheduler_opt.tasks_path) + }?; + let mut wtxn = env.write_txn()?; + let versioning = Versioning::raw_new(&env, &mut wtxn)?; + versioning.set_version(&mut wtxn, (from_major, from_minor, from_patch))?; + wtxn.commit()?; + // Should be instant since we're the only one using the env + env.prepare_for_closing().wait(); + } + + create_current_version_file(&opt.db_path)?; + Ok(()) +} + /// Ensure you're in a valid state and open the IndexScheduler + AuthController for you. fn open_or_create_database( opt: &Opt, + index_scheduler_opt: IndexSchedulerOptions, empty_db: bool, + binary_version: (u32, u32, u32), ) -> anyhow::Result<(IndexScheduler, AuthController)> { - if !empty_db { - check_version_file(&opt.db_path)?; - } + let version = if !empty_db { + check_version(opt, &index_scheduler_opt, binary_version)? + } else { + binary_version + }; - open_or_create_database_unchecked(opt, OnFailure::KeepDb) + open_or_create_database_unchecked(opt, index_scheduler_opt, OnFailure::KeepDb, version) } fn import_dump( @@ -382,10 +496,13 @@ fn import_dump( keys.push(key); } - // 3. Import the runtime features. + // 3. 
Import the runtime features and network let features = dump_reader.features()?.unwrap_or_default(); index_scheduler.put_runtime_features(features)?; + let network = dump_reader.network()?.cloned().unwrap_or_default(); + index_scheduler.put_network(network)?; + let indexer_config = index_scheduler.indexer_config(); // /!\ The tasks must be imported AFTER importing the indexes or else the scheduler might @@ -395,6 +512,7 @@ fn import_dump( for index_reader in dump_reader.indexes()? { let mut index_reader = index_reader?; let metadata = index_reader.metadata(); + let uid = metadata.uid.clone(); tracing::info!("Importing index `{}`.", metadata.uid); let date = Some((metadata.created_at, metadata.updated_at)); @@ -432,7 +550,7 @@ fn import_dump( let reader = DocumentsBatchReader::from_reader(reader)?; let embedder_configs = index.embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; let builder = milli::update::IndexDocuments::new( &mut wtxn, @@ -454,11 +572,19 @@ fn import_dump( builder.execute()?; wtxn.commit()?; tracing::info!("All documents successfully imported."); + + index_scheduler.refresh_index_stats(&uid)?; } + // 5. Import the queue let mut index_scheduler_dump = index_scheduler.register_dumped_task()?; + // 5.1. Import the batches + for ret in dump_reader.batches()? { + let batch = ret?; + index_scheduler_dump.register_dumped_batch(batch)?; + } - // 5. Import the tasks. + // 5.2. Import the tasks for ret in dump_reader.tasks()? { let (task, file) = ret?; index_scheduler_dump.register_dumped_task(task, file)?; diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index c0652bf1e..b16dda097 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -20,14 +20,14 @@ use meilisearch::{ LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; -use mimalloc::MiMalloc; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt as _; use tracing_subscriber::Layer; +#[cfg(not(windows))] #[global_allocator] -static ALLOC: MiMalloc = MiMalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) @@ -69,7 +69,7 @@ fn setup(opt: &Opt) -> anyhow::Result<(LogRouteHandle, LogStderrHandle)> { Ok((route_layer_handle, stderr_layer_handle)) } -fn on_panic(info: &std::panic::PanicInfo) { +fn on_panic(info: &std::panic::PanicHookInfo) { let info = info.to_string().replace('\n', " "); tracing::error!(%info); } @@ -129,6 +129,11 @@ async fn try_main() -> anyhow::Result<()> { print_launch_resume(&opt, analytics.clone(), config_read_from); + tokio::spawn(async move { + tokio::signal::ctrl_c().await.unwrap(); + std::process::exit(130); + }); + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs index be9fbfc49..a48554e6c 100644 --- a/crates/meilisearch/src/metrics.rs +++ b/crates/meilisearch/src/metrics.rs @@ -1,7 +1,7 @@ use lazy_static::lazy_static; use prometheus::{ - opts, register_histogram_vec, register_int_counter_vec, register_int_gauge, - register_int_gauge_vec, HistogramVec, IntCounterVec, 
IntGauge, IntGaugeVec, + opts, register_gauge, register_histogram_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Gauge, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, }; lazy_static! { @@ -63,4 +63,9 @@ lazy_static! { "Meilisearch Searches Being Processed" )) .expect("Can't create a metric"); + pub static ref MEILISEARCH_TASK_QUEUE_LATENCY_SECONDS: Gauge = register_gauge!( + "meilisearch_task_queue_latency_seconds", + "Meilisearch Task Queue Latency in Seconds", + ) + .expect("Can't create a metric"); } diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 7e87a5a2c..94849c199 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -16,7 +16,7 @@ use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::ThreadPoolNoAbortBuilder; use rustls::server::{ServerSessionMemoryCache, WebPkiClientVerifier}; use rustls::RootCertStore; -use rustls_pemfile::{certs, rsa_private_keys}; +use rustls_pemfile::{certs, ec_private_keys, rsa_private_keys}; use serde::{Deserialize, Serialize}; use sysinfo::{MemoryRefreshKind, RefreshKind, System}; use url::Url; @@ -49,6 +49,7 @@ const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS"; const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR"; const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL"; const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE"; +const MEILI_EXPERIMENTAL_DUMPLESS_UPGRADE: &str = "MEILI_EXPERIMENTAL_DUMPLESS_UPGRADE"; const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS"; const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE"; const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER"; @@ -60,7 +61,10 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS"; - +const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str = + "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_SIZE"; +const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str = + "MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; const DEFAULT_HTTP_ADDR: &str = "localhost:7700"; @@ -200,20 +204,23 @@ pub struct Opt { #[clap(long, env = MEILI_TASK_WEBHOOK_URL)] pub task_webhook_url: Option, - /// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified. + /// The Authorization header to send on the webhook URL whenever + /// a task finishes so a third party can be notified. #[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)] pub task_webhook_authorization_header: Option, /// Deactivates Meilisearch's built-in telemetry when provided. /// - /// Meilisearch automatically collects data from all instances that do not opt out using this flag. - /// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted + /// Meilisearch automatically collects data from all instances that + /// do not opt out using this flag. All gathered data is used solely + /// for the purpose of improving Meilisearch, and can be deleted /// at any time. #[serde(default)] // we can't send true #[clap(long, env = MEILI_NO_ANALYTICS)] pub no_analytics: bool, - /// Sets the maximum size of the index. 
Value must be given in bytes or explicitly stating a base unit (for instance: 107374182400, '107.7Gb', or '107374 Mb'). + /// Sets the maximum size of the index. Value must be given in bytes or explicitly + /// stating a base unit (for instance: 107374182400, '107.7Gb', or '107374 Mb'). #[clap(skip = default_max_index_size())] #[serde(skip, default = "default_max_index_size")] pub max_index_size: Byte, @@ -333,43 +340,53 @@ pub struct Opt { /// Defines how much detail should be present in Meilisearch's logs. /// - /// Meilisearch currently supports six log levels, listed in order of increasing verbosity: OFF, ERROR, WARN, INFO, DEBUG, TRACE. + /// Meilisearch currently supports six log levels, listed in order of + /// increasing verbosity: OFF, ERROR, WARN, INFO, DEBUG, TRACE. #[clap(long, env = MEILI_LOG_LEVEL, default_value_t)] #[serde(default)] pub log_level: LogLevel, - /// Experimental contains filter feature. For more information, see: + /// Experimental contains filter feature. For more information, + /// see: /// /// Enables the experimental contains filter operator. #[clap(long, env = MEILI_EXPERIMENTAL_CONTAINS_FILTER)] #[serde(default)] pub experimental_contains_filter: bool, - /// Experimental metrics feature. For more information, see: + /// Experimental metrics feature. For more information, + /// see: /// /// Enables the Prometheus metrics on the `GET /metrics` endpoint. #[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_METRICS)] #[serde(default)] pub experimental_enable_metrics: bool, - /// Experimental search queue size. For more information, see: + /// Experimental search queue size. For more information, + /// see: + /// + /// Lets you customize the size of the search queue. Meilisearch processes + /// your search requests as fast as possible but once the queue is full + /// it starts returning HTTP 503, Service Unavailable. /// - /// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the - /// queue is full it starts returning HTTP 503, Service Unavailable. /// The default value is 1000. #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())] #[serde(default = "default_experimental_search_queue_size")] pub experimental_search_queue_size: usize, - /// Experimental drop search after. For more information, see: + /// Experimental drop search after. For more information, + /// see: + /// + /// Let you customize after how many seconds Meilisearch should consider + /// a search request irrelevant and drop it. /// - /// Let you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it. /// The default value is 60. #[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] #[serde(default = "default_drop_search_after")] pub experimental_drop_search_after: NonZeroUsize, - /// Experimental number of searches per core. For more information, see: + /// Experimental number of searches per core. For more information, + /// see: /// /// Lets you customize how many search requests can run on each core concurrently. /// The default value is 4. @@ -377,16 +394,26 @@ pub struct Opt { #[serde(default = "default_nb_searches_per_core")] pub experimental_nb_searches_per_core: NonZeroUsize, - /// Experimental logs mode feature. For more information, see: + /// Experimental logs mode feature. For more information, + /// see: /// /// Change the mode of the logs on the console. 
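The new `meilisearch_task_queue_latency_seconds` metric registered in `metrics.rs` above is a plain `prometheus::Gauge`, so it has to be refreshed explicitly somewhere, presumably from the age of the oldest task still enqueued when `/metrics` is scraped. A minimal sketch of that kind of update; the helper name and the `oldest_enqueued_at` value are assumptions for illustration, not code from this PR:

```rust
use std::time::SystemTime;

use prometheus::Gauge;

// Hypothetical helper: refresh the latency gauge from the enqueue time of the
// oldest task still waiting in the queue (None when the queue is empty).
fn refresh_task_queue_latency(gauge: &Gauge, oldest_enqueued_at: Option<SystemTime>) {
    let latency_secs = oldest_enqueued_at
        .and_then(|enqueued_at| enqueued_at.elapsed().ok())
        .map(|age| age.as_secs_f64())
        .unwrap_or(0.0);
    gauge.set(latency_secs);
}
```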
#[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)] #[serde(default)] pub experimental_logs_mode: LogMode, - /// Experimental logs route feature. For more information, see: + /// Experimental dumpless upgrade. For more information, see: /// - /// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, and the `DELETE /logs/stream` to stop receiving logs. + /// When set, Meilisearch will auto-update its database without using a dump. + #[clap(long, env = MEILI_EXPERIMENTAL_DUMPLESS_UPGRADE, default_value_t)] + #[serde(default)] + pub experimental_dumpless_upgrade: bool, + + /// Experimental logs route feature. For more information, + /// see: + /// + /// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, + /// and the `DELETE /logs/stream` to stop receiving logs. #[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE)] #[serde(default)] pub experimental_enable_logs_route: bool, @@ -396,21 +423,38 @@ pub struct Opt { /// /// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now /// - Lets you specify a custom task ID upon registering a task - /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed) + /// - Lets you execute dry-register a task (get an answer from the route but nothing is actually + /// registered in meilisearch and it won't be processed) #[clap(long, env = MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS)] #[serde(default)] pub experimental_replication_parameters: bool, - /// Experimental RAM reduction during indexing, do not use in production, see: + /// Experimental RAM reduction during indexing, do not use in production, + /// see: #[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)] #[serde(default)] pub experimental_reduce_indexing_memory_usage: bool, - /// Experimentally reduces the maximum number of tasks that will be processed at once, see: + /// Experimentally reduces the maximum number of tasks that will be processed at once, + /// see: #[clap(long, env = MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())] #[serde(default = "default_limit_batched_tasks")] pub experimental_max_number_of_batched_tasks: usize, + /// Experimentally reduces the maximum total size, in bytes, of tasks that will be processed at once, + /// see: + #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())] + #[serde(default = "default_limit_batched_tasks_total_size")] + pub experimental_limit_batched_tasks_total_size: u64, + + /// Enables experimental caching of search query embeddings. The value represents the maximal number of entries in the cache of each + /// distinct embedder. + /// + /// For more information, see . 
+ #[clap(long, env = MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES, default_value_t = default_embedding_cache_entries())] + #[serde(default = "default_embedding_cache_entries")] + pub experimental_embedding_cache_entries: usize, + #[serde(flatten)] #[clap(flatten)] pub indexer_options: IndexerOpts, @@ -482,7 +526,6 @@ impl Opt { max_index_size: _, max_task_db_size: _, http_payload_size_limit, - experimental_max_number_of_batched_tasks, ssl_cert_path, ssl_key_path, ssl_auth_path, @@ -509,9 +552,13 @@ impl Opt { experimental_drop_search_after, experimental_nb_searches_per_core, experimental_logs_mode, + experimental_dumpless_upgrade, experimental_enable_logs_route, experimental_replication_parameters, experimental_reduce_indexing_memory_usage, + experimental_max_number_of_batched_tasks, + experimental_limit_batched_tasks_total_size, + experimental_embedding_cache_entries, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); @@ -534,10 +581,6 @@ impl Opt { MEILI_HTTP_PAYLOAD_SIZE_LIMIT, http_payload_size_limit.to_string(), ); - export_to_env_if_not_present( - MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, - experimental_max_number_of_batched_tasks.to_string(), - ); if let Some(ssl_cert_path) = ssl_cert_path { export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path); } @@ -584,6 +627,10 @@ impl Opt { MEILI_EXPERIMENTAL_LOGS_MODE, experimental_logs_mode.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_DUMPLESS_UPGRADE, + experimental_dumpless_upgrade.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS, experimental_replication_parameters.to_string(), @@ -596,6 +643,18 @@ impl Opt { MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE, experimental_reduce_indexing_memory_usage.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, + experimental_max_number_of_batched_tasks.to_string(), + ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, + experimental_limit_batched_tasks_total_size.to_string(), + ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES, + experimental_embedding_cache_entries.to_string(), + ); indexer_options.export_to_env(); } @@ -760,8 +819,8 @@ impl MaxMemory { /// Returns the total amount of bytes available or `None` if this system isn't supported. 
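The new `experimental_limit_batched_tasks_total_size` option caps the cumulative payload size of a batch, complementing the existing cap on the number of batched tasks; its default of `u64::MAX` leaves behaviour unchanged unless the flag is set. A rough sketch of the intended semantics, with `Task` and `size_in_bytes` as illustrative stand-ins rather than the scheduler's real types:

```rust
// Illustrative only: stop filling a batch once the accumulated task size
// would exceed the configured `experimental_limit_batched_tasks_total_size`.
struct Task {
    size_in_bytes: u64,
}

fn take_batch(candidates: Vec<Task>, max_total_size: u64) -> Vec<Task> {
    let mut total = 0u64;
    let mut batch = Vec::new();
    for task in candidates {
        if !batch.is_empty() && total.saturating_add(task.size_in_bytes) > max_total_size {
            break;
        }
        total = total.saturating_add(task.size_in_bytes);
        batch.push(task);
    }
    batch
}
```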
fn total_memory_bytes() -> Option { if sysinfo::IS_SUPPORTED_SYSTEM { - let memory_kind = RefreshKind::new().with_memory(MemoryRefreshKind::new().with_ram()); - let mut system = System::new_with_specifics(memory_kind); + let mem_kind = RefreshKind::nothing().with_memory(MemoryRefreshKind::nothing().with_ram()); + let mut system = System::new_with_specifics(mem_kind); system.refresh_memory(); Some(system.total_memory()) } else { @@ -815,7 +874,7 @@ fn load_private_key( filename: PathBuf, ) -> anyhow::Result> { let rsa_keys = { - let keyfile = fs::File::open(filename.clone()) + let keyfile = fs::File::open(&filename) .map_err(|_| anyhow::anyhow!("cannot open private key file"))?; let mut reader = BufReader::new(keyfile); rsa_private_keys(&mut reader) @@ -824,7 +883,7 @@ fn load_private_key( }; let pkcs8_keys = { - let keyfile = fs::File::open(filename) + let keyfile = fs::File::open(&filename) .map_err(|_| anyhow::anyhow!("cannot open private key file"))?; let mut reader = BufReader::new(keyfile); rustls_pemfile::pkcs8_private_keys(&mut reader).collect::, _>>().map_err( @@ -836,12 +895,23 @@ fn load_private_key( )? }; + let ec_keys = { + let keyfile = fs::File::open(&filename) + .map_err(|_| anyhow::anyhow!("cannot open private key file"))?; + let mut reader = BufReader::new(keyfile); + ec_private_keys(&mut reader) + .collect::, _>>() + .map_err(|_| anyhow::anyhow!("file contains invalid ec private key"))? + }; + // prefer to load pkcs8 keys if !pkcs8_keys.is_empty() { Ok(rustls::pki_types::PrivateKeyDer::Pkcs8(pkcs8_keys[0].clone_key())) - } else { - assert!(!rsa_keys.is_empty()); + } else if !rsa_keys.is_empty() { Ok(rustls::pki_types::PrivateKeyDer::Pkcs1(rsa_keys[0].clone_key())) + } else { + assert!(!ec_keys.is_empty()); + Ok(rustls::pki_types::PrivateKeyDer::Sec1(ec_keys[0].clone_key())) } } @@ -870,7 +940,6 @@ where } /// Functions used to get default value for `Opt` fields, needs to be function because of serde's default attribute. - fn default_db_path() -> PathBuf { PathBuf::from(DEFAULT_DB_PATH) } @@ -899,6 +968,14 @@ fn default_limit_batched_tasks() -> usize { usize::MAX } +fn default_limit_batched_tasks_total_size() -> u64 { + u64::MAX +} + +fn default_embedding_cache_entries() -> usize { + 0 +} + fn default_snapshot_dir() -> PathBuf { PathBuf::from(DEFAULT_SNAPSHOT_DIR) } @@ -970,7 +1047,7 @@ where { struct BoolOrInt; - impl<'de> serde::de::Visitor<'de> for BoolOrInt { + impl serde::de::Visitor<'_> for BoolOrInt { type Value = ScheduleSnapshot; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { @@ -1018,43 +1095,3 @@ where } deserializer.deserialize_any(BoolOrInt) } - -#[cfg(test)] -mod test { - - use super::*; - - #[test] - fn test_valid_opt() { - assert!(Opt::try_parse_from(Some("")).is_ok()); - } - - #[test] - #[ignore] - fn test_meilli_config_file_path_valid() { - temp_env::with_vars( - vec![("MEILI_CONFIG_FILE_PATH", Some("../config.toml"))], // Relative path in meilisearch package - || { - assert!(Opt::try_build().is_ok()); - }, - ); - } - - #[test] - #[ignore] - fn test_meilli_config_file_path_invalid() { - temp_env::with_vars(vec![("MEILI_CONFIG_FILE_PATH", Some("../configgg.toml"))], || { - let possible_error_messages = [ - "unable to open or read the \"../configgg.toml\" configuration file: No such file or directory (os error 2).", - "unable to open or read the \"../configgg.toml\" configuration file: The system cannot find the file specified. 
(os error 2).", // Windows - ]; - let error_message = Opt::try_build().unwrap_err().to_string(); - assert!( - possible_error_messages.contains(&error_message.as_str()), - "Expected onf of {:?}, got {:?}.", - possible_error_messages, - error_message - ); - }); - } -} diff --git a/crates/meilisearch/src/option_test.rs b/crates/meilisearch/src/option_test.rs new file mode 100644 index 000000000..80d21dad3 --- /dev/null +++ b/crates/meilisearch/src/option_test.rs @@ -0,0 +1,37 @@ +use clap::Parser; + +use crate::option::Opt; + +#[test] +fn test_valid_opt() { + assert!(Opt::try_parse_from(Some("")).is_ok()); +} + +#[test] +#[ignore] +fn test_meilli_config_file_path_valid() { + temp_env::with_vars( + vec![("MEILI_CONFIG_FILE_PATH", Some("../config.toml"))], // Relative path in meilisearch package + || { + assert!(Opt::try_build().is_ok()); + }, + ); +} + +#[test] +#[ignore] +fn test_meilli_config_file_path_invalid() { + temp_env::with_vars(vec![("MEILI_CONFIG_FILE_PATH", Some("../configgg.toml"))], || { + let possible_error_messages = [ + "unable to open or read the \"../configgg.toml\" configuration file: No such file or directory (os error 2).", + "unable to open or read the \"../configgg.toml\" configuration file: The system cannot find the file specified. (os error 2).", // Windows + ]; + let error_message = Opt::try_build().unwrap_err().to_string(); + assert!( + possible_error_messages.contains(&error_message.as_str()), + "Expected onf of {:?}, got {:?}.", + possible_error_messages, + error_message + ); + }); +} diff --git a/crates/meilisearch/src/routes/api_key.rs b/crates/meilisearch/src/routes/api_key.rs index 0bd4b9d59..3130006e3 100644 --- a/crates/meilisearch/src/routes/api_key.rs +++ b/crates/meilisearch/src/routes/api_key.rs @@ -13,14 +13,28 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::keys::{CreateApiKey, Key, PatchApiKey}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; +use utoipa::{IntoParams, OpenApi, ToSchema}; use uuid::Uuid; -use super::PAGINATION_DEFAULT_LIMIT; +use super::{PaginationView, PAGINATION_DEFAULT_LIMIT, PAGINATION_DEFAULT_LIMIT_FN}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::Pagination; +#[derive(OpenApi)] +#[openapi( + paths(create_api_key, list_api_keys, get_api_key, patch_api_key, delete_api_key), + tags(( + name = "Keys", + description = "Manage API `keys` for a Meilisearch instance. Each key has a given set of permissions. +You must have the master key or the default admin key to access the keys route. More information about the keys and their rights. +Accessing any route under `/keys` without having set a master key will result in an error.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/keys"), + )), +)] +pub struct ApiKeyApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -35,6 +49,51 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } +/// Create an API Key +/// +/// Create an API Key. 
+#[utoipa::path( + post, + path = "", + tag = "Keys", + security(("Bearer" = ["keys.create", "keys.*", "*"])), + request_body = CreateApiKey, + responses( + (status = 202, description = "Key has been created", body = KeyView, content_type = "application/json", example = json!( + { + "uid": "01b4bc42-eb33-4041-b481-254d00cce834", + "key": "d0552b41536279a0ad88bd595327b96f01176a60c2243e906c52ac02375f9bc4", + "name": "Indexing Products API key", + "description": null, + "actions": [ + "documents.add" + ], + "indexes": [ + "products" + ], + "expiresAt": "2021-11-13T00:00:00Z", + "createdAt": "2021-11-12T10:00:00Z", + "updatedAt": "2021-11-12T10:00:00Z" + } + )), + (status = 401, description = "The route has been hit on an unprotected instance", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Meilisearch is running without a master key. To access this API endpoint, you must have set a master key at launch.", + "code": "missing_master_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_master_key" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn create_api_key( auth_controller: GuardedData, Data>, body: AwebJson, @@ -51,12 +110,15 @@ pub async fn create_api_key( Ok(HttpResponse::Created().json(res)) } -#[derive(Deserr, Debug, Clone, Copy)] +#[derive(Deserr, Debug, Clone, Copy, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct ListApiKeys { #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = usize, default = 0)] pub offset: Param, #[deserr(default = Param(PAGINATION_DEFAULT_LIMIT), error = DeserrQueryParamError)] + #[param(value_type = usize, default = PAGINATION_DEFAULT_LIMIT_FN)] pub limit: Param, } @@ -66,6 +128,58 @@ impl ListApiKeys { } } +/// Get API Keys +/// +/// List all API Keys +#[utoipa::path( + get, + path = "", + tag = "Keys", + security(("Bearer" = ["keys.get", "keys.*", "*"])), + params(ListApiKeys), + responses( + (status = 202, description = "List of keys", body = PaginationView, content_type = "application/json", example = json!( + { + "results": [ + { + "uid": "01b4bc42-eb33-4041-b481-254d00cce834", + "key": "d0552b41536279a0ad88bd595327b96f01176a60c2243e906c52ac02375f9bc4", + "name": "An API Key", + "description": null, + "actions": [ + "documents.add" + ], + "indexes": [ + "movies" + ], + "expiresAt": "2022-11-12T10:00:00Z", + "createdAt": "2021-11-12T10:00:00Z", + "updatedAt": "2021-11-12T10:00:00Z" + } + ], + "limit": 20, + "offset": 0, + "total": 1 + } + )), + (status = 401, description = "The route has been hit on an unprotected instance", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Meilisearch is running without a master key. 
To access this API endpoint, you must have set a master key at launch.", + "code": "missing_master_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_master_key" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn list_api_keys( auth_controller: GuardedData, Data>, list_api_keys: AwebQueryParameter, @@ -84,6 +198,51 @@ pub async fn list_api_keys( Ok(HttpResponse::Ok().json(page_view)) } +/// Get an API Key +/// +/// Get an API key from its `uid` or its `key` field. +#[utoipa::path( + get, + path = "/{uidOrKey}", + tag = "Keys", + security(("Bearer" = ["keys.get", "keys.*", "*"])), + params(("uidOrKey" = String, Path, format = Password, example = "7b198a7f-52a0-4188-8762-9ad93cd608b2", description = "The `uid` or `key` field of an existing API key", nullable = false)), + responses( + (status = 200, description = "The key is returned", body = KeyView, content_type = "application/json", example = json!( + { + "uid": "01b4bc42-eb33-4041-b481-254d00cce834", + "key": "d0552b41536279a0ad88bd595327b96f01176a60c2243e906c52ac02375f9bc4", + "name": "An API Key", + "description": null, + "actions": [ + "documents.add" + ], + "indexes": [ + "movies" + ], + "expiresAt": "2022-11-12T10:00:00Z", + "createdAt": "2021-11-12T10:00:00Z", + "updatedAt": "2021-11-12T10:00:00Z" + } + )), + (status = 401, description = "The route has been hit on an unprotected instance", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Meilisearch is running without a master key. To access this API endpoint, you must have set a master key at launch.", + "code": "missing_master_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_master_key" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_api_key( auth_controller: GuardedData, Data>, path: web::Path, @@ -103,6 +262,53 @@ pub async fn get_api_key( Ok(HttpResponse::Ok().json(res)) } +/// Update a Key +/// +/// Update the name and description of an API key. +/// Updates to keys are partial. This means you should provide only the fields you intend to update, as any fields not present in the payload will remain unchanged. 
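The same three-step pattern repeats for every route in these hunks: decorate the handler with `#[utoipa::path]`, describe its query or body types with `IntoParams`/`ToSchema`, and list the handler in the module's `OpenApi` derive. A stripped-down sketch of that shape, using purely illustrative names (`DemoApi`, `get_demo`) that are not part of this PR:

```rust
use actix_web::HttpResponse;
use utoipa::OpenApi;

// Module-level OpenAPI structure that gets nested into the main one.
#[derive(OpenApi)]
#[openapi(paths(get_demo))]
pub struct DemoApi;

/// Get demo
///
/// A stand-in route showing the decorator shape used by the real handlers.
#[utoipa::path(
    get,
    path = "/demo",
    tag = "Demo",
    responses(
        (status = 200, description = "Everything went fine"),
    ),
)]
async fn get_demo() -> HttpResponse {
    HttpResponse::Ok().finish()
}
```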
+#[utoipa::path( + patch, + path = "/{uidOrKey}", + tag = "Keys", + security(("Bearer" = ["keys.update", "keys.*", "*"])), + params(("uidOrKey" = String, Path, format = Password, example = "7b198a7f-52a0-4188-8762-9ad93cd608b2", description = "The `uid` or `key` field of an existing API key", nullable = false)), + request_body = PatchApiKey, + responses( + (status = 200, description = "The key have been updated", body = KeyView, content_type = "application/json", example = json!( + { + "uid": "01b4bc42-eb33-4041-b481-254d00cce834", + "key": "d0552b41536279a0ad88bd595327b96f01176a60c2243e906c52ac02375f9bc4", + "name": "An API Key", + "description": null, + "actions": [ + "documents.add" + ], + "indexes": [ + "movies" + ], + "expiresAt": "2022-11-12T10:00:00Z", + "createdAt": "2021-11-12T10:00:00Z", + "updatedAt": "2021-11-12T10:00:00Z" + } + )), + (status = 401, description = "The route has been hit on an unprotected instance", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Meilisearch is running without a master key. To access this API endpoint, you must have set a master key at launch.", + "code": "missing_master_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_master_key" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn patch_api_key( auth_controller: GuardedData, Data>, body: AwebJson, @@ -123,6 +329,35 @@ pub async fn patch_api_key( Ok(HttpResponse::Ok().json(res)) } +/// Delete a key +/// +/// Delete the specified API key. +#[utoipa::path( + delete, + path = "/{uidOrKey}", + tag = "Keys", + security(("Bearer" = ["keys.delete", "keys.*", "*"])), + params(("uidOrKey" = String, Path, format = Password, example = "7b198a7f-52a0-4188-8762-9ad93cd608b2", description = "The `uid` or `key` field of an existing API key", nullable = false)), + responses( + (status = NO_CONTENT, description = "The key have been removed"), + (status = 401, description = "The route has been hit on an unprotected instance", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Meilisearch is running without a master key. To access this API endpoint, you must have set a master key at launch.", + "code": "missing_master_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_master_key" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn delete_api_key( auth_controller: GuardedData, Data>, path: web::Path, @@ -144,19 +379,30 @@ pub struct AuthParam { key: String, } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] -struct KeyView { +pub(super) struct KeyView { + /// The name of the API Key if any name: Option, + /// The description of the API Key if any description: Option, + /// The actual API Key you can send to Meilisearch key: String, + /// The `Uuid` specified while creating the key or autogenerated by Meilisearch. uid: Uuid, + /// The actions accessible with this key. actions: Vec, + /// The indexes accessible with this key. indexes: Vec, + /// The expiration date of the key. Once this timestamp is exceeded the key is not deleted but cannot be used anymore. #[serde(serialize_with = "time::serde::rfc3339::option::serialize")] expires_at: Option, + /// The date of creation of this API Key. + #[schema(read_only)] #[serde(serialize_with = "time::serde::rfc3339::serialize")] created_at: OffsetDateTime, + /// The date of the last update made on this key. + #[schema(read_only)] #[serde(serialize_with = "time::serde::rfc3339::serialize")] updated_at: OffsetDateTime, } diff --git a/crates/meilisearch/src/routes/batches.rs b/crates/meilisearch/src/routes/batches.rs index 6faedc021..8ca9f1537 100644 --- a/crates/meilisearch/src/routes/batches.rs +++ b/crates/meilisearch/src/routes/batches.rs @@ -1,24 +1,83 @@ -use actix_web::{ - web::{self, Data}, - HttpResponse, -}; +use actix_web::web::{self, Data}; +use actix_web::HttpResponse; use deserr::actix_web::AwebQueryParameter; use index_scheduler::{IndexScheduler, Query}; -use meilisearch_types::{ - batch_view::BatchView, batches::BatchId, deserr::DeserrQueryParamError, error::ResponseError, - keys::actions, -}; +use meilisearch_types::batch_view::BatchView; +use meilisearch_types::batches::BatchId; +use meilisearch_types::deserr::DeserrQueryParamError; +use meilisearch_types::error::ResponseError; +use meilisearch_types::keys::actions; use serde::Serialize; +use utoipa::{OpenApi, ToSchema}; -use crate::extractors::{authentication::GuardedData, sequential_extractor::SeqHandler}; +use super::tasks::TasksFilterQuery; +use super::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::extractors::sequential_extractor::SeqHandler; -use super::{tasks::TasksFilterQuery, ActionPolicy}; +#[derive(OpenApi)] +#[openapi( + paths(get_batch, get_batches), + tags(( + name = "Batches", + description = "The /batches route gives information about the progress of batches of asynchronous operations.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/batches"), + )), +)] +pub struct BatchesApi; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::get().to(SeqHandler(get_batches)))) .service(web::resource("/{batch_id}").route(web::get().to(SeqHandler(get_batch)))); } +/// Get one batch +/// +/// Get a single batch. 
+#[utoipa::path( + get, + path = "/{batchUid}", + tag = "Batches", + security(("Bearer" = ["tasks.get", "tasks.*", "*"])), + params( + ("batchUid" = String, Path, example = "8685", description = "The unique batch id", nullable = false), + ), + responses( + (status = OK, description = "Return the batch", body = BatchView, content_type = "application/json", example = json!( + { + "uid": 1, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "progress": null, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "INDEX_NAME": 1 + } + }, + "duration": "PT0.364788S", + "startedAt": "2024-12-10T15:48:49.672141Z", + "finishedAt": "2024-12-10T15:48:50.036929Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] async fn get_batch( index_scheduler: GuardedData, Data>, batch_uid: web::Path, @@ -36,17 +95,17 @@ async fn get_batch( let query = index_scheduler::Query { batch_uids: Some(vec![batch_uid]), ..Query::default() }; let filters = index_scheduler.filters(); - let (batches, _) = index_scheduler.get_batches_from_authorized_indexes(query, filters)?; + let (batches, _) = index_scheduler.get_batches_from_authorized_indexes(&query, filters)?; if let Some(batch) = batches.first() { - let task_view = BatchView::from_batch(batch); - Ok(HttpResponse::Ok().json(task_view)) + let batch_view = BatchView::from_batch(batch); + Ok(HttpResponse::Ok().json(batch_view)) } else { Err(index_scheduler::Error::BatchNotFound(batch_uid).into()) } } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, ToSchema)] pub struct AllBatches { results: Vec, total: u64, @@ -55,6 +114,63 @@ pub struct AllBatches { next: Option, } +/// Get batches +/// +/// List all batches, regardless of index. The batch objects are contained in the results array. +/// Batches are always returned in descending order of uid. This means that by default, the most recently created batch objects appear first. +/// Batch results are paginated and can be filtered with query parameters. +#[utoipa::path( + get, + path = "", + tag = "Batches", + security(("Bearer" = ["tasks.get", "tasks.*", "*"])), + params(TasksFilterQuery), + responses( + (status = OK, description = "Return the batches", body = AllBatches, content_type = "application/json", example = json!( + { + "results": [ + { + "uid": 2, + "details": { + "stopWords": [ + "of", + "the" + ] + }, + "progress": null, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "INDEX_NAME": 1 + } + }, + "duration": "PT0.110083S", + "startedAt": "2024-12-10T15:49:04.995321Z", + "finishedAt": "2024-12-10T15:49:05.105404Z" + } + ], + "total": 3, + "limit": 1, + "from": 2, + "next": 1 + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] async fn get_batches( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, @@ -66,7 +182,7 @@ async fn get_batches( let query = params.into_query(); let filters = index_scheduler.filters(); - let (tasks, total) = index_scheduler.get_batches_from_authorized_indexes(query, filters)?; + let (tasks, total) = index_scheduler.get_batches_from_authorized_indexes(&query, filters)?; let mut results: Vec<_> = tasks.iter().map(BatchView::from_batch).collect(); // If we were able to fetch the number +1 tasks we asked diff --git a/crates/meilisearch/src/routes/dump.rs b/crates/meilisearch/src/routes/dump.rs index c78dc4dad..bc16409a2 100644 --- a/crates/meilisearch/src/routes/dump.rs +++ b/crates/meilisearch/src/routes/dump.rs @@ -5,6 +5,7 @@ use meilisearch_auth::AuthController; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; use tracing::debug; +use utoipa::OpenApi; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; @@ -13,12 +14,60 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; +#[derive(OpenApi)] +#[openapi( + paths(create_dump), + tags(( + name = "Dumps", + description = "The `dumps` route allows the creation of database dumps. +Dumps are `.dump` files that can be used to launch Meilisearch. Dumps are compatible between Meilisearch versions. +Creating a dump is also referred to as exporting it, whereas launching Meilisearch with a dump is referred to as importing it. +During a [dump export](https://www.meilisearch.com/docs/reference/api/dump#create-a-dump), all indexes of the current instance are +exported—together with their documents and settings—and saved as a single `.dump` file. During a dump import, +all indexes contained in the indicated `.dump` file are imported along with their associated documents and settings. +Any existing index with the same uid as an index in the dump file will be overwritten. +Dump imports are [performed at launch](https://www.meilisearch.com/docs/learn/advanced/dumps#importing-a-dump) using an option.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/dump"), + )), +)] +pub struct DumpApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); } crate::empty_analytics!(DumpAnalytics, "Dump Created"); +/// Create a dump +/// +/// Triggers a dump creation process. Once the process is complete, a dump is created in the +/// [dump directory](https://www.meilisearch.com/docs/learn/self_hosted/configure_meilisearch_at_launch#dump-directory). +/// If the dump directory does not exist yet, it will be created. 
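As documented above, `POST /dumps` only enqueues a task; the dump itself is written asynchronously to the dump directory. A client-side sketch of triggering it, assuming a local instance, a placeholder master key, and `reqwest` built with its `json` feature:

```rust
// Hypothetical client call; the response body is the summarized task shown
// in the example above ("status": "enqueued", "type": "DumpCreation").
async fn trigger_dump() -> Result<serde_json::Value, reqwest::Error> {
    reqwest::Client::new()
        .post("http://localhost:7700/dumps")
        .bearer_auth("MASTER_KEY")
        .send()
        .await?
        .json()
        .await
}
```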
+#[utoipa::path( + post, + path = "", + tag = "Dumps", + security(("Bearer" = ["dumps.create", "dumps.*", "*"])), + responses( + (status = 202, description = "Dump is being created", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 0, + "indexUid": null, + "status": "enqueued", + "type": "DumpCreation", + "enqueuedAt": "2021-01-01T09:39:00.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index 5d93adc02..eb8e7ac04 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -8,12 +8,26 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; +use utoipa::{OpenApi, ToSchema}; use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +#[derive(OpenApi)] +#[openapi( + paths(get_features, patch_features), + tags(( + name = "Experimental features", + description = "The `/experimental-features` route allows you to activate or deactivate some of Meilisearch's experimental features. + +This route is **synchronous**. This means that no task object will be returned, and any activated or deactivated features will be made available or unavailable immediately.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/experimental_features"), + )), +)] +pub struct ExperimentalFeaturesApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -22,6 +36,34 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } +/// Get all experimental features +/// +/// Get a list of all experimental features that can be activated via the /experimental-features route and whether or not they are currently activated. +#[utoipa::path( + get, + path = "", + tag = "Experimental features", + security(("Bearer" = ["experimental_features.get", "experimental_features.*", "*"])), + responses( + (status = OK, description = "Experimental features are returned", body = RuntimeTogglableFeatures, content_type = "application/json", example = json!(RuntimeTogglableFeatures { + metrics: Some(true), + logs_route: Some(false), + edit_documents_by_function: Some(false), + contains_filter: Some(false), + network: Some(false), + get_task_documents_route: Some(false), + composite_embedders: Some(false), + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] async fn get_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>, @@ -31,15 +73,16 @@ async fn get_features( let features = index_scheduler.features(); let features = features.runtime_features(); + let features: RuntimeTogglableFeatures = features.into(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema, Serialize)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct RuntimeTogglableFeatures { - #[deserr(default)] - pub vector_store: Option, #[deserr(default)] pub metrics: Option, #[deserr(default)] @@ -48,15 +91,47 @@ pub struct RuntimeTogglableFeatures { pub edit_documents_by_function: Option, #[deserr(default)] pub contains_filter: Option, + #[deserr(default)] + pub network: Option, + #[deserr(default)] + pub get_task_documents_route: Option, + #[deserr(default)] + pub composite_embedders: Option, +} + +impl From for RuntimeTogglableFeatures { + fn from(value: meilisearch_types::features::RuntimeTogglableFeatures) -> Self { + let meilisearch_types::features::RuntimeTogglableFeatures { + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + network, + get_task_documents_route, + composite_embedders, + } = value; + + Self { + metrics: Some(metrics), + logs_route: Some(logs_route), + edit_documents_by_function: Some(edit_documents_by_function), + contains_filter: Some(contains_filter), + network: Some(network), + get_task_documents_route: Some(get_task_documents_route), + composite_embedders: Some(composite_embedders), + } + } } #[derive(Serialize)] pub struct PatchExperimentalFeatureAnalytics { - vector_store: bool, metrics: bool, logs_route: bool, edit_documents_by_function: bool, contains_filter: bool, + network: bool, + get_task_documents_route: bool, + composite_embedders: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -66,11 +141,13 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - vector_store: new.vector_store, metrics: new.metrics, logs_route: new.logs_route, edit_documents_by_function: new.edit_documents_by_function, contains_filter: new.contains_filter, + network: new.network, + get_task_documents_route: new.get_task_documents_route, + composite_embedders: new.composite_embedders, }) } @@ -79,6 +156,34 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { } } +/// Configure experimental features +/// +/// Activate or deactivate experimental features. 
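With `vector_store` removed and `network`, `get_task_documents_route`, and `composite_embedders` added, a PATCH on `/experimental-features` now toggles the new flags; fields omitted from the payload keep their previous value, since the handler falls back to `old_features`. A hedged client-side sketch, assuming a local instance, a placeholder key, and `reqwest` with its `json` feature:

```rust
// Hypothetical call against the route documented above; only the provided
// fields are changed, the rest keep their current values.
async fn enable_network_feature() -> Result<serde_json::Value, reqwest::Error> {
    reqwest::Client::new()
        .patch("http://localhost:7700/experimental-features")
        .bearer_auth("MASTER_KEY")
        .json(&serde_json::json!({ "network": true }))
        .send()
        .await?
        .json()
        .await
}
```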
+#[utoipa::path( + patch, + path = "", + tag = "Experimental features", + security(("Bearer" = ["experimental_features.update", "experimental_features.*", "*"])), + responses( + (status = OK, description = "Experimental features are returned", body = RuntimeTogglableFeatures, content_type = "application/json", example = json!(RuntimeTogglableFeatures { + metrics: Some(true), + logs_route: Some(false), + edit_documents_by_function: Some(false), + contains_filter: Some(false), + network: Some(false), + get_task_documents_route: Some(false), + composite_embedders: Some(false), + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] async fn patch_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>, @@ -93,7 +198,6 @@ async fn patch_features( let old_features = features.runtime_features(); let new_features = meilisearch_types::features::RuntimeTogglableFeatures { - vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store), metrics: new_features.0.metrics.unwrap_or(old_features.metrics), logs_route: new_features.0.logs_route.unwrap_or(old_features.logs_route), edit_documents_by_function: new_features @@ -101,30 +205,44 @@ async fn patch_features( .edit_documents_by_function .unwrap_or(old_features.edit_documents_by_function), contains_filter: new_features.0.contains_filter.unwrap_or(old_features.contains_filter), + network: new_features.0.network.unwrap_or(old_features.network), + get_task_documents_route: new_features + .0 + .get_task_documents_route + .unwrap_or(old_features.get_task_documents_route), + composite_embedders: new_features + .0 + .composite_embedders + .unwrap_or(old_features.composite_embedders), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because - // the it renames to camelCase, which we don't want for analytics. + // it renames to camelCase, which we don't want for analytics. // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future. 
let meilisearch_types::features::RuntimeTogglableFeatures { - vector_store, metrics, logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, + composite_embedders, } = new_features; analytics.publish( PatchExperimentalFeatureAnalytics { - vector_store, metrics, logs_route, edit_documents_by_function, contains_filter, + network, + get_task_documents_route, + composite_embedders, }, &req, ); index_scheduler.put_runtime_features(new_features)?; + let new_features: RuntimeTogglableFeatures = new_features.into(); debug!(returns = ?new_features, "Patch features"); Ok(HttpResponse::Ok().json(new_features)) } diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 47f73ef42..50eec46fe 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use std::io::ErrorKind; +use std::io::{ErrorKind, Seek as _}; use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; @@ -20,17 +20,20 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; +use meilisearch_types::serde_cs::vec::CS; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tracing::debug; +use utoipa::{IntoParams, OpenApi, ToSchema}; use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::error::MeilisearchHttpError; @@ -42,7 +45,7 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::{parse_filter, RetrieveVectors}; +use crate::search::{parse_filter, ExternalDocumentId, RetrieveVectors}; use crate::{aggregate_methods, Opt}; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -71,6 +74,19 @@ pub struct DocumentParam { document_id: String, } +#[derive(OpenApi)] +#[openapi( + paths(get_document, get_documents, delete_document, replace_documents, update_documents, clear_all_documents, delete_documents_batch, delete_documents_by_filter, edit_documents_by_function, documents_by_query_post), + tags( + ( + name = "Documents", + description = "Documents are objects composed of fields that can store any type of data. Each field contains an attribute and its associated value. 
Documents are stored inside [indexes](https://www.meilisearch.com/docs/learn/getting_started/indexes).", + external_docs(url = "https://www.meilisearch.com/docs/learn/getting_started/documents"), + ), + ), +)] +pub struct DocumentsApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -93,12 +109,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, IntoParams, ToSchema)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] +#[schema(rename_all = "camelCase")] pub struct GetDocument { #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Option>)] + #[schema(value_type = Option>)] fields: OptionStarOrList, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Option)] + #[schema(value_type = Option)] retrieve_vectors: Param, } @@ -117,6 +139,9 @@ pub struct DocumentsFetchAggregator { #[serde(rename = "vector.retrieve_vectors")] retrieve_vectors: bool, + // maximum size of `ids` array. 0 if always empty or `null` + max_document_ids: usize, + // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -129,7 +154,7 @@ pub struct DocumentsFetchAggregator { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize }, } impl DocumentsFetchAggregator { @@ -141,12 +166,18 @@ impl DocumentsFetchAggregator { } }; + let ids = match query { + DocumentFetchKind::Normal { ids, .. } => *ids, + DocumentFetchKind::PerDocumentId { .. } => 0, + }; + Self { per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), max_limit: limit, max_offset: offset, retrieve_vectors, + max_document_ids: ids, marker: PhantomData, } @@ -165,6 +196,7 @@ impl Aggregate for DocumentsFetchAggregator { retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), max_offset: self.max_offset.max(new.max_offset), + max_document_ids: self.max_document_ids.max(new.max_document_ids), marker: PhantomData, }) } @@ -174,6 +206,55 @@ impl Aggregate for DocumentsFetchAggregator { } } +/// Get one document +/// +/// Get one document from its primary key. 
+#[utoipa::path( + get, + path = "{indexUid}/documents/{documentId}", + tag = "Documents", + security(("Bearer" = ["documents.get", "documents.*", "*"])), + params( + ("indexUid" = String, Path, example = "movies", description = "Index Unique Identifier", nullable = false), + ("documentId" = String, Path, example = "85087", description = "The document identifier", nullable = false), + GetDocument, + ), + responses( + (status = 200, description = "The document is returned", body = serde_json::Value, content_type = "application/json", example = json!( + { + "id": 25684, + "title": "American Ninja 5", + "poster": "https://image.tmdb.org/t/p/w1280/iuAQVI4mvjI83wnirpD8GVNRVuY.jpg", + "overview": "When a scientists daughter is kidnapped, American Ninja, attempts to find her, but this time he teams up with a youngster he has trained in the ways of the ninja.", + "release_date": 725846400 + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 404, description = "Document not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Document `a` not found.", + "code": "document_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#document_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_document( index_scheduler: GuardedData, Data>, document_param: web::Path, @@ -188,8 +269,7 @@ pub async fn get_document( let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); - let features = index_scheduler.features(); - let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; + let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0); analytics.publish( DocumentsFetchAggregator:: { @@ -198,6 +278,7 @@ pub async fn get_document( per_filter: false, max_limit: 0, max_offset: 0, + max_document_ids: 0, marker: PhantomData, }, &req, @@ -237,6 +318,38 @@ impl Aggregate for DocumentsDeletionAggregator { } } +/// Delete a document +/// +/// Delete a single document by id. 
+#[utoipa::path( + delete, + path = "{indexUid}/documents/{documentId}", + tag = "Documents", + security(("Bearer" = ["documents.delete", "documents.*", "*"])), + params( + ("indexUid" = String, Path, example = "movies", description = "Index Unique Identifier", nullable = false), + ("documentId" = String, Path, example = "853", description = "Document Identifier", nullable = false), + ), + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, @@ -271,36 +384,104 @@ pub async fn delete_document( Ok(HttpResponse::Accepted().json(task)) } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct BrowseQueryGet { + #[param(default, value_type = Option)] #[deserr(default, error = DeserrQueryParamError)] offset: Param, + #[param(default, value_type = Option)] #[deserr(default = Param(PAGINATION_DEFAULT_LIMIT), error = DeserrQueryParamError)] limit: Param, + #[param(default, value_type = Option>)] #[deserr(default, error = DeserrQueryParamError)] fields: OptionStarOrList, + #[param(default, value_type = Option)] #[deserr(default, error = DeserrQueryParamError)] retrieve_vectors: Param, + #[param(default, value_type = Option>)] + #[deserr(default, error = DeserrQueryParamError)] + ids: Option>, + #[param(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrQueryParamError)] filter: Option, } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] pub struct BrowseQuery { + #[schema(default, example = 150)] #[deserr(default, error = DeserrJsonError)] offset: usize, + #[schema(default = 20, example = 1)] #[deserr(default = PAGINATION_DEFAULT_LIMIT, error = DeserrJsonError)] limit: usize, + #[schema(example = json!(["title, description"]))] #[deserr(default, error = DeserrJsonError)] fields: Option>, + #[schema(default, example = true)] #[deserr(default, error = DeserrJsonError)] retrieve_vectors: bool, + #[schema(value_type = Option>, example = json!(["cody", "finn", "brandy", "gambit"]))] + #[deserr(default, error = DeserrJsonError)] + ids: Option>, + #[schema(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrJsonError)] filter: Option, } +/// Get documents with POST +/// +/// Get a set of documents. 
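The new `ids` field added to `BrowseQuery` above lets a fetch request select specific documents alongside the existing `fields`, `filter`, and pagination options. A hedged example payload for the POST fetch route documented just after this (local instance, placeholder key, and `reqwest` with its `json` feature assumed):

```rust
// Hypothetical request using the new `ids` selector together with the
// existing `fields` and `limit` options.
async fn fetch_by_ids() -> Result<serde_json::Value, reqwest::Error> {
    reqwest::Client::new()
        .post("http://localhost:7700/indexes/movies/documents/fetch")
        .bearer_auth("MASTER_KEY")
        .json(&serde_json::json!({
            "ids": ["25684", "45881"],
            "fields": ["title"],
            "limit": 2
        }))
        .send()
        .await?
        .json()
        .await
}
```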
+#[utoipa::path( + post, + path = "{indexUid}/documents/fetch", + tag = "Documents", + security(("Bearer" = ["documents.delete", "documents.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = BrowseQuery, + responses( + (status = 200, description = "Task successfully enqueued", body = PaginationView, content_type = "application/json", example = json!( + { + "results":[ + { + "title":"The Travels of Ibn Battuta", + "genres":[ + "Travel", + "Adventure" + ], + "language":"English", + "rating":4.5 + }, + { + "title":"Pride and Prejudice", + "genres":[ + "Classics", + "Fiction", + "Romance", + "Literature" + ], + "language":"English", + "rating":4 + }, + ], + "offset":0, + "limit":2, + "total":5 + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn documents_by_query_post( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -317,6 +498,7 @@ pub async fn documents_by_query_post( retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, + max_document_ids: body.ids.as_ref().map(Vec::len).unwrap_or_default(), per_document_id: false, marker: PhantomData, }, @@ -326,6 +508,60 @@ pub async fn documents_by_query_post( documents_by_query(&index_scheduler, index_uid, body) } +/// Get documents +/// +/// Get documents by batches. +#[utoipa::path( + get, + path = "{indexUid}/documents", + tag = "Documents", + security(("Bearer" = ["documents.get", "documents.*", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + BrowseQueryGet + ), + responses( + (status = 200, description = "The documents are returned", body = PaginationView, content_type = "application/json", example = json!( + { + "results": [ + { + "id": 25684, + "title": "American Ninja 5", + "poster": "https://image.tmdb.org/t/p/w1280/iuAQVI4mvjI83wnirpD8GVNRVuY.jpg", + "overview": "When a scientists daughter is kidnapped, American Ninja, attempts to find her, but this time he teams up with a youngster he has trained in the ways of the ninja.", + "release_date": 725846400 + }, + { + "id": 45881, + "title": "The Bridge of San Luis Rey", + "poster": "https://image.tmdb.org/t/p/w500/4X7quIcdkc24Cveg5XdpfRqxtYA.jpg", + "overview": "The Bridge of San Luis Rey is American author Thornton Wilder's second novel, first published in 1927 to worldwide acclaim. It tells the story of several interrelated people who die in the collapse of an Inca rope-fiber suspension bridge in Peru, and the events that lead up to their being on the bridge.[ A friar who has witnessed the tragic accident then goes about inquiring into the lives of the victims, seeking some sort of cosmic answer to the question of why each had to die. 
The novel won the Pulitzer Prize in 1928.", + "release_date": 1072915200 + } + ], + "limit": 20, + "offset": 0, + "total": 2 + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -335,7 +571,8 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } = + params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -345,12 +582,15 @@ pub async fn get_documents( None => None, }; + let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect()); + let query = BrowseQuery { offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), retrieve_vectors: retrieve_vectors.0, filter, + ids, }; analytics.publish( @@ -359,6 +599,7 @@ pub async fn get_documents( retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, + max_document_ids: query.ids.as_ref().map(Vec::len).unwrap_or_default(), per_document_id: false, marker: PhantomData, }, @@ -374,16 +615,30 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query; - let features = index_scheduler.features(); - let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?; + let retrieve_vectors = RetrieveVectors::new(retrieve_vectors); + + let ids = if let Some(ids) = ids { + let mut parsed_ids = Vec::with_capacity(ids.len()); + for (index, id) in ids.into_iter().enumerate() { + let id = id.try_into().map_err(|error| { + let msg = format!("In `.ids[{index}]`: {error}"); + ResponseError::from_msg(msg, Code::InvalidDocumentIds) + })?; + parsed_ids.push(id) + } + Some(parsed_ids) + } else { + None + }; let index = index_scheduler.index(&index_uid)?; let (total, documents) = retrieve_documents( &index, offset, limit, + ids, filter, fields, retrieve_vectors, @@ -396,11 +651,17 @@ fn documents_by_query( Ok(HttpResponse::Ok().json(ret)) } -#[derive(Deserialize, Debug, Deserr)] +#[derive(Deserialize, Debug, Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(parameter_in = Query, rename_all = "camelCase")] pub struct UpdateDocumentsQuery { + /// The primary key of the documents. primaryKey is optional. If you want to set the primary key of your index through this route, + /// it only has to be done the first time you add documents to the index. After which it will be ignored if given. 
+ #[param(example = "id")] #[deserr(default, error = DeserrQueryParamError)] pub primary_key: Option, + /// Customize the csv delimiter when importing CSV documents. + #[param(value_type = char, default = ",", example = ";")] #[deserr(default, try_from(char) = from_char_csv_delimiter -> DeserrQueryParamError, error = DeserrQueryParamError)] pub csv_delimiter: Option, } @@ -451,6 +712,52 @@ impl Aggregate for DocumentsAggregator { } } +/// Add or replace documents +/// +/// Add a list of documents or replace them if they already exist. +/// +/// If you send an already existing document (same id), the whole existing document will be overwritten by the new document. Fields previously in the document not present in the new document are removed. +/// +/// For a partial update of the document, see the Add or update documents route. +/// > info +/// > If the provided index does not exist, it will be created. +/// > info +/// > Use the reserved `_geo` object to add geo coordinates to a document. `_geo` is an object made of `lat` and `lng` fields. +/// > +/// > When the vectorStore feature is enabled you can use the reserved `_vectors` field in your documents. +/// > It can accept an array of floats, multiple arrays of floats in an outer array or an object. +/// > This object accepts keys corresponding to the different embedders defined in your index settings. +#[utoipa::path( + post, + path = "{indexUid}/documents", + tag = "Documents", + security(("Bearer" = ["documents.add", "documents.*", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + // Here we can use the post version of the browse query since it contains the exact same parameter + UpdateDocumentsQuery, + ), + request_body = serde_json::Value, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn replace_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -508,6 +815,50 @@ pub async fn replace_documents( Ok(HttpResponse::Accepted().json(task)) } +/// Add or update documents +/// +/// Add a list of documents or update them if they already exist. +/// If you send an already existing document (same id), the old document will be only partially updated according to the fields of the new document. Thus, any fields not present in the new document are kept and remain unchanged. +/// To completely overwrite a document, see the Add or replace documents route. +/// > info +/// > If the provided index does not exist, it will be created. +/// > info +/// > Use the reserved `_geo` object to add geo coordinates to a document. `_geo` is an object made of `lat` and `lng` fields. +/// > +/// > When the vectorStore feature is enabled you can use the reserved `_vectors` field in your documents. +/// > It can accept an array of floats, multiple arrays of floats in an outer array or an object. 
+/// > This object accepts keys corresponding to the different embedders defined your index settings. +#[utoipa::path( + put, + path = "{indexUid}/documents", + tag = "Documents", + security(("Bearer" = ["documents.add", "documents.*", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + // Here we can use the post version of the browse query since it contains the exact same parameter + UpdateDocumentsQuery, + ), + request_body = serde_json::Value, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentAdditionOrUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn update_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -572,7 +923,7 @@ async fn document_addition( index_uid: IndexUid, primary_key: Option, csv_delimiter: Option, - mut body: Payload, + body: Payload, method: IndexDocumentsMethod, task_id: Option, dry_run: bool, @@ -608,55 +959,61 @@ async fn document_addition( } }; - let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?; + let (uuid, mut update_file) = index_scheduler.queue.create_update_file(dry_run)?; + let documents_count = match format { + PayloadType::Ndjson => { + let (path, file) = update_file.into_parts(); + let file = match file { + Some(file) => { + let (file, path) = file.into_parts(); + let mut file = copy_body_to_file(file, body, format).await?; + file.rewind().map_err(|e| { + index_scheduler::Error::FileStore(file_store::Error::IoError(e)) + })?; + Some(tempfile::NamedTempFile::from_parts(file, path)) + } + None => None, + }; - let temp_file = match tempfile() { - Ok(file) => file, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + let documents_count = tokio::task::spawn_blocking(move || { + let documents_count = file.as_ref().map_or(Ok(0), |ntf| { + read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat) + })?; + + let update_file = file_store::File::from_parts(path, file); + update_file.persist()?; + + Ok(documents_count) + }) + .await?; + + Ok(documents_count) + } + PayloadType::Json | PayloadType::Csv { delimiter: _ } => { + let temp_file = match tempfile() { + Ok(file) => file, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + }; + + let read_file = copy_body_to_file(temp_file, body, format).await?; + tokio::task::spawn_blocking(move || { + let documents_count = match format { + PayloadType::Json => read_json(&read_file, &mut update_file)?, + PayloadType::Csv { delimiter } => { + read_csv(&read_file, &mut update_file, delimiter)? + } + PayloadType::Ndjson => { + unreachable!("We already wrote the user content into the update file") + } + }; + // we NEED to persist the file here because we moved the `udpate_file` in another task. 
+ update_file.persist()?; + Ok(documents_count) + }) + .await + } }; - let async_file = File::from_std(temp_file); - let mut buffer = BufWriter::new(async_file); - - let mut buffer_write_size: usize = 0; - while let Some(result) = body.next().await { - let byte = result?; - - if byte.is_empty() && buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - match buffer.write_all(&byte).await { - Ok(()) => buffer_write_size += 1, - Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), - } - } - - if let Err(e) = buffer.flush().await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - if buffer_write_size == 0 { - return Err(MeilisearchHttpError::MissingPayload(format)); - } - - if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { - return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); - } - - let read_file = buffer.into_inner().into_std().await; - let documents_count = tokio::task::spawn_blocking(move || { - let documents_count = match format { - PayloadType::Json => read_json(&read_file, &mut update_file)?, - PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?, - PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?, - }; - // we NEED to persist the file here because we moved the `udpate_file` in another task. - update_file.persist()?; - Ok(documents_count) - }) - .await; - let documents_count = match documents_count { Ok(Ok(documents_count)) => documents_count, // in this case the file has not possibly be persisted. @@ -664,7 +1021,7 @@ async fn document_addition( Err(e) => { // Here the file MAY have been persisted or not. // We don't know thus we ignore the file not found error. - match index_scheduler.delete_update_file(uuid) { + match index_scheduler.queue.delete_update_file(uuid) { Ok(()) => (), Err(index_scheduler::Error::FileStore(file_store::Error::IoError(e))) if e.kind() == ErrorKind::NotFound => {} @@ -695,7 +1052,7 @@ async fn document_addition( { Ok(task) => task, Err(e) => { - index_scheduler.delete_update_file(uuid)?; + index_scheduler.queue.delete_update_file(uuid)?; return Err(e.into()); } }; @@ -703,6 +1060,71 @@ async fn document_addition( Ok(task.into()) } +async fn copy_body_to_file( + output: std::fs::File, + mut body: Payload, + format: PayloadType, +) -> Result { + let async_file = File::from_std(output); + let mut buffer = BufWriter::new(async_file); + let mut buffer_write_size: usize = 0; + while let Some(result) = body.next().await { + let byte = result?; + + if byte.is_empty() && buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + + match buffer.write_all(&byte).await { + Ok(()) => buffer_write_size += 1, + Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))), + } + } + if let Err(e) = buffer.flush().await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + if buffer_write_size == 0 { + return Err(MeilisearchHttpError::MissingPayload(format)); + } + if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await { + return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))); + } + let read_file = buffer.into_inner().into_std().await; + Ok(read_file) +} + +/// Delete documents by batch +/// +/// Delete a set of documents based on an array of document ids. 
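// Illustrative sketch (not taken from the patch): the request body for the delete-batch route
// documented just above is a plain JSON array of document ids. The ids used here are made-up
// example values; only the "array of ids" shape comes from the route description.
#[cfg(test)]
mod delete_batch_payload_sketch {
    #[test]
    fn example_delete_batch_body_shape() {
        let body = serde_json::json!([23488, "jackie-chan-1", 188040]);
        assert!(body.is_array());
    }
}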
+#[utoipa::path( + post, + path = "{indexUid}/delete-batch", + tag = "Documents", + security(("Bearer" = ["documents.delete", "documents.*", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + ), + request_body = Vec, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentDeletion", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn delete_documents_batch( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -742,13 +1164,44 @@ pub async fn delete_documents_batch( Ok(HttpResponse::Accepted().json(task)) } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] pub struct DocumentDeletionByFilter { #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_document_filter)] filter: Value, } +/// Delete documents by filter +/// +/// Delete a set of documents based on a filter. +#[utoipa::path( + post, + path = "{indexUid}/documents/delete", + tag = "Documents", + security(("Bearer" = ["documents.delete", "documents.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = DocumentDeletionByFilter, + responses( + (status = ACCEPTED, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentDeletion", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn delete_documents_by_filter( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -789,13 +1242,16 @@ pub async fn delete_documents_by_filter( Ok(HttpResponse::Accepted().json(task)) } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct DocumentEditionByFunction { + /// A string containing a filter expression. #[deserr(default, error = DeserrJsonError)] pub filter: Option, + /// An object with data Meilisearch should make available for the editing function. #[deserr(default, error = DeserrJsonError)] pub context: Option, + /// A string containing a RHAI function. 
#[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_document_edition_function)] pub function: String, } @@ -828,6 +1284,38 @@ impl Aggregate for EditDocumentsByFunctionAggregator { } } +/// Edit documents by function. +/// +/// Use a [RHAI function](https://rhai.rs/book/engine/hello-world.html) to edit one or more documents directly in Meilisearch. +#[utoipa::path( + post, + path = "{indexUid}/documents/edit", + tag = "Documents", + security(("Bearer" = ["documents.*", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + ), + request_body = DocumentEditionByFunction, + responses( + (status = 202, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentDeletion", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn edit_documents_by_function( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -897,6 +1385,35 @@ pub async fn edit_documents_by_function( Ok(HttpResponse::Accepted().json(task)) } +/// Delete all documents +/// +/// Delete all documents in the specified index. +#[utoipa::path( + delete, + path = "{indexUid}/documents", + tag = "Documents", + security(("Bearer" = ["documents.delete", "documents.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "documentDeletion", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn clear_all_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -941,7 +1458,6 @@ fn some_documents<'a, 't: 'a>( ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; match retrieve_vectors { - RetrieveVectors::Ignore => {} RetrieveVectors::Hide => { document.remove("_vectors"); } @@ -975,10 +1491,12 @@ fn some_documents<'a, 't: 'a>( })) } +#[allow(clippy::too_many_arguments)] fn retrieve_documents>( index: &Index, offset: usize, limit: usize, + ids: Option>, filter: Option, attributes_to_retrieve: Option>, retrieve_vectors: RetrieveVectors, @@ -992,16 +1510,28 @@ fn retrieve_documents>( None }; - let candidates = if let Some(filter) = filter { - filter.evaluate(&rtxn, index).map_err(|err| match err { + let mut candidates = if let Some(ids) = ids { + let external_document_ids = index.external_documents_ids(); + let mut candidates = RoaringBitmap::new(); + for id in ids.iter() { + let Some(docid) = external_document_ids.get(&rtxn, id)? else { + continue; + }; + candidates.insert(docid); + } + candidates + } else { + index.documents_ids(&rtxn)? + }; + + if let Some(filter) = filter { + candidates &= filter.evaluate(&rtxn, index).map_err(|err| match err { milli::Error::UserError(milli::UserError::InvalidFilter(_)) => { ResponseError::from_msg(err.to_string(), Code::InvalidDocumentFilter) } e => e.into(), })? - } else { - index.documents_ids(&rtxn)? - }; + } let (it, number_of_documents) = { let number_of_documents = candidates.len(); diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 99a4a4f28..41f306746 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -11,6 +11,7 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; use serde_json::Value; use tracing::debug; +use utoipa::{OpenApi, ToSchema}; use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; @@ -18,20 +19,33 @@ use crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy, - RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + RankingScoreThreshold, SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; +#[derive(OpenApi)] +#[openapi( + paths(search), + tags( + ( + name = "Facet Search", + description = "The `/facet-search` route allows you to search for facet values. Facet search supports prefix search and typo tolerance. The returned hits are sorted lexicographically in ascending order. 
You can configure how facets are sorted using the sortFacetValuesBy property of the faceting index settings.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/facet_search"), + ), + ), +)] +pub struct FacetSearchApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(search))); } -/// # Important -/// -/// Intentionally don't use `deny_unknown_fields` to ignore search parameters sent by user -#[derive(Debug, Clone, Default, PartialEq, deserr::Deserr)] +// # Important +// +// Intentionally don't use `deny_unknown_fields` to ignore search parameters sent by user +#[derive(Debug, Clone, Default, PartialEq, deserr::Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase)] pub struct FacetSearchQuery { #[deserr(default, error = DeserrJsonError)] @@ -42,7 +56,7 @@ pub struct FacetSearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] pub filter: Option, @@ -54,6 +68,8 @@ pub struct FacetSearchQuery { pub ranking_score_threshold: Option, #[deserr(default, error = DeserrJsonError, default)] pub locales: Option>, + #[deserr(default, error = DeserrJsonError, default)] + pub exhaustive_facet_count: Option, } #[derive(Default)] @@ -84,6 +100,7 @@ impl FacetSearchAggregator { hybrid, ranking_score_threshold, locales, + exhaustive_facet_count, } = query; Self { @@ -96,7 +113,8 @@ impl FacetSearchAggregator { || attributes_to_search_on.is_some() || hybrid.is_some() || ranking_score_threshold.is_some() - || locales.is_some(), + || locales.is_some() + || exhaustive_facet_count.is_some(), ..Default::default() } } @@ -158,6 +176,60 @@ impl Aggregate for FacetSearchAggregator { } } +/// Perform a facet search +/// +/// Search for a facet value within a given facet. +#[utoipa::path( + post, + path = "{indexUid}/facet-search", + tag = "Facet Search", + security(("Bearer" = ["search", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = FacetSearchQuery, + responses( + (status = 200, description = "The documents are returned", body = SearchResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 2770, + "title": "American Pie 2", + "poster": "https://image.tmdb.org/t/p/w1280/q4LNgUnRfltxzp3gf1MAGiK5LhV.jpg", + "overview": "The whole gang are back and as close as ever. They decide to get even closer by spending the summer together at a beach house. They decide to hold the biggest…", + "release_date": 997405200 + }, + { + "id": 190859, + "title": "American Sniper", + "poster": "https://image.tmdb.org/t/p/w1280/svPHnYE7N5NAGO49dBmRhq0vDQ3.jpg", + "overview": "U.S. Navy SEAL Chris Kyle takes his sole mission—protect his comrades—to heart and becomes one of the most lethal snipers in American history. 
His pinpoint accuracy not only saves countless lives but also makes him a prime…", + "release_date": 1418256000 + } + ], + "offset": 0, + "limit": 2, + "estimatedTotalHits": 976, + "processingTimeMs": 35, + "query": "american " + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn search( index_scheduler: GuardedData, Data>, search_queue: Data, @@ -184,8 +256,7 @@ pub async fn search( } let index = index_scheduler.index(&index_uid)?; - let features = index_scheduler.features(); - let search_kind = search_kind(&search_query, &index_scheduler, &index, features)?; + let search_kind = search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index)?; let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { perform_facet_search( @@ -226,13 +297,24 @@ impl From for SearchQuery { hybrid, ranking_score_threshold, locales, + exhaustive_facet_count, } = value; + // If exhaustive_facet_count is true, we need to set the page to 0 + // because the facet search is not exhaustive by default. + let page = if exhaustive_facet_count.is_some_and(|exhaustive| exhaustive) { + // setting the page to 0 will force the search to be exhaustive when computing the number of hits, + // but it will skip the bucket sort saving time. 
+ Some(0) + } else { + None + }; + SearchQuery { q, offset: DEFAULT_SEARCH_OFFSET(), limit: DEFAULT_SEARCH_LIMIT(), - page: None, + page, hits_per_page: None, attributes_to_retrieve: None, retrieve_vectors: false, diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 7d073ec5f..5aebf5cac 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -16,8 +16,11 @@ use meilisearch_types::tasks::KindWithContent; use serde::Serialize; use time::OffsetDateTime; use tracing::debug; +use utoipa::{IntoParams, OpenApi, ToSchema}; -use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; +use super::{ + get_task_id, Pagination, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, +}; use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -29,11 +32,32 @@ pub mod documents; pub mod facet_search; pub mod search; mod search_analytics; +#[cfg(test)] +mod search_test; pub mod settings; mod settings_analytics; pub mod similar; mod similar_analytics; +#[derive(OpenApi)] +#[openapi( + nest( + (path = "/", api = documents::DocumentsApi), + (path = "/", api = facet_search::FacetSearchApi), + (path = "/", api = similar::SimilarApi), + (path = "/", api = settings::SettingsApi), + ), + paths(list_indexes, create_index, get_index, update_index, delete_index, get_index_stats), + tags( + ( + name = "Indexes", + description = "An index is an entity that gathers a set of [documents](https://www.meilisearch.com/docs/learn/getting_started/documents) with its own [settings](https://www.meilisearch.com/docs/reference/api/settings). Learn more about indexes.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/indexes"), + ), + ), +)] +pub struct IndexesApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -57,14 +81,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } -#[derive(Debug, Serialize, Clone)] +#[derive(Debug, Serialize, Clone, ToSchema)] #[serde(rename_all = "camelCase")] pub struct IndexView { + /// Unique identifier for the index pub uid: String, + /// An `RFC 3339` format for date/time/duration. #[serde(with = "time::serde::rfc3339")] pub created_at: OffsetDateTime, + /// An `RFC 3339` format for date/time/duration. #[serde(with = "time::serde::rfc3339")] pub updated_at: OffsetDateTime, + /// Custom primaryKey for documents pub primary_key: Option, } @@ -82,46 +110,94 @@ impl IndexView { } } -#[derive(Deserr, Debug, Clone, Copy)] +#[derive(Deserr, Debug, Clone, Copy, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct ListIndexes { + /// The number of indexes to skip before starting to retrieve anything + #[param(value_type = Option, default, example = 100)] #[deserr(default, error = DeserrQueryParamError)] pub offset: Param, + /// The number of indexes to retrieve + #[param(value_type = Option, default = 20, example = 1)] #[deserr(default = Param(PAGINATION_DEFAULT_LIMIT), error = DeserrQueryParamError)] pub limit: Param, } + impl ListIndexes { fn as_pagination(self) -> Pagination { Pagination { offset: self.offset.0, limit: self.limit.0 } } } +/// List indexes +/// +/// List all indexes. 
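// Illustrative sketch (not taken from the patch): the offset/limit pagination that the
// `ListIndexes` query parameters above describe. The route itself delegates to
// `Pagination::format_with(total, indexes)`; this stand-alone helper only reproduces the
// slicing rule with plain types so the behaviour is easy to see.
#[cfg(test)]
mod list_indexes_pagination_sketch {
    fn paginate<'a>(all: &[&'a str], offset: usize, limit: usize) -> (usize, Vec<&'a str>) {
        let total = all.len();
        let page = all.iter().skip(offset).take(limit).copied().collect();
        (total, page)
    }

    #[test]
    fn pagination_keeps_total_and_slices_results() {
        let (total, page) = paginate(&["movies", "books", "songs"], 1, 1);
        assert_eq!(total, 3);
        assert_eq!(page, vec!["books"]);
    }
}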
+#[utoipa::path( + get, + path = "", + tag = "Indexes", + security(("Bearer" = ["indexes.get", "indexes.*", "*"])), + params(ListIndexes), + responses( + (status = 200, description = "Indexes are returned", body = PaginationView, content_type = "application/json", example = json!( + { + "results": [ + { + "uid": "movies", + "primaryKey": "movie_id", + "createdAt": "2019-11-20T09:40:33.711324Z", + "updatedAt": "2019-11-20T09:40:33.711324Z" + } + ], + "limit": 1, + "offset": 0, + "total": 1 + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn list_indexes( index_scheduler: GuardedData, Data>, paginate: AwebQueryParameter, ) -> Result { debug!(parameters = ?paginate, "List indexes"); let filters = index_scheduler.filters(); - let indexes: Vec> = - index_scheduler.try_for_each_index(|uid, index| -> Result, _> { - if !filters.is_index_authorized(uid) { - return Ok(None); - } - Ok(Some(IndexView::new(uid.to_string(), index)?)) - })?; - // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened. - let indexes: Vec = indexes.into_iter().flatten().collect(); - let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter()); + let (total, indexes) = + index_scheduler.get_paginated_indexes_stats(filters, *paginate.offset, *paginate.limit)?; + let indexes = indexes + .into_iter() + .map(|(name, stats)| IndexView { + uid: name, + created_at: stats.created_at, + updated_at: stats.updated_at, + primary_key: stats.primary_key, + }) + .collect::>(); + let ret = paginate.as_pagination().format_with(total, indexes); debug!(returns = ?ret, "List indexes"); Ok(HttpResponse::Ok().json(ret)) } -#[derive(Deserr, Debug)] +#[derive(Deserr, Debug, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] pub struct IndexCreateRequest { + /// The name of the index + #[schema(example = "movies")] #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_index_uid)] uid: IndexUid, + /// The primary key of the index + #[schema(example = "id")] #[deserr(default, error = DeserrJsonError)] primary_key: Option, } @@ -145,6 +221,35 @@ impl Aggregate for IndexCreatedAggregate { } } +/// Create index +/// +/// Create an index. +#[utoipa::path( + post, + path = "", + tag = "Indexes", + security(("Bearer" = ["indexes.create", "indexes.*", "*"])), + request_body = IndexCreateRequest, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": "movies", + "status": "enqueued", + "type": "indexCreation", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, @@ -194,13 +299,42 @@ fn deny_immutable_fields_index( } } -#[derive(Deserr, Debug)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields = deny_immutable_fields_index)] -pub struct UpdateIndexRequest { - #[deserr(default, error = DeserrJsonError)] - primary_key: Option, -} - +/// Get index +/// +/// Get information about an index. +#[utoipa::path( + get, + path = "/{indexUid}", + tag = "Indexes", + security(("Bearer" = ["indexes.get", "indexes.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = 200, description = "The index is returned", body = IndexView, content_type = "application/json", example = json!( + { + "uid": "movies", + "primaryKey": "movie_id", + "createdAt": "2019-11-20T09:40:33.711324Z", + "updatedAt": "2019-11-20T09:40:33.711324Z" + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -233,6 +367,47 @@ impl Aggregate for IndexUpdatedAggregate { serde_json::to_value(*self).unwrap_or_default() } } + +#[derive(Deserr, Debug, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields = deny_immutable_fields_index)] +#[schema(rename_all = "camelCase")] +pub struct UpdateIndexRequest { + /// The new primary key of the index + #[deserr(default, error = DeserrJsonError)] + primary_key: Option, +} + +/// Update index +/// +/// Update the `primaryKey` of an index. +/// Return an error if the index doesn't exists yet or if it contains documents. +#[utoipa::path( + patch, + path = "/{indexUid}", + tag = "Indexes", + security(("Bearer" = ["indexes.update", "indexes.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = UpdateIndexRequest, + responses( + (status = ACCEPTED, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 0, + "indexUid": "movies", + "status": "enqueued", + "type": "indexUpdate", + "enqueuedAt": "2021-01-01T09:39:00.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn update_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -265,6 +440,35 @@ pub async fn update_index( Ok(HttpResponse::Accepted().json(task)) } +/// Delete index +/// +/// Delete an index. +#[utoipa::path( + delete, + path = "/{indexUid}", + tag = "Indexes", + security(("Bearer" = ["indexes.delete", "indexes.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = ACCEPTED, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 0, + "indexUid": "movies", + "status": "enqueued", + "type": "indexDeletion", + "enqueuedAt": "2021-01-01T09:39:00.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn delete_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -285,27 +489,87 @@ pub async fn delete_index( } /// Stats of an `Index`, as known to the `stats` route. -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, ToSchema)] #[serde(rename_all = "camelCase")] pub struct IndexStats { /// Number of documents in the index pub number_of_documents: u64, - /// Whether the index is currently performing indexation, according to the scheduler. + /// Size of the documents database, in bytes. + pub raw_document_db_size: u64, + /// Average size of a document in the documents database. + pub avg_document_size: u64, + /// Whether or not the index is currently ingesting document pub is_indexing: bool, + /// Number of embeddings in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embeddings: Option, + /// Number of embedded documents in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embedded_documents: Option, /// Association of every field name with the number of times it occurs in the documents. + #[schema(value_type = HashMap)] pub field_distribution: FieldDistribution, } impl From for IndexStats { fn from(stats: index_scheduler::IndexStats) -> Self { IndexStats { - number_of_documents: stats.inner_stats.number_of_documents, + number_of_documents: stats + .inner_stats + .number_of_documents + .unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, + number_of_embeddings: stats.inner_stats.number_of_embeddings, + number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, field_distribution: stats.inner_stats.field_distribution, } } } +/// Get stats of index +/// +/// Get the stats of an index. 
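// Illustrative sketch (not taken from the patch): how the new `rawDocumentDbSize` and
// `avgDocumentSize` stats exposed above relate to each other, i.e. an average entry size
// derived from a total byte size and an entry count. Returning 0 for an empty database is
// an assumption made for illustration, not necessarily the exact rule used by the real
// `average_value_size` accessor.
#[allow(dead_code)]
fn average_document_size_sketch(total_value_size: u64, number_of_entries: u64) -> u64 {
    if number_of_entries == 0 {
        0
    } else {
        total_value_size / number_of_entries
    }
}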
+#[utoipa::path( + get, + path = "/{indexUid}/stats", + tag = "Stats", + security(("Bearer" = ["stats.get", "stats.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( + { + "numberOfDocuments": 10, + "rawDocumentDbSize": 10, + "avgDocumentSize": 10, + "numberOfEmbeddings": 10, + "numberOfEmbeddedDocuments": 10, + "isIndexing": true, + "fieldDistribution": { + "genre": 10, + "author": 9 + } + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_index_stats( index_scheduler: GuardedData, Data>, index_uid: web::Path, diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 2f5cb4a36..333ae1944 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -1,7 +1,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::{AwebJson, AwebQueryParameter}; -use index_scheduler::{IndexScheduler, RoFeatures}; +use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; @@ -12,6 +12,7 @@ use meilisearch_types::milli; use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; +use utoipa::{IntoParams, OpenApi}; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; @@ -22,12 +23,28 @@ use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST}; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, + RetrieveVectors, SearchKind, SearchQuery, SearchResult, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; +#[derive(OpenApi)] +#[openapi( + paths(search_with_url_query, search_with_post), + tags( + ( + name = "Search", + description = "Meilisearch exposes two routes to perform searches: + +- A POST route: this is the preferred route when using API authentication, as it allows [preflight request](https://developer.mozilla.org/en-US/docs/Glossary/Preflight_request) caching and better performance. 
+- A GET route: the usage of this route is discouraged, unless you have good reason to do otherwise (specific caching abilities for example)", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/search"), + ), + ), +)] +pub struct SearchApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -36,30 +53,41 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } -#[derive(Debug, deserr::Deserr)] +#[derive(Debug, deserr::Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct SearchQueryGet { #[deserr(default, error = DeserrQueryParamError)] q: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] vector: Option>, #[deserr(default = Param(DEFAULT_SEARCH_OFFSET()), error = DeserrQueryParamError)] + #[param(value_type = usize, default = DEFAULT_SEARCH_OFFSET)] offset: Param, #[deserr(default = Param(DEFAULT_SEARCH_LIMIT()), error = DeserrQueryParamError)] + #[param(value_type = usize, default = DEFAULT_SEARCH_LIMIT)] limit: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Option)] page: Option>, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Option)] hits_per_page: Option>, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] attributes_to_retrieve: Option>, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool, default)] retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] + #[param(value_type = usize, default = DEFAULT_CROP_LENGTH)] crop_length: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] attributes_to_highlight: Option>, #[deserr(default, error = DeserrQueryParamError)] filter: Option, @@ -68,30 +96,41 @@ pub struct SearchQueryGet { #[deserr(default, error = DeserrQueryParamError)] distinct: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool)] show_matches_position: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool)] show_ranking_score: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool)] show_ranking_score_details: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] facets: Option>, - #[deserr( default = DEFAULT_HIGHLIGHT_PRE_TAG(), error = DeserrQueryParamError)] + #[deserr(default = DEFAULT_HIGHLIGHT_PRE_TAG(), error = DeserrQueryParamError)] + #[param(default = DEFAULT_HIGHLIGHT_PRE_TAG)] highlight_pre_tag: String, - #[deserr( default = DEFAULT_HIGHLIGHT_POST_TAG(), error = DeserrQueryParamError)] + #[deserr(default = DEFAULT_HIGHLIGHT_POST_TAG(), error = DeserrQueryParamError)] + #[param(default = DEFAULT_HIGHLIGHT_POST_TAG)] highlight_post_tag: String, #[deserr(default = DEFAULT_CROP_MARKER(), error = DeserrQueryParamError)] + #[param(default = DEFAULT_CROP_MARKER)] crop_marker: String, #[deserr(default, error = DeserrQueryParamError)] matching_strategy: MatchingStrategy, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] pub attributes_to_search_on: Option>, - #[deserr(default, error = DeserrQueryParamError)] 
+ #[deserr(default, error = DeserrQueryParamError)] pub hybrid_embedder: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = f32)] pub hybrid_semantic_ratio: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = f32)] pub ranking_score_threshold: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec, explode = false)] pub locales: Option>, } @@ -146,7 +185,7 @@ impl TryFrom for SearchQuery { (None, Some(_)) => { return Err(ResponseError::from_msg( "`hybridEmbedder` is mandatory when `hybridSemanticRatio` is present".into(), - meilisearch_types::error::Code::InvalidHybridQuery, + meilisearch_types::error::Code::InvalidSearchHybridQuery, )); } (Some(embedder), None) => { @@ -198,7 +237,7 @@ impl TryFrom for SearchQuery { // TODO: TAMO: split on :asc, and :desc, instead of doing some weird things /// Transform the sort query parameter into something that matches the post expected format. -fn fix_sort_query_parameters(sort_query: &str) -> Vec { +pub fn fix_sort_query_parameters(sort_query: &str) -> Vec { let mut sort_parameters = Vec::new(); let mut merge = false; for current_sort in sort_query.trim_matches('"').split(',').map(|s| s.trim()) { @@ -220,6 +259,62 @@ fn fix_sort_query_parameters(sort_query: &str) -> Vec { sort_parameters } +/// Search an index with GET +/// +/// Search for documents matching a specific query in the given index. +#[utoipa::path( + get, + path = "/{indexUid}/search", + tags = ["Indexes", "Search"], + security(("Bearer" = ["search", "*"])), + params( + ("indexUid" = String, Path, example = "movies", description = "Index Unique Identifier", nullable = false), + SearchQueryGet + ), + responses( + (status = 200, description = "The documents are returned", body = SearchResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 2770, + "title": "American Pie 2", + "poster": "https://image.tmdb.org/t/p/w1280/q4LNgUnRfltxzp3gf1MAGiK5LhV.jpg", + "overview": "The whole gang are back and as close as ever. They decide to get even closer by spending the summer together at a beach house. They decide to hold the biggest…", + "release_date": 997405200 + }, + { + "id": 190859, + "title": "American Sniper", + "poster": "https://image.tmdb.org/t/p/w1280/svPHnYE7N5NAGO49dBmRhq0vDQ3.jpg", + "overview": "U.S. Navy SEAL Chris Kyle takes his sole mission—protect his comrades—to heart and becomes one of the most lethal snipers in American history. His pinpoint accuracy not only saves countless lives but also makes him a prime…", + "release_date": 1418256000 + } + ], + "offset": 0, + "limit": 2, + "estimatedTotalHits": 976, + "processingTimeMs": 35, + "query": "american " + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn search_with_url_query( index_scheduler: GuardedData, Data>, search_queue: web::Data, @@ -241,13 +336,20 @@ pub async fn search_with_url_query( let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; - let features = index_scheduler.features(); - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index)?; + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors); let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vector, + index_scheduler.features(), + ) }) .await; permit.drop().await; @@ -263,6 +365,62 @@ pub async fn search_with_url_query( Ok(HttpResponse::Ok().json(search_result)) } +/// Search with POST +/// +/// Search for documents matching a specific query in the given index. +#[utoipa::path( + post, + path = "/{indexUid}/search", + tags = ["Indexes", "Search"], + security(("Bearer" = ["search", "*"])), + params( + ("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false), + ), + request_body = SearchQuery, + responses( + (status = 200, description = "The documents are returned", body = SearchResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 2770, + "title": "American Pie 2", + "poster": "https://image.tmdb.org/t/p/w1280/q4LNgUnRfltxzp3gf1MAGiK5LhV.jpg", + "overview": "The whole gang are back and as close as ever. They decide to get even closer by spending the summer together at a beach house. They decide to hold the biggest…", + "release_date": 997405200 + }, + { + "id": 190859, + "title": "American Sniper", + "poster": "https://image.tmdb.org/t/p/w1280/svPHnYE7N5NAGO49dBmRhq0vDQ3.jpg", + "overview": "U.S. Navy SEAL Chris Kyle takes his sole mission—protect his comrades—to heart and becomes one of the most lethal snipers in American history. His pinpoint accuracy not only saves countless lives but also makes him a prime…", + "release_date": 1418256000 + } + ], + "offset": 0, + "limit": 2, + "estimatedTotalHits": 976, + "processingTimeMs": 35, + "query": "american " + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn search_with_post( index_scheduler: GuardedData, Data>, search_queue: web::Data, @@ -285,14 +443,20 @@ pub async fn search_with_post( let index = index_scheduler.index(&index_uid)?; - let features = index_scheduler.features(); - - let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + let search_kind = + search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index)?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors); let permit = search_queue.try_get_search_permit().await?; let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vectors, index_scheduler.features()) + perform_search( + index_uid.to_string(), + &index, + query, + search_kind, + retrieve_vectors, + index_scheduler.features(), + ) }) .await; permit.drop().await; @@ -314,16 +478,9 @@ pub async fn search_with_post( pub fn search_kind( query: &SearchQuery, index_scheduler: &IndexScheduler, + index_uid: String, index: &milli::Index, - features: RoFeatures, ) -> Result { - if query.vector.is_some() { - features.check_vector("Passing `vector` as a parameter")?; - } - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a parameter")?; - } - // handle with care, the order of cases matters, the semantics is subtle match (query.q.as_deref(), &query.hybrid, query.vector.as_deref()) { // empty query, no vector => placeholder search @@ -332,7 +489,7 @@ pub fn search_kind( (None, _, None) => Ok(SearchKind::KeywordOnly), // hybrid.semantic_ratio == 1.0 => vector (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index, embedder, v.map(|v| v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } // hybrid.semantic_ratio == 0.0 => keyword (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { @@ -340,13 +497,14 @@ pub fn search_kind( } // no query, hybrid, vector => semantic (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index, embedder, Some(v.len())) + SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) } // query, no hybrid, no vector => keyword (Some(_), None, None) => Ok(SearchKind::KeywordOnly), // query, hybrid, maybe vector => hybrid (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, + index_uid, index, embedder, **semantic_ratio, @@ -356,30 +514,3 @@ pub fn search_kind( (_, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), } } - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_fix_sort_query_parameters() { - let sort = fix_sort_query_parameters("_geoPoint(12, 13):asc"); - assert_eq!(sort, vec!["_geoPoint(12,13):asc".to_string()]); - let sort = fix_sort_query_parameters("doggo:asc,_geoPoint(12.45,13.56):desc"); - assert_eq!(sort, vec!["doggo:asc".to_string(), "_geoPoint(12.45,13.56):desc".to_string(),]); - let sort = fix_sort_query_parameters( - "doggo:asc , _geoPoint(12.45, 13.56, 2590352):desc , catto:desc", - ); - assert_eq!( - sort, - vec![ - "doggo:asc".to_string(), - 
"_geoPoint(12.45,13.56,2590352):desc".to_string(), - "catto:desc".to_string(), - ] - ); - let sort = fix_sort_query_parameters("doggo:asc , _geoPoint(1, 2), catto:desc"); - // This is ugly but eh, I don't want to write a full parser just for this unused route - assert_eq!(sort, vec!["doggo:asc".to_string(), "_geoPoint(1,2),catto:desc".to_string(),]); - } -} diff --git a/crates/meilisearch/src/routes/indexes/search_test.rs b/crates/meilisearch/src/routes/indexes/search_test.rs new file mode 100644 index 000000000..b56fb4e8b --- /dev/null +++ b/crates/meilisearch/src/routes/indexes/search_test.rs @@ -0,0 +1,22 @@ +use crate::routes::indexes::search::fix_sort_query_parameters; + +#[test] +fn test_fix_sort_query_parameters() { + let sort = fix_sort_query_parameters("_geoPoint(12, 13):asc"); + assert_eq!(sort, vec!["_geoPoint(12,13):asc".to_string()]); + let sort = fix_sort_query_parameters("doggo:asc,_geoPoint(12.45,13.56):desc"); + assert_eq!(sort, vec!["doggo:asc".to_string(), "_geoPoint(12.45,13.56):desc".to_string(),]); + let sort = + fix_sort_query_parameters("doggo:asc , _geoPoint(12.45, 13.56, 2590352):desc , catto:desc"); + assert_eq!( + sort, + vec![ + "doggo:asc".to_string(), + "_geoPoint(12.45,13.56,2590352):desc".to_string(), + "catto:desc".to_string(), + ] + ); + let sort = fix_sort_query_parameters("doggo:asc , _geoPoint(1, 2), catto:desc"); + // This is ugly but eh, I don't want to write a full parser just for this unused route + assert_eq!(sort, vec!["doggo:asc".to_string(), "_geoPoint(1,2),catto:desc".to_string(),]); +} diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a9d8d3053..92b018c8c 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -5,10 +5,12 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::milli::update::Setting; -use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; +use meilisearch_types::settings::{ + settings, SecretPolicy, SettingEmbeddingSettings, Settings, Unchecked, +}; use meilisearch_types::tasks::KindWithContent; use tracing::debug; +use utoipa::OpenApi; use super::settings_analytics::*; use crate::analytics::Analytics; @@ -17,9 +19,54 @@ use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; +/// This macro generates the routes for the settings. +/// +/// It takes a list of settings and generates a module for each setting. +/// Each module contains the `get`, `update` and `delete` routes for the setting. +/// +/// It also generates a `configure` function that configures the routes for the settings. +macro_rules! 
make_setting_routes { + ($({route: $route:literal, update_verb: $update_verb:ident, value_type: $type:ty, err_type: $err_ty:ty, attr: $attr:ident, camelcase_attr: $camelcase_attr:literal, analytics: $analytics:ident},)*) => { + const _: fn(&meilisearch_types::settings::Settings) = |s| { + // This pattern match will fail at compile time if any field in Settings is not listed in the macro + match *s { + meilisearch_types::settings::Settings { $($attr: _,)* _kind: _ } => {} + } + }; + $( + make_setting_route!($route, $update_verb, $type, $err_ty, $attr, $camelcase_attr, $analytics); + )* + + #[derive(OpenApi)] + #[openapi( + paths(update_all, get_all, delete_all, $( $attr::get, $attr::update, $attr::delete,)*), + tags( + ( + name = "Settings", + description = "Use the /settings route to customize search settings for a given index. You can either modify all index settings at once using the update settings endpoint, or use a child route to configure a single setting.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/settings"), + ), + ), + )] + pub struct SettingsApi; + + pub fn configure(cfg: &mut web::ServiceConfig) { + use crate::extractors::sequential_extractor::SeqHandler; + cfg.service( + web::resource("") + .route(web::patch().to(SeqHandler(update_all))) + .route(web::get().to(SeqHandler(get_all))) + .route(web::delete().to(SeqHandler(delete_all)))) + $(.service($attr::resources()))*; + } + + pub const ALL_SETTINGS_NAMES: &[&str] = &[$(stringify!($attr)),*]; + }; +} + #[macro_export] macro_rules! make_setting_route { - ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { + ($route:literal, $update_verb:ident, $type:ty, $err_type:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { pub mod $attr { use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse, Resource}; @@ -36,7 +83,39 @@ macro_rules! make_setting_route { use $crate::extractors::sequential_extractor::SeqHandler; use $crate::Opt; use $crate::routes::{is_dry_run, get_task_id, SummarizedTaskView}; + #[allow(unused_imports)] + use super::*; + #[utoipa::path( + delete, + path = concat!("{indexUid}/settings", $route), + tag = "Settings", + security(("Bearer" = ["settings.update", "settings.*", "*"])), + operation_id = concat!("delete", $camelcase_attr), + summary = concat!("Reset ", $camelcase_attr), + description = concat!("Reset an index's ", $camelcase_attr, " to its default value"), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = $type, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": "movies", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) + )] pub async fn delete( index_scheduler: GuardedData< ActionPolicy<{ actions::SETTINGS_UPDATE }>, @@ -70,13 +149,44 @@ macro_rules! 
make_setting_route { Ok(HttpResponse::Accepted().json(task)) } + + #[utoipa::path( + $update_verb, + path = concat!("{indexUid}/settings", $route), + tag = "Settings", + security(("Bearer" = ["settings.update", "settings.*", "*"])), + operation_id = concat!(stringify!($update_verb), $camelcase_attr), + summary = concat!("Update ", $camelcase_attr), + description = concat!("Update an index's user defined ", $camelcase_attr), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = $type, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": "movies", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) + )] pub async fn update( index_scheduler: GuardedData< ActionPolicy<{ actions::SETTINGS_UPDATE }>, Data, >, index_uid: actix_web::web::Path, - body: deserr::actix_web::AwebJson, $err_ty>, + body: deserr::actix_web::AwebJson, $err_type>, req: HttpRequest, opt: web::Data, analytics: web::Data, @@ -125,6 +235,30 @@ macro_rules! make_setting_route { Ok(HttpResponse::Accepted().json(task)) } + + #[utoipa::path( + get, + path = concat!("{indexUid}/settings", $route), + tag = "Settings", + summary = concat!("Get ", $camelcase_attr), + description = concat!("Get an user defined ", $camelcase_attr), + security(("Bearer" = ["settings.get", "settings.*", "*"])), + operation_id = concat!("get", $camelcase_attr), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = 200, description = concat!($camelcase_attr, " is returned"), body = $type, content_type = "application/json", example = json!( + <$type>::default() + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) + )] pub async fn get( index_scheduler: GuardedData< ActionPolicy<{ actions::SETTINGS_GET }>, @@ -153,257 +287,262 @@ macro_rules! 
make_setting_route { }; } -make_setting_route!( - "/filterable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, - >, - filterable_attributes, - "filterableAttributes", - FilterableAttributesAnalytics +make_setting_routes!( + { + route: "/filterable-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFilterableAttributes, + >, + attr: filterable_attributes, + camelcase_attr: "filterableAttributes", + analytics: FilterableAttributesAnalytics + }, + { + route: "/sortable-attributes", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, + >, + attr: sortable_attributes, + camelcase_attr: "sortableAttributes", + analytics: SortableAttributesAnalytics + }, + { + route: "/displayed-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, + >, + attr: displayed_attributes, + camelcase_attr: "displayedAttributes", + analytics: DisplayedAttributesAnalytics + }, + { + route: "/typo-tolerance", + update_verb: patch, + value_type: meilisearch_types::settings::TypoSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, + >, + attr: typo_tolerance, + camelcase_attr: "typoTolerance", + analytics: TypoToleranceAnalytics + }, + { + route: "/searchable-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, + >, + attr: searchable_attributes, + camelcase_attr: "searchableAttributes", + analytics: SearchableAttributesAnalytics + }, + { + route: "/stop-words", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, + >, + attr: stop_words, + camelcase_attr: "stopWords", + analytics: StopWordsAnalytics + }, + { + route: "/non-separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, + >, + attr: non_separator_tokens, + camelcase_attr: "nonSeparatorTokens", + analytics: NonSeparatorTokensAnalytics + }, + { + route: "/separator-tokens", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, + >, + attr: separator_tokens, + camelcase_attr: "separatorTokens", + analytics: SeparatorTokensAnalytics + }, + { + route: "/dictionary", + update_verb: put, + value_type: std::collections::BTreeSet, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, + >, + attr: dictionary, + camelcase_attr: "dictionary", + analytics: DictionaryAnalytics + }, + { + route: "/synonyms", + update_verb: put, + value_type: std::collections::BTreeMap>, + err_type: meilisearch_types::deserr::DeserrJsonError< + 
meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, + >, + attr: synonyms, + camelcase_attr: "synonyms", + analytics: SynonymsAnalytics + }, + { + route: "/distinct-attribute", + update_verb: put, + value_type: String, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, + >, + attr: distinct_attribute, + camelcase_attr: "distinctAttribute", + analytics: DistinctAttributeAnalytics + }, + { + route: "/proximity-precision", + update_verb: put, + value_type: meilisearch_types::settings::ProximityPrecisionView, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, + >, + attr: proximity_precision, + camelcase_attr: "proximityPrecision", + analytics: ProximityPrecisionAnalytics + }, + { + route: "/localized-attributes", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, + >, + attr: localized_attributes, + camelcase_attr: "localizedAttributes", + analytics: LocalesAnalytics + }, + { + route: "/ranking-rules", + update_verb: put, + value_type: Vec, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, + >, + attr: ranking_rules, + camelcase_attr: "rankingRules", + analytics: RankingRulesAnalytics + }, + { + route: "/faceting", + update_verb: patch, + value_type: meilisearch_types::settings::FacetingSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, + >, + attr: faceting, + camelcase_attr: "faceting", + analytics: FacetingAnalytics + }, + { + route: "/pagination", + update_verb: patch, + value_type: meilisearch_types::settings::PaginationSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPagination, + >, + attr: pagination, + camelcase_attr: "pagination", + analytics: PaginationAnalytics + }, + { + route: "/embedders", + update_verb: patch, + value_type: std::collections::BTreeMap, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, + >, + attr: embedders, + camelcase_attr: "embedders", + analytics: EmbeddersAnalytics + }, + { + route: "/search-cutoff-ms", + update_verb: put, + value_type: u64, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, + >, + attr: search_cutoff_ms, + camelcase_attr: "searchCutoffMs", + analytics: SearchCutoffMsAnalytics + }, + { + route: "/facet-search", + update_verb: put, + value_type: bool, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsFacetSearch, + >, + attr: facet_search, + camelcase_attr: "facetSearch", + analytics: FacetSearchAnalytics + }, + { + route: "/prefix-search", + update_verb: put, + value_type: meilisearch_types::settings::PrefixSearchSettings, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsPrefixSearch, + >, + attr: prefix_search, + camelcase_attr: "prefixSearch", + analytics: PrefixSearchAnalytics + }, ); -make_setting_route!( - "/sortable-attributes", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - 
meilisearch_types::error::deserr_codes::InvalidSettingsSortableAttributes, - >, - sortable_attributes, - "sortableAttributes", - SortableAttributesAnalytics -); - -make_setting_route!( - "/displayed-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDisplayedAttributes, - >, - displayed_attributes, - "displayedAttributes", - DisplayedAttributesAnalytics -); - -make_setting_route!( - "/typo-tolerance", +#[utoipa::path( patch, - meilisearch_types::settings::TypoSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsTypoTolerance, - >, - typo_tolerance, - "typoTolerance", - TypoToleranceAnalytics -); - -make_setting_route!( - "/searchable-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchableAttributes, - >, - searchable_attributes, - "searchableAttributes", - SearchableAttributesAnalytics -); - -make_setting_route!( - "/stop-words", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsStopWords, - >, - stop_words, - "stopWords", - StopWordsAnalytics -); - -make_setting_route!( - "/non-separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens, - >, - non_separator_tokens, - "nonSeparatorTokens", - NonSeparatorTokensAnalytics -); - -make_setting_route!( - "/separator-tokens", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens, - >, - separator_tokens, - "separatorTokens", - SeparatorTokensAnalytics -); - -make_setting_route!( - "/dictionary", - put, - std::collections::BTreeSet, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDictionary, - >, - dictionary, - "dictionary", - DictionaryAnalytics -); - -make_setting_route!( - "/synonyms", - put, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSynonyms, - >, - synonyms, - "synonyms", - SynonymsAnalytics -); - -make_setting_route!( - "/distinct-attribute", - put, - String, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsDistinctAttribute, - >, - distinct_attribute, - "distinctAttribute", - DistinctAttributeAnalytics -); - -make_setting_route!( - "/proximity-precision", - put, - meilisearch_types::settings::ProximityPrecisionView, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision, - >, - proximity_precision, - "proximityPrecision", - ProximityPrecisionAnalytics -); - -make_setting_route!( - "/localized-attributes", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, - >, - localized_attributes, - "localizedAttributes", - LocalesAnalytics -); - -make_setting_route!( - "/ranking-rules", - put, - Vec, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsRankingRules, - >, - ranking_rules, - "rankingRules", - RankingRulesAnalytics -); - -make_setting_route!( - "/faceting", - patch, - meilisearch_types::settings::FacetingSettings, - 
meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsFaceting, - >, - faceting, - "faceting", - FacetingAnalytics -); - -make_setting_route!( - "/pagination", - patch, - meilisearch_types::settings::PaginationSettings, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsPagination, - >, - pagination, - "pagination", - PaginationAnalytics -); - -make_setting_route!( - "/embedders", - patch, - std::collections::BTreeMap>, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders, - >, - embedders, - "embedders", - EmbeddersAnalytics -); - -make_setting_route!( - "/search-cutoff-ms", - put, - u64, - meilisearch_types::deserr::DeserrJsonError< - meilisearch_types::error::deserr_codes::InvalidSettingsSearchCutoffMs, - >, - search_cutoff_ms, - "searchCutoffMs", - SearchCutoffMsAnalytics -); - -macro_rules! generate_configure { - ($($mod:ident),*) => { - pub fn configure(cfg: &mut web::ServiceConfig) { - use crate::extractors::sequential_extractor::SeqHandler; - cfg.service( - web::resource("") - .route(web::patch().to(SeqHandler(update_all))) - .route(web::get().to(SeqHandler(get_all))) - .route(web::delete().to(SeqHandler(delete_all)))) - $(.service($mod::resources()))*; - } - }; -} - -generate_configure!( - filterable_attributes, - sortable_attributes, - displayed_attributes, - localized_attributes, - searchable_attributes, - distinct_attribute, - proximity_precision, - stop_words, - separator_tokens, - non_separator_tokens, - dictionary, - synonyms, - ranking_rules, - typo_tolerance, - pagination, - faceting, - embedders, - search_cutoff_ms -); - + path = "{indexUid}/settings", + tag = "Settings", + security(("Bearer" = ["settings.update", "settings.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = Settings, + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": "movies", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +/// Update settings +/// +/// Update the settings of an index. +/// Passing null to an index setting will reset it to its default value. +/// Updates in the settings route are partial. This means that any parameters not provided in the body will be left unchanged. +/// If the provided index does not exist, it will be created. 
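The `const _: fn(&Settings)` item that `make_setting_routes!` expands to is a compile-time exhaustiveness guard. A minimal, self-contained sketch of the same trick, using a stand-in `Settings` struct rather than the real `meilisearch_types::settings::Settings`:

```rust
// Stand-in for meilisearch_types::settings::Settings, for illustration only.
#[allow(dead_code)]
struct Settings {
    filterable_attributes: Option<Vec<String>>,
    sortable_attributes: Option<Vec<String>>,
}

// Because the pattern names every field, adding a field to `Settings` without
// also listing it here fails to compile. The macro uses the same device to keep
// the per-setting routes in sync with the settings struct.
const _: fn(&Settings) = |s| match *s {
    Settings { filterable_attributes: _, sortable_attributes: _ } => {}
};

fn main() {
    let _settings = Settings { filterable_attributes: None, sortable_attributes: None };
}
```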
pub async fn update_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -456,6 +595,8 @@ pub async fn update_all( non_separator_tokens: NonSeparatorTokensAnalytics::new( new_settings.non_separator_tokens.as_ref().set(), ), + facet_search: FacetSearchAnalytics::new(new_settings.facet_search.as_ref().set()), + prefix_search: PrefixSearchAnalytics::new(new_settings.prefix_search.as_ref().set()), }, &req, ); @@ -479,6 +620,29 @@ pub async fn update_all( Ok(HttpResponse::Accepted().json(task)) } +#[utoipa::path( + get, + path = "{indexUid}/settings", + tag = "Settings", + security(("Bearer" = ["settings.update", "settings.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = 200, description = "Settings are returned", body = Settings, content_type = "application/json", example = json!( + Settings::::default() + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +/// All settings +/// +/// This route allows you to retrieve, configure, or reset all of an index's settings at once. pub async fn get_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -492,6 +656,35 @@ pub async fn get_all( Ok(HttpResponse::Ok().json(new_settings)) } +#[utoipa::path( + delete, + path = "{indexUid}/settings", + tag = "Settings", + security(("Bearer" = ["settings.update", "settings.*", "*"])), + params(("indexUid", example = "movies", description = "Index Unique Identifier", nullable = false)), + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": "movies", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +/// Reset settings +/// +/// Reset all the settings of an index to their default value. pub async fn delete_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -525,8 +718,28 @@ fn validate_settings( settings: Settings, index_scheduler: &IndexScheduler, ) -> Result, ResponseError> { - if matches!(settings.embedders, Setting::Set(_)) { - index_scheduler.features().check_vector("Passing `embedders` in settings")? 
+ use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbedderSource; + + let features = index_scheduler.features(); + if let Setting::Set(embedders) = &settings.embedders { + for SettingEmbeddingSettings { inner: embedder } in embedders.values() { + let Setting::Set(embedder) = embedder else { + continue; + }; + if matches!(embedder.source, Setting::Set(EmbedderSource::Composite)) { + features.check_composite_embedders("using `\"composite\"` as source")?; + } + + if matches!(embedder.search_embedder, Setting::Set(_)) { + features.check_composite_embedders("setting `searchEmbedder`")?; + } + + if matches!(embedder.indexing_embedder, Setting::Set(_)) { + features.check_composite_embedders("setting `indexingEmbedder`")?; + } + } } + Ok(settings.validate()?) } diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index 32bddcbdd..cb5983f02 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -8,9 +8,10 @@ use std::collections::{BTreeMap, BTreeSet, HashSet}; use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; -use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::milli::FilterableAttributesRule; use meilisearch_types::settings::{ - FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings, + FacetingSettings, PaginationSettings, PrefixSearchSettings, ProximityPrecisionView, + RankingRuleView, SettingEmbeddingSettings, TypoSettings, }; use serde::Serialize; @@ -36,6 +37,8 @@ pub struct SettingsAnalytics { pub dictionary: DictionaryAnalytics, pub separator_tokens: SeparatorTokensAnalytics, pub non_separator_tokens: NonSeparatorTokensAnalytics, + pub facet_search: FacetSearchAnalytics, + pub prefix_search: PrefixSearchAnalytics, } impl Aggregate for SettingsAnalytics { @@ -87,6 +90,10 @@ impl Aggregate for SettingsAnalytics { filterable_attributes: FilterableAttributesAnalytics { total: new.filterable_attributes.total.or(self.filterable_attributes.total), has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo), + has_patterns: new + .filterable_attributes + .has_patterns + .or(self.filterable_attributes.has_patterns), }, distinct_attribute: DistinctAttributeAnalytics { set: self.distinct_attribute.set | new.distinct_attribute.set, @@ -183,6 +190,14 @@ impl Aggregate for SettingsAnalytics { non_separator_tokens: NonSeparatorTokensAnalytics { total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), }, + facet_search: FacetSearchAnalytics { + set: new.facet_search.set | self.facet_search.set, + value: new.facet_search.value.or(self.facet_search.value), + }, + prefix_search: PrefixSearchAnalytics { + set: new.prefix_search.set | self.prefix_search.set, + value: new.prefix_search.value.or(self.prefix_search.value), + }, }) } @@ -318,13 +333,19 @@ impl SortableAttributesAnalytics { pub struct FilterableAttributesAnalytics { pub total: Option, pub has_geo: Option, + pub has_patterns: Option, } impl FilterableAttributesAnalytics { - pub fn new(setting: Option<&BTreeSet>) -> Self { + pub fn new(setting: Option<&Vec>) -> Self { Self { total: setting.as_ref().map(|filter| filter.len()), - has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), + 
has_geo: setting + .as_ref() + .map(|filter| filter.iter().any(FilterableAttributesRule::has_geo)), + has_patterns: setting.as_ref().map(|filter| { + filter.iter().any(|rule| matches!(rule, FilterableAttributesRule::Pattern(_))) + }), } } @@ -486,13 +507,13 @@ pub struct EmbeddersAnalytics { } impl EmbeddersAnalytics { - pub fn new(setting: Option<&BTreeMap>>) -> Self { + pub fn new(setting: Option<&BTreeMap>) -> Self { let mut sources = std::collections::HashSet::new(); if let Some(s) = &setting { for source in s .values() - .filter_map(|config| config.clone().set()) + .filter_map(|config| config.inner.clone().set()) .filter_map(|config| config.source.set()) { use meilisearch_types::milli::vector::settings::EmbedderSource; @@ -502,6 +523,7 @@ impl EmbeddersAnalytics { EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), EmbedderSource::Ollama => sources.insert("ollama".to_string()), EmbedderSource::Rest => sources.insert("rest".to_string()), + EmbedderSource::Composite => sources.insert("composite".to_string()), }; } }; @@ -511,18 +533,18 @@ impl EmbeddersAnalytics { sources: Some(sources), document_template_used: setting.as_ref().map(|map| { map.values() - .filter_map(|config| config.clone().set()) + .filter_map(|config| config.inner.clone().set()) .any(|config| config.document_template.set().is_some()) }), document_template_max_bytes: setting.as_ref().and_then(|map| { map.values() - .filter_map(|config| config.clone().set()) + .filter_map(|config| config.inner.clone().set()) .filter_map(|config| config.document_template_max_bytes.set()) .max() }), binary_quantization_used: setting.as_ref().map(|map| { map.values() - .filter_map(|config| config.clone().set()) + .filter_map(|config| config.inner.clone().set()) .any(|config| config.binary_quantized.set().is_some()) }), } @@ -620,3 +642,35 @@ impl NonSeparatorTokensAnalytics { SettingsAnalytics { non_separator_tokens: self, ..Default::default() } } } + +#[derive(Serialize, Default)] +pub struct FacetSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl FacetSearchAnalytics { + pub fn new(settings: Option<&bool>) -> Self { + Self { set: settings.is_some(), value: settings.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { facet_search: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PrefixSearchAnalytics { + pub set: bool, + pub value: Option, +} + +impl PrefixSearchAnalytics { + pub fn new(settings: Option<&PrefixSearchSettings>) -> Self { + Self { set: settings.is_some(), value: settings.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { prefix_search: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/src/routes/indexes/similar.rs b/crates/meilisearch/src/routes/indexes/similar.rs index 79f42f0aa..400be331b 100644 --- a/crates/meilisearch/src/routes/indexes/similar.rs +++ b/crates/meilisearch/src/routes/indexes/similar.rs @@ -5,12 +5,13 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; use meilisearch_types::error::deserr_codes::*; -use meilisearch_types::error::{ErrorCode as _, ResponseError}; +use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; +use utoipa::{IntoParams, OpenApi}; use super::ActionPolicy; 
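In `settings_analytics.rs`, filterable attributes are no longer a `BTreeSet<String>` but a list of `FilterableAttributesRule` values, and the analytics derive `has_geo` and `has_patterns` by scanning that list with `Iterator::any`. A small sketch of the same computation, with toy stand-ins for the milli types:

```rust
// Toy stand-ins for milli's FilterableAttributesRule; the real enum and its
// `has_geo` helper live in meilisearch_types::milli.
#[derive(Debug)]
enum FilterableAttributesRule {
    Field(String),
    Pattern(String),
}

impl FilterableAttributesRule {
    fn has_geo(&self) -> bool {
        matches!(self, FilterableAttributesRule::Field(field) if field == "_geo")
    }
}

// Mirrors the analytics above: total number of rules, whether any rule targets
// the reserved `_geo` field, and whether any rule is pattern-based.
fn filterable_attributes_analytics(
    setting: Option<&Vec<FilterableAttributesRule>>,
) -> (Option<usize>, Option<bool>, Option<bool>) {
    (
        setting.map(|rules| rules.len()),
        setting.map(|rules| rules.iter().any(FilterableAttributesRule::has_geo)),
        setting.map(|rules| {
            rules.iter().any(|rule| matches!(rule, FilterableAttributesRule::Pattern(_)))
        }),
    )
}

fn main() {
    let rules = vec![
        FilterableAttributesRule::Field("_geo".to_string()),
        FilterableAttributesRule::Pattern("attribute_*".to_string()),
    ];
    println!("{:?}", filterable_attributes_analytics(Some(&rules)));
}
```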
use crate::analytics::Analytics; @@ -18,10 +19,25 @@ use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST}; use crate::search::{ - add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, - SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, Route, + SearchKind, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; +#[derive(OpenApi)] +#[openapi( + paths(similar_get, similar_post), + tags( + ( + name = "Similar documents", + description = "The /similar route uses AI-powered search to return a number of documents similar to a target document. + +Meilisearch exposes two routes for retrieving similar documents: POST and GET. In the majority of cases, POST will offer better performance and ease of use.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/similar"), + ), + ), +)] +pub struct SimilarApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -30,6 +46,62 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } +/// Get similar documents with GET +/// +/// Retrieve documents similar to a specific search result. +#[utoipa::path( + get, + path = "{indexUid}/similar", + tag = "Similar documents", + security(("Bearer" = ["search", "*"])), + params( + ("indexUid" = String, Path, example = "movies", description = "Index Unique Identifier", nullable = false), + SimilarQueryGet + ), + responses( + (status = 200, description = "The documents are returned", body = SimilarResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 2770, + "title": "American Pie 2", + "poster": "https://image.tmdb.org/t/p/w1280/q4LNgUnRfltxzp3gf1MAGiK5LhV.jpg", + "overview": "The whole gang are back and as close as ever. They decide to get even closer by spending the summer together at a beach house. They decide to hold the biggest…", + "release_date": 997405200 + }, + { + "id": 190859, + "title": "American Sniper", + "poster": "https://image.tmdb.org/t/p/w1280/svPHnYE7N5NAGO49dBmRhq0vDQ3.jpg", + "overview": "U.S. Navy SEAL Chris Kyle takes his sole mission—protect his comrades—to heart and becomes one of the most lethal snipers in American history. His pinpoint accuracy not only saves countless lives but also makes him a prime…", + "release_date": 1418256000 + } + ], + "offset": 0, + "limit": 2, + "estimatedTotalHits": 976, + "processingTimeMs": 35, + "query": "american " + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn similar_get( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -39,7 +111,7 @@ pub async fn similar_get( ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let query = params.0.try_into()?; + let query = params.0.into(); let mut aggregate = SimilarAggregator::::from_query(&query); @@ -58,6 +130,60 @@ pub async fn similar_get( Ok(HttpResponse::Ok().json(similar)) } +/// Get similar documents with POST +/// +/// Retrieve documents similar to a specific search result. +#[utoipa::path( + post, + path = "{indexUid}/similar", + tag = "Similar documents", + security(("Bearer" = ["search", "*"])), + params(("indexUid" = String, Path, example = "movies", description = "Index Unique Identifier", nullable = false)), + request_body = SimilarQuery, + responses( + (status = 200, description = "The documents are returned", body = SimilarResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 2770, + "title": "American Pie 2", + "poster": "https://image.tmdb.org/t/p/w1280/q4LNgUnRfltxzp3gf1MAGiK5LhV.jpg", + "overview": "The whole gang are back and as close as ever. They decide to get even closer by spending the summer together at a beach house. They decide to hold the biggest…", + "release_date": 997405200 + }, + { + "id": 190859, + "title": "American Sniper", + "poster": "https://image.tmdb.org/t/p/w1280/svPHnYE7N5NAGO49dBmRhq0vDQ3.jpg", + "overview": "U.S. Navy SEAL Chris Kyle takes his sole mission—protect his comrades—to heart and becomes one of the most lethal snipers in American history. His pinpoint accuracy not only saves countless lives but also makes him a prime…", + "release_date": 1418256000 + } + ], + "offset": 0, + "limit": 2, + "estimatedTotalHits": 976, + "processingTimeMs": 35, + "query": "american " + } + )), + (status = 404, description = "Index not found", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Index `movies` not found.", + "code": "index_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_not_found" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn similar_post( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -90,11 +216,7 @@ async fn similar( index_uid: IndexUid, mut query: SimilarQuery, ) -> Result { - let features = index_scheduler.features(); - - features.check_vector("Using the similar API")?; - - let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors); // Tenant token search_rules. 
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { @@ -103,8 +225,14 @@ async fn similar( let index = index_scheduler.index(&index_uid)?; - let (embedder_name, embedder, quantized) = - SearchKind::embedder(&index_scheduler, &index, &query.embedder, None)?; + let (embedder_name, embedder, quantized) = SearchKind::embedder( + &index_scheduler, + index_uid.to_string(), + &index, + &query.embedder, + None, + Route::Similar, + )?; tokio::task::spawn_blocking(move || { perform_similar( @@ -120,28 +248,37 @@ async fn similar( .await? } -#[derive(Debug, deserr::Deserr)] +#[derive(Debug, deserr::Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(parameter_in = Query)] pub struct SimilarQueryGet { #[deserr(error = DeserrQueryParamError)] + #[param(value_type = String)] id: Param, #[deserr(default = Param(DEFAULT_SEARCH_OFFSET()), error = DeserrQueryParamError)] + #[param(value_type = usize, default = DEFAULT_SEARCH_OFFSET)] offset: Param, #[deserr(default = Param(DEFAULT_SEARCH_LIMIT()), error = DeserrQueryParamError)] + #[param(value_type = usize, default = DEFAULT_SEARCH_LIMIT)] limit: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = Vec)] attributes_to_retrieve: Option>, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool, default)] retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool, default)] show_ranking_score: Param, #[deserr(default, error = DeserrQueryParamError)] + #[param(value_type = bool, default)] show_ranking_score_details: Param, #[deserr(default, error = DeserrQueryParamError, default)] + #[param(value_type = Option)] pub ranking_score_threshold: Option, - #[deserr(error = DeserrQueryParamError)] + #[deserr(error = DeserrQueryParamError)] pub embedder: String, } @@ -158,10 +295,8 @@ impl std::convert::TryFrom for RankingScoreThresholdGet { } } -impl TryFrom for SimilarQuery { - type Error = ResponseError; - - fn try_from( +impl From for SimilarQuery { + fn from( SimilarQueryGet { id, offset, @@ -174,7 +309,7 @@ impl TryFrom for SimilarQuery { embedder, ranking_score_threshold, }: SimilarQueryGet, - ) -> Result { + ) -> Self { let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), @@ -183,10 +318,8 @@ impl TryFrom for SimilarQuery { None => None, }; - Ok(SimilarQuery { - id: id.0.try_into().map_err(|code: InvalidSimilarId| { - ResponseError::from_msg(code.to_string(), code.error_code()) - })?, + SimilarQuery { + id: serde_json::Value::String(id.0), offset: offset.0, limit: limit.0, filter, @@ -196,6 +329,6 @@ impl TryFrom for SimilarQuery { show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, ranking_score_threshold: ranking_score_threshold.map(|x| x.0), - }) + } } } diff --git a/crates/meilisearch/src/routes/logs.rs b/crates/meilisearch/src/routes/logs.rs index 57e2cbd22..889ce824e 100644 --- a/crates/meilisearch/src/routes/logs.rs +++ b/crates/meilisearch/src/routes/logs.rs @@ -14,9 +14,11 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; +use serde::Serialize; use tokio::sync::mpsc; use tracing_subscriber::filter::Targets; use tracing_subscriber::Layer; +use utoipa::{OpenApi, 
ToSchema}; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -24,6 +26,18 @@ use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::{LogRouteHandle, LogStderrHandle}; +#[derive(OpenApi)] +#[openapi( + paths(get_logs, cancel_logs, update_stderr_target), + tags(( + name = "Logs", + description = "Everything about retrieving or customizing logs. +Currently [experimental](https://www.meilisearch.com/docs/learn/experimental/overview).", + external_docs(url = "https://www.meilisearch.com/docs/learn/experimental/log_customization"), + )), +)] +pub struct LogsApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("stream") @@ -33,12 +47,16 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::resource("stderr").route(web::post().to(SeqHandler(update_stderr_target)))); } -#[derive(Debug, Default, Clone, Copy, Deserr, PartialEq, Eq)] +#[derive(Debug, Default, Clone, Copy, Deserr, Serialize, PartialEq, Eq, ToSchema)] #[deserr(rename_all = camelCase)] +#[schema(rename_all = "camelCase")] pub enum LogMode { + /// Output the logs in a human-readable form. #[default] Human, + /// Output the logs in JSON. Json, + /// Output the logs in the Firefox Profiler format. They can then be loaded and visualized at https://profiler.firefox.com/ Profile, } @@ -83,16 +101,26 @@ impl MergeWithError for DeserrJsonError { } } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields, validate = validate_get_logs -> DeserrJsonError)] +#[schema(rename_all = "camelCase")] pub struct GetLogs { + /// Lets you specify which parts of the code you want to inspect and is formatted like this: code_part=log_level,code_part=log_level + /// - If the `code_part` is missing, then the `log_level` will be applied to everything. + /// - If the `log_level` is missing, then the `code_part` will be selected at the `info` log level. #[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError)] + #[schema(value_type = String, default = "info", example = json!("milli=trace,index_scheduler,actix_web=off"))] target: MyTargets, + /// Lets you customize the format of the logs. #[deserr(default, error = DeserrJsonError)] + #[schema(default = LogMode::default)] mode: LogMode, + /// A boolean to indicate if you want to profile the memory as well. This is only useful while using the `profile` mode. + /// Be cautious, though; it slows down the engine a lot. #[deserr(default = false, error = DeserrJsonError)] + #[schema(default = false)] profile_memory: bool, } @@ -248,6 +276,46 @@ fn entry_stream( ) } +/// Retrieve logs +/// +/// Stream logs over HTTP. The format of the logs depends on the configuration specified in the payload. +/// The logs are sent as multi-part, and the stream never stops, so make sure your clients correctly handle that. +/// To make the server stop sending you logs, you can call the `DELETE /logs/stream` route. +/// +/// There can only be one listener at a time, and an error will be returned if you call this route while it's being used by another client. 
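The `target` field above takes directives of the form `code_part=log_level`, separated by commas, and `MyTargets::from_str` builds a `tracing_subscriber::filter::Targets` filter from them. A minimal sketch of parsing such a string with the underlying `Targets` type (an assumption about typical usage, not the route handler itself):

```rust
// Minimal sketch: parse a target string like the `example` above into a
// tracing-subscriber `Targets` filter. Error handling is simplified.
use tracing_subscriber::filter::Targets;

fn main() {
    // One directive per code part: `milli` at trace level, `actix_web` silenced.
    match "milli=trace,actix_web=off".parse::<Targets>() {
        // In the real route, the validated filter is applied to the log layer handle.
        Ok(_filter) => println!("targets parsed"),
        Err(err) => eprintln!("invalid target string: {err}"),
    }
}
```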
+#[utoipa::path( + post, + path = "/stream", + tag = "Logs", + security(("Bearer" = ["metrics.get", "metrics.*", "*"])), + request_body = GetLogs, + responses( + (status = OK, description = "Logs are being returned", body = String, content_type = "application/json", example = json!( + r#" +2024-10-08T13:35:02.643750Z WARN HTTP request{method=GET host="localhost:7700" route=/metrics query_parameters= user_agent=HTTPie/3.2.3 status_code=400 error=Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625}: tracing_actix_web::middleware: Error encountered while processing the incoming HTTP request: ResponseError { code: 400, message: "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625", error_code: "feature_not_enabled", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#feature_not_enabled" } +2024-10-08T13:35:02.644191Z INFO HTTP request{method=GET host="localhost:7700" route=/metrics query_parameters= user_agent=HTTPie/3.2.3 status_code=400 error=Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625}: meilisearch: close time.busy=1.66ms time.idle=658µs +2024-10-08T13:35:18.564152Z INFO HTTP request{method=PATCH host="localhost:7700" route=/experimental-features query_parameters= user_agent=curl/8.6.0 status_code=200}: meilisearch: close time.busy=1.17ms time.idle=127µs +2024-10-08T13:35:23.094987Z INFO HTTP request{method=GET host="localhost:7700" route=/metrics query_parameters= user_agent=HTTPie/3.2.3 status_code=200}: meilisearch: close time.busy=2.12ms time.idle=595µs +"# + )), + (status = 400, description = "The route is already being used", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The `/logs/stream` route is currently in use by someone else.", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_logs( index_scheduler: GuardedData, Data>, logs: Data, @@ -280,6 +348,26 @@ pub async fn get_logs( } } +/// Stop retrieving logs +/// +/// Call this route to make the engine stops sending logs through the `POST /logs/stream` route. +#[utoipa::path( + delete, + path = "/stream", + tag = "Logs", + security(("Bearer" = ["metrics.get", "metrics.*", "*"])), + responses( + (status = NO_CONTENT, description = "Logs are being returned"), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn cancel_logs( index_scheduler: GuardedData, Data>, logs: Data, @@ -293,13 +381,38 @@ pub async fn cancel_logs( Ok(HttpResponse::NoContent().finish()) } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct UpdateStderrLogs { + /// Lets you specify which parts of the code you want to inspect and is formatted like that: code_part=log_level,code_part=log_level + /// - If the `code_part` is missing, then the `log_level` will be applied to everything. + /// - If the `log_level` is missing, then the `code_part` will be selected in `info` log level. #[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError)] + #[schema(value_type = String, default = "info", example = json!("milli=trace,index_scheduler,actix_web=off"))] target: MyTargets, } +/// Update target of the console logs +/// +/// This route lets you specify at runtime the level of the console logs outputted on stderr. +#[utoipa::path( + post, + path = "/stderr", + tag = "Logs", + request_body = UpdateStderrLogs, + security(("Bearer" = ["metrics.get", "metrics.*", "*"])), + responses( + (status = NO_CONTENT, description = "The console logs have been updated"), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn update_stderr_target( index_scheduler: GuardedData, Data>, logs: Data, diff --git a/crates/meilisearch/src/routes/metrics.rs b/crates/meilisearch/src/routes/metrics.rs index 48b5d09f5..6e93284c2 100644 --- a/crates/meilisearch/src/routes/metrics.rs +++ b/crates/meilisearch/src/routes/metrics.rs @@ -1,21 +1,114 @@ use actix_web::http::header; use actix_web::web::{self, Data}; use actix_web::HttpResponse; -use index_scheduler::IndexScheduler; +use index_scheduler::{IndexScheduler, Query}; use meilisearch_auth::AuthController; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; +use meilisearch_types::tasks::Status; use prometheus::{Encoder, TextEncoder}; +use time::OffsetDateTime; +use utoipa::OpenApi; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::routes::create_all_stats; use crate::search_queue::SearchQueue; +#[derive(OpenApi)] +#[openapi(paths(get_metrics))] +pub struct MetricApi; + pub fn configure(config: &mut web::ServiceConfig) { config.service(web::resource("").route(web::get().to(get_metrics))); } +/// Get prometheus metrics +/// +/// Retrieve metrics on the engine. See https://www.meilisearch.com/docs/learn/experimental/metrics +/// Currently, [the feature is experimental](https://www.meilisearch.com/docs/learn/experimental/overview) +/// which means it must be enabled. 
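`get_metrics` below serves the text exposition format produced by the `prometheus` crate: gauges such as the new task queue latency metric are set from the queue state and then encoded with `TextEncoder`. A hedged, self-contained sketch of that gauge-and-encode pattern (the local registry, metric name, and value are illustrative; the real handler uses the static metrics in `crate::metrics` and `prometheus::gather()`):

```rust
// Sketch of the prometheus gauge + text-encoder pattern used by the handler.
use prometheus::{Encoder, Gauge, Registry, TextEncoder};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let registry = Registry::new();
    let latency = Gauge::new(
        "task_queue_latency_seconds",
        "Time in seconds that tasks have been waiting in the queue",
    )?;
    registry.register(Box::new(latency.clone()))?;

    // In the handler, this value comes from `now_utc() - task.enqueued_at` for a
    // task still enqueued or processing, or 0.0 when none is waiting.
    latency.set(12.5);

    // Encode every registered metric family into the Prometheus text format.
    let mut buffer = Vec::new();
    TextEncoder::new().encode(&registry.gather(), &mut buffer)?;
    println!("{}", String::from_utf8(buffer)?);
    Ok(())
}
```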
+#[utoipa::path( + get, + path = "", + tag = "Stats", + security(("Bearer" = ["metrics.get", "metrics.*", "*"])), + responses( + (status = 200, description = "The metrics of the instance", body = String, content_type = "text/plain", example = json!( + r#" +# HELP meilisearch_db_size_bytes Meilisearch DB Size In Bytes +# TYPE meilisearch_db_size_bytes gauge +meilisearch_db_size_bytes 1130496 +# HELP meilisearch_http_requests_total Meilisearch HTTP requests total +# TYPE meilisearch_http_requests_total counter +meilisearch_http_requests_total{method="GET",path="/metrics",status="400"} 1 +meilisearch_http_requests_total{method="PATCH",path="/experimental-features",status="200"} 1 +# HELP meilisearch_http_response_time_seconds Meilisearch HTTP response times +# TYPE meilisearch_http_response_time_seconds histogram +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.005"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.01"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.025"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.05"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.075"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.1"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.25"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.5"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="0.75"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="1"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="2.5"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="5"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="7.5"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="10"} 0 +meilisearch_http_response_time_seconds_bucket{method="GET",path="/metrics",le="+Inf"} 0 +meilisearch_http_response_time_seconds_sum{method="GET",path="/metrics"} 0 +meilisearch_http_response_time_seconds_count{method="GET",path="/metrics"} 0 +# HELP meilisearch_index_count Meilisearch Index Count +# TYPE meilisearch_index_count gauge +meilisearch_index_count 1 +# HELP meilisearch_index_docs_count Meilisearch Index Docs Count +# TYPE meilisearch_index_docs_count gauge +meilisearch_index_docs_count{index="mieli"} 2 +# HELP meilisearch_is_indexing Meilisearch Is Indexing +# TYPE meilisearch_is_indexing gauge +meilisearch_is_indexing 0 +# HELP meilisearch_last_update Meilisearch Last Update +# TYPE meilisearch_last_update gauge +meilisearch_last_update 1726675964 +# HELP meilisearch_nb_tasks Meilisearch Number of tasks +# TYPE meilisearch_nb_tasks gauge +meilisearch_nb_tasks{kind="indexes",value="mieli"} 39 +meilisearch_nb_tasks{kind="statuses",value="canceled"} 0 +meilisearch_nb_tasks{kind="statuses",value="enqueued"} 0 +meilisearch_nb_tasks{kind="statuses",value="failed"} 4 +meilisearch_nb_tasks{kind="statuses",value="processing"} 0 +meilisearch_nb_tasks{kind="statuses",value="succeeded"} 35 +meilisearch_nb_tasks{kind="types",value="documentAdditionOrUpdate"} 9 +meilisearch_nb_tasks{kind="types",value="documentDeletion"} 0 +meilisearch_nb_tasks{kind="types",value="documentEdition"} 0 +meilisearch_nb_tasks{kind="types",value="dumpCreation"} 0 
+meilisearch_nb_tasks{kind="types",value="indexCreation"} 0 +meilisearch_nb_tasks{kind="types",value="indexDeletion"} 8 +meilisearch_nb_tasks{kind="types",value="indexSwap"} 0 +meilisearch_nb_tasks{kind="types",value="indexUpdate"} 0 +meilisearch_nb_tasks{kind="types",value="settingsUpdate"} 22 +meilisearch_nb_tasks{kind="types",value="snapshotCreation"} 0 +meilisearch_nb_tasks{kind="types",value="taskCancelation"} 0 +meilisearch_nb_tasks{kind="types",value="taskDeletion"} 0 +# HELP meilisearch_used_db_size_bytes Meilisearch Used DB Size In Bytes +# TYPE meilisearch_used_db_size_bytes gauge +meilisearch_used_db_size_bytes 409600 +"# + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn get_metrics( index_scheduler: GuardedData, Data>, auth_controller: Data, @@ -61,6 +154,22 @@ pub async fn get_metrics( } crate::metrics::MEILISEARCH_IS_INDEXING.set(index_scheduler.is_task_processing()? as i64); + let task_queue_latency_seconds = index_scheduler + .get_tasks_from_authorized_indexes( + &Query { + limit: Some(1), + reverse: Some(true), + statuses: Some(vec![Status::Enqueued, Status::Processing]), + ..Query::default() + }, + auth_filters, + )? + .0 + .first() + .map(|task| (OffsetDateTime::now_utc() - task.enqueued_at).as_seconds_f64()) + .unwrap_or(0.0); + crate::metrics::MEILISEARCH_TASK_QUEUE_LATENCY_SECONDS.set(task_queue_latency_seconds); + let encoder = TextEncoder::new(); let mut buffer = vec![]; encoder.encode(&prometheus::gather(), &mut buffer).expect("Failed to encode metrics"); diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 91237b707..2c71fa68b 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -4,19 +4,51 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; -use meilisearch_types::error::{Code, ResponseError}; -use meilisearch_types::settings::{Settings, Unchecked}; +use meilisearch_types::batch_view::BatchView; +use meilisearch_types::batches::BatchStats; +use meilisearch_types::error::{Code, ErrorType, ResponseError}; +use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::keys::CreateApiKey; +use meilisearch_types::milli::{ + AttributePatterns, FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns, + FilterableAttributesRule, +}; +use meilisearch_types::settings::{ + Checked, FacetingSettings, MinWordSizeTyposSetting, PaginationSettings, Settings, TypoSettings, + Unchecked, +}; +use meilisearch_types::task_view::{DetailsView, TaskView}; use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use tracing::debug; +use utoipa::{OpenApi, ToSchema}; +use self::api_key::KeyView; +use self::indexes::documents::BrowseQuery; +use self::indexes::{IndexCreateRequest, IndexStats, UpdateIndexRequest}; +use self::logs::{GetLogs, LogMode, UpdateStderrLogs}; +use self::open_api_utils::OpenApiAuth; +use self::tasks::AllTasks; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; +use 
crate::milli::progress::{ProgressStepView, ProgressView}; +use crate::routes::batches::AllBatches; +use crate::routes::features::RuntimeTogglableFeatures; +use crate::routes::indexes::documents::{DocumentDeletionByFilter, DocumentEditionByFunction}; +use crate::routes::indexes::IndexView; +use crate::routes::multi_search::SearchResults; +use crate::routes::network::{Network, Remote}; +use crate::routes::swap_indexes::SwapIndexesPayload; +use crate::search::{ + FederatedSearch, FederatedSearchResult, Federation, FederationOptions, MergeFacets, + SearchQueryWithIndex, SearchResultWithIndex, SimilarQuery, SimilarResult, +}; use crate::search_queue::SearchQueue; use crate::Opt; const PAGINATION_DEFAULT_LIMIT: usize = 20; +const PAGINATION_DEFAULT_LIMIT_FN: fn() -> usize = || 20; mod api_key; pub mod batches; @@ -27,9 +59,44 @@ mod logs; mod metrics; mod multi_search; mod multi_search_analytics; +pub mod network; +mod open_api_utils; mod snapshot; mod swap_indexes; pub mod tasks; +#[cfg(test)] +mod tasks_test; + +#[derive(OpenApi)] +#[openapi( + nest( + (path = "/tasks", api = tasks::TaskApi), + (path = "/batches", api = batches::BatchesApi), + (path = "/indexes", api = indexes::IndexesApi), + // We must stop the search path here because the rest must be configured by each route individually + (path = "/indexes", api = indexes::search::SearchApi), + (path = "/snapshots", api = snapshot::SnapshotApi), + (path = "/dumps", api = dump::DumpApi), + (path = "/keys", api = api_key::ApiKeyApi), + (path = "/metrics", api = metrics::MetricApi), + (path = "/logs", api = logs::LogsApi), + (path = "/multi-search", api = multi_search::MultiSearchApi), + (path = "/swap-indexes", api = swap_indexes::SwapIndexesApi), + (path = "/experimental-features", api = features::ExperimentalFeaturesApi), + (path = "/network", api = network::NetworkApi), + ), + paths(get_health, get_version, get_stats), + tags( + (name = "Stats", description = "Stats gives extended information and metrics about indexes and the Meilisearch database."), + ), + modifiers(&OpenApiAuth), + servers(( + url = "/", + description = "Local server", + )), + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures)) +)] +pub struct MeilisearchApi; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::scope("/tasks").configure(tasks::configure)) @@ -45,7 +112,15 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/multi-search").configure(multi_search::configure)) .service(web::scope("/swap-indexes").configure(swap_indexes::configure)) .service(web::scope("/metrics").configure(metrics::configure)) - 
.service(web::scope("/experimental-features").configure(features::configure)); + .service(web::scope("/experimental-features").configure(features::configure)) + .service(web::scope("/network").configure(network::configure)); + + #[cfg(feature = "swagger")] + { + use utoipa_scalar::{Scalar, Servable as ScalarServable}; + let openapi = MeilisearchApi::openapi(); + cfg.service(Scalar::with_url("/scalar", openapi.clone())); + } } pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result, ResponseError> { @@ -95,17 +170,23 @@ pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result { }) }) .transpose()? - .map_or(false, |s| s.to_lowercase() == "true")) + .is_some_and(|s| s.to_lowercase() == "true")) } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct SummarizedTaskView { + /// The task unique identifier. + #[schema(value_type = u32)] task_uid: TaskId, + /// The index affected by this task. May be `null` if the task is not linked to any index. index_uid: Option, + /// The status of the task. status: Status, + /// The type of the task. #[serde(rename = "type")] kind: Kind, + /// The date on which the task was enqueued. #[serde(serialize_with = "time::serde::rfc3339::serialize")] enqueued_at: OffsetDateTime, } @@ -127,7 +208,9 @@ pub struct Pagination { pub limit: usize, } -#[derive(Debug, Clone, Serialize)] +#[derive(Debug, Clone, Serialize, ToSchema)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct PaginationView { pub results: Vec, pub offset: usize, @@ -283,17 +366,60 @@ pub async fn running() -> HttpResponse { HttpResponse::Ok().json(serde_json::json!({ "status": "Meilisearch is running" })) } -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, ToSchema)] #[serde(rename_all = "camelCase")] pub struct Stats { + /// The disk space used by the database, in bytes. pub database_size: u64, - #[serde(skip)] + /// The size of the database, in bytes. pub used_database_size: u64, + /// The date of the last update in the RFC 3339 formats. Can be `null` if no update has ever been processed. #[serde(serialize_with = "time::serde::rfc3339::option::serialize")] pub last_update: Option, + /// The stats of every individual index your API key lets you access. + #[schema(value_type = HashMap)] pub indexes: BTreeMap, } +/// Get stats of all indexes. +/// +/// Get stats of all indexes. +#[utoipa::path( + get, + path = "/stats", + tag = "Stats", + security(("Bearer" = ["stats.get", "stats.*", "*"])), + responses( + (status = 200, description = "The stats of the instance", body = Stats, content_type = "application/json", example = json!( + { + "databaseSize": 567, + "usedDatabaseSize": 456, + "lastUpdate": "2019-11-20T09:40:33.711324Z", + "indexes": { + "movies": { + "numberOfDocuments": 10, + "rawDocumentDbSize": 100, + "maxDocumentSize": 16, + "avgDocumentSize": 10, + "isIndexing": true, + "fieldDistribution": { + "genre": 10, + "author": 9 + } + } + } + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.",
+                "code": "missing_authorization_header",
+                "type": "auth",
+                "link": "https://docs.meilisearch.com/errors#missing_authorization_header"
+            }
+        )),
+    )
+)]
 async fn get_stats(
     index_scheduler: GuardedData, Data>,
     auth_controller: GuardedData, Data>,
@@ -343,14 +469,43 @@ pub fn create_all_stats(
     Ok(stats)
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 #[serde(rename_all = "camelCase")]
 struct VersionResponse {
+    /// The commit used to compile this build of Meilisearch.
     commit_sha: String,
+    /// The date of this build.
     commit_date: String,
+    /// The version of Meilisearch.
     pkg_version: String,
 }
 
+/// Get version
+///
+/// Current version of Meilisearch.
+#[utoipa::path(
+    get,
+    path = "/version",
+    tag = "Version",
+    security(("Bearer" = ["version", "*"])),
+    responses(
+        (status = 200, description = "The version of the instance", body = VersionResponse, content_type = "application/json", example = json!(
+            {
+                "commitSha": "b46889b5f0f2f8b91438a08a358ba8f05fc09fc1",
+                "commitDate": "2021-07-08",
+                "pkgVersion": "0.23.0"
+            }
+        )),
+        (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
+            {
+                "message": "The Authorization header is missing. It must use the bearer authorization method.",
+                "code": "missing_authorization_header",
+                "type": "auth",
+                "link": "https://docs.meilisearch.com/errors#missing_authorization_header"
+            }
+        )),
+    )
+)]
 async fn get_version(
     _index_scheduler: GuardedData, Data>,
 ) -> HttpResponse {
@@ -370,6 +525,35 @@ async fn get_version(
     })
 }
 
+#[derive(Default, Serialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+struct HealthResponse {
+    /// The status of the instance.
+    status: HealthStatus,
+}
+
+#[derive(Default, Serialize, ToSchema)]
+#[serde(rename_all = "camelCase")]
+enum HealthStatus {
+    #[default]
+    Available,
+}
+
+/// Get Health
+///
+/// The health check endpoint enables you to periodically test the health of your Meilisearch instance.
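+// Illustrative usage (an assumption, not part of the upstream change): this path declares
+// no `security` requirement below, so something like `curl -s http://localhost:7700/health`
+// is expected to return `{"status":"available"}`, i.e. the serialized `HealthResponse` default.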
+#[utoipa::path( + get, + path = "/health", + tag = "Health", + responses( + (status = 200, description = "Instance is healthy", body = HealthResponse, content_type = "application/json", example = json!( + { + "status": "available" + } + )), + ) +)] pub async fn get_health( index_scheduler: Data, auth_controller: Data, @@ -379,5 +563,5 @@ pub async fn get_health( index_scheduler.health().unwrap(); auth_controller.health().unwrap(); - Ok(HttpResponse::Ok().json(serde_json::json!({ "status": "available" }))) + Ok(HttpResponse::Ok().json(HealthResponse::default())) } diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index f8b1bc6ee..b3af98fd5 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -8,6 +8,7 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; +use utoipa::{OpenApi, ToSchema}; use super::multi_search_analytics::MultiSearchAggregator; use crate::analytics::Analytics; @@ -17,20 +18,129 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_federated_search, perform_search, FederatedSearch, RetrieveVectors, - SearchQueryWithIndex, SearchResultWithIndex, + add_search_rules, perform_federated_search, perform_search, FederatedSearch, + FederatedSearchResult, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, + PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE, }; use crate::search_queue::SearchQueue; +#[derive(OpenApi)] +#[openapi( + paths(multi_search_with_post), + tags(( + name = "Multi-search", + description = "The `/multi-search` route allows you to perform multiple search queries on one or more indexes by bundling them into a single HTTP request. Multi-search is also known as federated search.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/multi_search"), + )), +)] +pub struct MultiSearchApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post)))); } -#[derive(Serialize)] -struct SearchResults { +#[derive(Serialize, ToSchema)] +pub struct SearchResults { results: Vec, } +/// Perform a multi-search +/// +/// Bundle multiple search queries in a single API request. Use this endpoint to search through multiple indexes at once. 
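+// Illustrative request body (assumed shape, consistent with the response examples below):
+//   {"queries": [{"indexUid": "movies", "q": "pooh", "limit": 1},
+//                {"indexUid": "movies", "q": "nemo", "limit": 1}]}
+// Adding a top-level `"federation": {}` object switches the handler into the federated
+// branch and yields the `FederatedSearchResult` format instead.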
+#[utoipa::path( + post, + request_body = FederatedSearch, + path = "", + tag = "Multi-search", + security(("Bearer" = ["search", "*"])), + responses( + (status = OK, description = "Non federated multi-search", body = SearchResults, content_type = "application/json", example = json!( + { + "results":[ + { + "indexUid":"movies", + "hits":[ + { + "id":13682, + "title":"Pooh's Heffalump Movie", + }, + ], + "query":"pooh", + "processingTimeMs":26, + "limit":1, + "offset":0, + "estimatedTotalHits":22 + }, + { + "indexUid":"movies", + "hits":[ + { + "id":12, + "title":"Finding Nemo", + }, + ], + "query":"nemo", + "processingTimeMs":5, + "limit":1, + "offset":0, + "estimatedTotalHits":11 + }, + { + "indexUid":"movie_ratings", + "hits":[ + { + "id":"Us", + "director": "Jordan Peele", + } + ], + "query":"Us", + "processingTimeMs":0, + "limit":1, + "offset":0, + "estimatedTotalHits":1 + } + ] + } + )), + (status = OK, description = "Federated multi-search", body = FederatedSearchResult, content_type = "application/json", example = json!( + { + "hits": [ + { + "id": 42, + "title": "Batman returns", + "overview": "The overview of batman returns", + "_federation": { + "indexUid": "movies", + "queriesPosition": 0 + } + }, + { + "comicsId": "batman-killing-joke", + "description": "This comic is really awesome", + "title": "Batman: the killing joke", + "_federation": { + "indexUid": "comics", + "queriesPosition": 1 + } + }, + ], + "processingTimeMs": 0, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "semanticHitCount": 0 + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn multi_search_with_post( index_scheduler: GuardedData, Data>, search_queue: Data, @@ -78,18 +188,22 @@ pub async fn multi_search_with_post( let response = match federation { Some(federation) => { - let search_result = tokio::task::spawn_blocking(move || { - perform_federated_search(&index_scheduler, queries, federation, features) - }) - .await; + // check remote header + let is_proxy = req + .headers() + .get(PROXY_SEARCH_HEADER) + .is_some_and(|value| value.as_bytes() == PROXY_SEARCH_HEADER_VALUE.as_bytes()); + let search_result = + perform_federated_search(&index_scheduler, queries, federation, features, is_proxy) + .await; permit.drop().await; - if let Ok(Ok(_)) = search_result { + if search_result.is_ok() { multi_aggregate.succeed(); } analytics.publish(multi_aggregate, &req); - HttpResponse::Ok().json(search_result??) + HttpResponse::Ok().json(search_result?) 
} None => { // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only, @@ -125,14 +239,26 @@ pub async fn multi_search_with_post( }) .with_index(query_index)?; - let search_kind = - search_kind(&query, index_scheduler.get_ref(), &index, features) - .with_index(query_index)?; - let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features) - .with_index(query_index)?; + let index_uid_str = index_uid.to_string(); + + let search_kind = search_kind( + &query, + index_scheduler.get_ref(), + index_uid_str.clone(), + &index, + ) + .with_index(query_index)?; + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors); let search_result = tokio::task::spawn_blocking(move || { - perform_search(&index, query, search_kind, retrieve_vector, features) + perform_search( + index_uid_str.clone(), + &index, + query, + search_kind, + retrieve_vector, + features, + ) }) .await .with_index(query_index)?; diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index 3d07f471c..3fa23f630 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -13,6 +13,8 @@ pub struct MultiSearchAggregator { // sum of the number of distinct indexes in each single request, use with total_received to compute an avg total_distinct_index_count: usize, + // sum of the number of distinct remotes in each single request, use with total_received to compute an avg + total_distinct_remote_count: usize, // number of queries with a single index, use with total_received to compute a proportion total_single_index: usize, @@ -31,46 +33,49 @@ impl MultiSearchAggregator { pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { let use_federation = federated_search.federation.is_some(); - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; + let mut distinct_indexes = HashSet::with_capacity(federated_search.queries.len()); + let mut distinct_remotes = HashSet::with_capacity(federated_search.queries.len()); - index_uid.as_str() - }) - .collect(); + // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex + for SearchQueryWithIndex { + index_uid, + federation_options, + q: _, + vector: _, + offset: _, + limit: _, + page: _, + hits_per_page: _, + attributes_to_retrieve: _, + retrieve_vectors: _, + attributes_to_crop: _, + crop_length: _, + attributes_to_highlight: _, + show_ranking_score: _, + show_ranking_score_details: _, + show_matches_position: _, + filter: _, + sort: _, + distinct: _, + facets: _, + highlight_pre_tag: _, + highlight_post_tag: _, + crop_marker: _, + matching_strategy: _, + attributes_to_search_on: _, + hybrid: 
_, + ranking_score_threshold: _, + locales: _, + } in &federated_search.queries + { + if let Some(federation_options) = federation_options { + if let Some(remote) = &federation_options.remote { + distinct_remotes.insert(remote.as_str()); + } + } + + distinct_indexes.insert(index_uid.as_str()); + } let show_ranking_score = federated_search.queries.iter().any(|query| query.show_ranking_score); @@ -81,6 +86,7 @@ impl MultiSearchAggregator { total_received: 1, total_succeeded: 0, total_distinct_index_count: distinct_indexes.len(), + total_distinct_remote_count: distinct_remotes.len(), total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, total_search_count: federated_search.queries.len(), show_ranking_score, @@ -110,6 +116,8 @@ impl Aggregate for MultiSearchAggregator { let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); let total_distinct_index_count = this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_distinct_remote_count = + this.total_distinct_remote_count.saturating_add(new.total_distinct_remote_count); let total_single_index = this.total_single_index.saturating_add(new.total_single_index); let total_search_count = this.total_search_count.saturating_add(new.total_search_count); let show_ranking_score = this.show_ranking_score || new.show_ranking_score; @@ -121,6 +129,7 @@ impl Aggregate for MultiSearchAggregator { total_received, total_succeeded, total_distinct_index_count, + total_distinct_remote_count, total_single_index, total_search_count, show_ranking_score, @@ -134,6 +143,7 @@ impl Aggregate for MultiSearchAggregator { total_received, total_succeeded, total_distinct_index_count, + total_distinct_remote_count, total_single_index, total_search_count, show_ranking_score, @@ -152,6 +162,10 @@ impl Aggregate for MultiSearchAggregator { "total_distinct_index_count": total_distinct_index_count, "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early }, + "remotes": { + "total_distinct_remote_count": total_distinct_remote_count, + "avg_distinct_remote_count": (total_distinct_remote_count as f64) / (total_received as f64), // not 0 else returned early + }, "searches": { "total_search_count": total_search_count, "avg_search_count": (total_search_count as f64) / (total_received as f64), diff --git a/crates/meilisearch/src/routes/network.rs b/crates/meilisearch/src/routes/network.rs new file mode 100644 index 000000000..458ae8cbf --- /dev/null +++ b/crates/meilisearch/src/routes/network.rs @@ -0,0 +1,261 @@ +use std::collections::BTreeMap; + +use actix_web::web::{self, Data}; +use actix_web::{HttpRequest, HttpResponse}; +use deserr::actix_web::AwebJson; +use deserr::Deserr; +use index_scheduler::IndexScheduler; +use itertools::{EitherOrBoth, Itertools}; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::{ + InvalidNetworkRemotes, InvalidNetworkSearchApiKey, InvalidNetworkSelf, InvalidNetworkUrl, +}; +use meilisearch_types::error::ResponseError; +use meilisearch_types::features::{Network as DbNetwork, Remote as DbRemote}; +use meilisearch_types::keys::actions; +use meilisearch_types::milli::update::Setting; +use serde::Serialize; +use tracing::debug; +use utoipa::{OpenApi, ToSchema}; + +use crate::analytics::{Aggregate, Analytics}; +use crate::extractors::authentication::policies::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::extractors::sequential_extractor::SeqHandler; + 
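+// Illustrative `PATCH /network` payload (assumed values, matching the examples used in the
+// annotations below):
+//   { "self": "ms-0", "remotes": { "ms-1": { "url": "http://localhost:7701", "searchApiKey": "foo" } } }
+// Omitted fields keep their previous value; setting a remote to `null` removes it.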
+#[derive(OpenApi)] +#[openapi( + paths(get_network, patch_network), + tags(( + name = "Network", + description = "The `/network` route allows you to describe the topology of a network of Meilisearch instances. + +This route is **synchronous**. This means that no task object will be returned, and any change to the network will be made available immediately.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/network"), + )), +)] +pub struct NetworkApi; + +pub fn configure(cfg: &mut web::ServiceConfig) { + cfg.service( + web::resource("") + .route(web::get().to(get_network)) + .route(web::patch().to(SeqHandler(patch_network))), + ); +} + +/// Get network topology +/// +/// Get a list of all Meilisearch instances currently known to this instance. +#[utoipa::path( + get, + path = "", + tag = "Network", + security(("Bearer" = ["network.get", "network.*", "*"])), + responses( + (status = OK, description = "Known nodes are returned", body = Network, content_type = "application/json", example = json!( + { + "self": "ms-0", + "remotes": { + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + } + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn get_network( + index_scheduler: GuardedData, Data>, +) -> Result { + index_scheduler.features().check_network("Using the /network route")?; + + let network = index_scheduler.network(); + debug!(returns = ?network, "Get network"); + Ok(HttpResponse::Ok().json(network)) +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Remote { + #[schema(value_type = Option, example = json!({ + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + }))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub url: Setting, + #[schema(value_type = Option, example = json!("XWnBI8QHUc-4IlqbKPLUDuhftNq19mQtjc6JvmivzJU"))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub search_api_key: Setting, +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Network { + #[schema(value_type = Option>, example = json!("http://localhost:7700"))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub remotes: Setting>>, + #[schema(value_type = Option, example = json!("ms-00"), rename = "self")] + #[serde(default, rename = "self")] + #[deserr(default, rename = "self", error = 
DeserrJsonError)] + pub local: Setting, +} + +impl Remote { + pub fn try_into_db_node(self, name: &str) -> Result { + Ok(DbRemote { + url: self.url.set().ok_or(ResponseError::from_msg( + format!("Missing field `.remotes.{name}.url`"), + meilisearch_types::error::Code::MissingNetworkUrl, + ))?, + search_api_key: self.search_api_key.set(), + }) + } +} + +#[derive(Serialize)] +pub struct PatchNetworkAnalytics { + network_size: usize, + network_has_self: bool, +} + +impl Aggregate for PatchNetworkAnalytics { + fn event_name(&self) -> &'static str { + "Network Updated" + } + + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { network_size: new.network_size, network_has_self: new.network_has_self }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + +/// Configure Network +/// +/// Add or remove nodes from network. +#[utoipa::path( + patch, + path = "", + tag = "Network", + request_body = Network, + security(("Bearer" = ["network.update", "network.*", "*"])), + responses( + (status = OK, description = "New network state is returned", body = Network, content_type = "application/json", example = json!( + { + "self": "ms-0", + "remotes": { + "ms-0": Remote { url: Setting::Set("http://localhost:7700".into()), search_api_key: Setting::Reset }, + "ms-1": Remote { url: Setting::Set("http://localhost:7701".into()), search_api_key: Setting::Set("foo".into()) }, + "ms-2": Remote { url: Setting::Set("http://localhost:7702".into()), search_api_key: Setting::Set("bar".into()) }, + } + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn patch_network( + index_scheduler: GuardedData, Data>, + new_network: AwebJson, + req: HttpRequest, + analytics: Data, +) -> Result { + index_scheduler.features().check_network("Using the /network route")?; + + let new_network = new_network.0; + let old_network = index_scheduler.network(); + debug!(parameters = ?new_network, "Patch network"); + + let merged_self = match new_network.local { + Setting::Set(new_self) => Some(new_self), + Setting::Reset => None, + Setting::NotSet => old_network.local, + }; + + let merged_remotes = match new_network.remotes { + Setting::Set(new_remotes) => { + let mut merged_remotes = BTreeMap::new(); + for either_or_both in old_network + .remotes + .into_iter() + .merge_join_by(new_remotes.into_iter(), |left, right| left.0.cmp(&right.0)) + { + match either_or_both { + EitherOrBoth::Both((key, old), (_, Some(new))) => { + let DbRemote { url: old_url, search_api_key: old_search_api_key } = old; + + let Remote { url: new_url, search_api_key: new_search_api_key } = new; + + let merged = DbRemote { + url: match new_url { + Setting::Set(new_url) => new_url, + Setting::Reset => { + return Err(ResponseError::from_msg( + format!( + "Field `.remotes.{key}.url` cannot be set to `null`" + ), + meilisearch_types::error::Code::InvalidNetworkUrl, + )) + } + Setting::NotSet => old_url, + }, + search_api_key: match new_search_api_key { + Setting::Set(new_search_api_key) => Some(new_search_api_key), + Setting::Reset => None, + Setting::NotSet => old_search_api_key, + }, + }; + merged_remotes.insert(key, merged); + } + 
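+                        // The remaining arms below cover the other merge cases: an explicit
+                        // `null` in the payload drops the remote, an untouched existing remote
+                        // is kept as-is, and a brand-new entry is validated through
+                        // `try_into_db_node` before being inserted.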
EitherOrBoth::Both((_, _), (_, None)) | EitherOrBoth::Right((_, None)) => {} + EitherOrBoth::Left((key, node)) => { + merged_remotes.insert(key, node); + } + EitherOrBoth::Right((key, Some(node))) => { + let node = node.try_into_db_node(&key)?; + merged_remotes.insert(key, node); + } + } + } + merged_remotes + } + Setting::Reset => BTreeMap::new(), + Setting::NotSet => old_network.remotes, + }; + + analytics.publish( + PatchNetworkAnalytics { + network_size: merged_remotes.len(), + network_has_self: merged_self.is_some(), + }, + &req, + ); + + let merged_network = DbNetwork { local: merged_self, remotes: merged_remotes }; + index_scheduler.put_network(merged_network.clone())?; + debug!(returns = ?merged_network, "Patch network"); + Ok(HttpResponse::Ok().json(merged_network)) +} diff --git a/crates/meilisearch/src/routes/open_api_utils.rs b/crates/meilisearch/src/routes/open_api_utils.rs new file mode 100644 index 000000000..89a3ef76a --- /dev/null +++ b/crates/meilisearch/src/routes/open_api_utils.rs @@ -0,0 +1,24 @@ +use serde::Serialize; +use utoipa::openapi::security::{HttpAuthScheme, HttpBuilder, SecurityScheme}; + +#[derive(Debug, Serialize)] +pub struct OpenApiAuth; + +impl utoipa::Modify for OpenApiAuth { + fn modify(&self, openapi: &mut utoipa::openapi::OpenApi) { + if let Some(schema) = openapi.components.as_mut() { + schema.add_security_scheme( + "Bearer", + SecurityScheme::Http( + HttpBuilder::new() + .scheme(HttpAuthScheme::Bearer) + .bearer_format("Uuidv4, string or JWT") + .description(Some( +"An API key is a token that you provide when making API calls. Include the token in a header parameter called `Authorization`. +Example: `Authorization: Bearer 8fece4405662dd830e4cb265e7e047aab2e79672a760a12712d2a263c9003509`")) + .build(), + ), + ); + } + } +} diff --git a/crates/meilisearch/src/routes/snapshot.rs b/crates/meilisearch/src/routes/snapshot.rs index cacbc41af..de7ecc37f 100644 --- a/crates/meilisearch/src/routes/snapshot.rs +++ b/crates/meilisearch/src/routes/snapshot.rs @@ -4,6 +4,7 @@ use index_scheduler::IndexScheduler; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; use tracing::debug; +use utoipa::OpenApi; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; @@ -12,12 +13,55 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; +#[derive(OpenApi)] +#[openapi( + paths(create_snapshot), + tags(( + name = "Snapshots", + description = "The snapshots route allows the creation of database snapshots. Snapshots are .snapshot files that can be used to launch Meilisearch. +Creating a snapshot is also referred to as exporting it, whereas launching Meilisearch with a snapshot is referred to as importing it. +During a snapshot export, all indexes of the current instance are exported—together with their documents and settings—and saved as a single .snapshot file. +During a snapshot import, all indexes contained in the indicated .snapshot file are imported along with their associated documents and settings. 
+Snapshot imports are performed at launch using an option.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/snapshots"), + )), +)] +pub struct SnapshotApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); } crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created"); +/// Create a snapshot +/// +/// Triggers a snapshot creation process. Once the process is complete, a snapshot is created in the snapshot directory. If the snapshot directory does not exist yet, it will be created. +#[utoipa::path( + post, + path = "", + tag = "Snapshots", + security(("Bearer" = ["snapshots.create", "snapshots.*", "*"])), + responses( + (status = 202, description = "Snapshot is being created", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 0, + "indexUid": null, + "status": "enqueued", + "type": "snapshotCreation", + "enqueuedAt": "2021-01-01T09:39:00.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn create_snapshot( index_scheduler: GuardedData, Data>, req: HttpRequest, diff --git a/crates/meilisearch/src/routes/swap_indexes.rs b/crates/meilisearch/src/routes/swap_indexes.rs index 9b8b67e63..4a35d1a6d 100644 --- a/crates/meilisearch/src/routes/swap_indexes.rs +++ b/crates/meilisearch/src/routes/swap_indexes.rs @@ -9,6 +9,7 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde::Serialize; +use utoipa::{OpenApi, ToSchema}; use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::{Aggregate, Analytics}; @@ -18,13 +19,18 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::Opt; +#[derive(OpenApi)] +#[openapi(paths(swap_indexes))] +pub struct SwapIndexesApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(swap_indexes)))); } -#[derive(Deserr, Debug, Clone, PartialEq, Eq)] +#[derive(Deserr, Debug, Clone, PartialEq, Eq, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct SwapIndexesPayload { + /// Array of the two indexUids to be swapped #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_swap_indexes)] indexes: Vec, } @@ -50,6 +56,37 @@ impl Aggregate for IndexSwappedAnalytics { } } +/// Swap indexes +/// +/// Swap the documents, settings, and task history of two or more indexes. You can only swap indexes in pairs. However, a single request can swap as many index pairs as you wish. +/// Swapping indexes is an atomic transaction: either all indexes are successfully swapped, or none are. +/// Swapping indexA and indexB will also replace every mention of indexA by indexB and vice-versa in the task history. enqueued tasks are left unmodified. 
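+// Illustrative request body (assumed values, based on `SwapIndexesPayload` above):
+//   [ { "indexes": ["indexA", "indexB"] }, { "indexes": ["indexX", "indexY"] } ]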
+#[utoipa::path( + post, + path = "", + tag = "Indexes", + security(("Bearer" = ["search", "*"])), + request_body = Vec, + responses( + (status = OK, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 3, + "indexUid": null, + "status": "enqueued", + "type": "indexSwap", + "enqueuedAt": "2021-08-12T10:00:00.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] pub async fn swap_indexes( index_scheduler: GuardedData, Data>, params: AwebJson, DeserrJsonError>, diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs index cd82a6a18..95c105894 100644 --- a/crates/meilisearch/src/routes/tasks.rs +++ b/crates/meilisearch/src/routes/tasks.rs @@ -1,3 +1,5 @@ +use std::io::ErrorKind; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebQueryParameter; @@ -16,7 +18,9 @@ use serde::Serialize; use time::format_description::well_known::Rfc3339; use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; +use tokio::io::AsyncReadExt; use tokio::task; +use utoipa::{IntoParams, OpenApi, ToSchema}; use super::{get_task_id, is_dry_run, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; use crate::analytics::{Aggregate, AggregateMethod, Analytics}; @@ -25,6 +29,17 @@ use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::{aggregate_methods, Opt}; +#[derive(OpenApi)] +#[openapi( + paths(get_tasks, delete_tasks, cancel_tasks, get_task), + tags(( + name = "Tasks", + description = "The tasks route gives information about the progress of the [asynchronous operations](https://docs.meilisearch.com/learn/advanced/asynchronous_operations.html).", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/tasks"), + )), +)] +pub struct TaskApi; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("") @@ -32,44 +47,79 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .route(web::delete().to(SeqHandler(delete_tasks))), ) .service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks)))) - .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))); + .service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task)))) + .service( + web::resource("/{task_id}/documents") + .route(web::get().to(SeqHandler(get_task_documents_file))), + ); } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct TasksFilterQuery { + /// Maximum number of results to return. #[deserr(default = Param(PAGINATION_DEFAULT_LIMIT as u32), error = DeserrQueryParamError)] + #[param(required = false, value_type = u32, example = 12, default = json!(PAGINATION_DEFAULT_LIMIT))] pub limit: Param, + /// Fetch the next set of results from the given uid. 
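+    // Illustrative query string (assumed values) built from the filters declared in this
+    // struct: GET /tasks?statuses=succeeded,failed&types=documentAdditionOrUpdate&limit=20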
#[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option, example = 12421)] pub from: Option>, + /// The order you want to retrieve the objects. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option, example = true)] pub reverse: Option>, + /// Permits to filter tasks by their batch uid. By default, when the `batchUids` query parameter is not set, all task uids are returned. It's possible to specify several batch uids by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option, example = 12421)] pub batch_uids: OptionStarOrList, + /// Permits to filter tasks by their uid. By default, when the uids query parameter is not set, all task uids are returned. It's possible to specify several uids by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([231, 423, 598, "*"]))] pub uids: OptionStarOrList, + /// Permits to filter tasks using the uid of the task that canceled them. It's possible to specify several task uids by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([374, "*"]))] pub canceled_by: OptionStarOrList, + /// Permits to filter tasks by their related type. By default, when `types` query parameter is not set, all task types are returned. It's possible to specify several types by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([Kind::DocumentAdditionOrUpdate, "*"]))] pub types: OptionStarOrList, + /// Permits to filter tasks by their status. By default, when `statuses` query parameter is not set, all task statuses are returned. It's possible to specify several statuses by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([Status::Succeeded, Status::Failed, Status::Canceled, Status::Enqueued, Status::Processing, "*"]))] pub statuses: OptionStarOrList, + /// Permits to filter tasks by their related index. By default, when `indexUids` query parameter is not set, the tasks of all the indexes are returned. It is possible to specify several indexes by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!(["movies", "theater", "*"]))] pub index_uids: OptionStarOrList, + /// Permits to filter tasks based on their enqueuedAt time. Matches tasks enqueued after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_enqueued_at: OptionStarOr, + /// Permits to filter tasks based on their enqueuedAt time. Matches tasks enqueued before the given date. Supports RFC 3339 date format. 
#[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_enqueued_at: OptionStarOr, + /// Permits to filter tasks based on their startedAt time. Matches tasks started after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_started_at: OptionStarOr, + /// Permits to filter tasks based on their startedAt time. Matches tasks started before the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_started_at: OptionStarOr, + /// Permits to filter tasks based on their finishedAt time. Matches tasks finished after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_finished_at: OptionStarOr, + /// Permits to filter tasks based on their finishedAt time. Matches tasks finished before the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_finished_at: OptionStarOr, } @@ -96,7 +146,7 @@ impl TasksFilterQuery { } impl TaskDeletionOrCancelationQuery { - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { matches!( self, TaskDeletionOrCancelationQuery { @@ -117,33 +167,58 @@ impl TaskDeletionOrCancelationQuery { } } -#[derive(Debug, Deserr)] +#[derive(Debug, Deserr, IntoParams)] #[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)] +#[into_params(rename_all = "camelCase", parameter_in = Query)] pub struct TaskDeletionOrCancelationQuery { + /// Permits to filter tasks by their uid. By default, when the `uids` query parameter is not set, all task uids are returned. It's possible to specify several uids by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] - pub uids: OptionStarOrList, + #[param(required = false, value_type = Option>, example = json!([231, 423, 598, "*"]))] + pub uids: OptionStarOrList, + /// Lets you filter tasks by their `batchUid`. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([231, 423, 598, "*"]))] pub batch_uids: OptionStarOrList, + /// Permits to filter tasks using the uid of the task that canceled them. It's possible to specify several task uids by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] - pub canceled_by: OptionStarOrList, + #[param(required = false, value_type = Option>, example = json!([374, "*"]))] + pub canceled_by: OptionStarOrList, + /// Permits to filter tasks by their related type. By default, when `types` query parameter is not set, all task types are returned. 
It's possible to specify several types by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([Kind::DocumentDeletion, "*"]))] pub types: OptionStarOrList, + /// Permits to filter tasks by their status. By default, when `statuses` query parameter is not set, all task statuses are returned. It's possible to specify several statuses by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!([Status::Succeeded, Status::Failed, Status::Canceled, "*"]))] pub statuses: OptionStarOrList, + /// Permits to filter tasks by their related index. By default, when `indexUids` query parameter is not set, the tasks of all the indexes are returned. It is possible to specify several indexes by separating them with the `,` character. #[deserr(default, error = DeserrQueryParamError)] + #[param(required = false, value_type = Option>, example = json!(["movies", "theater", "*"]))] pub index_uids: OptionStarOrList, + /// Permits to filter tasks based on their enqueuedAt time. Matches tasks enqueued after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_enqueued_at: OptionStarOr, + /// Permits to filter tasks based on their enqueuedAt time. Matches tasks enqueued before the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_enqueued_at: OptionStarOr, + /// Permits to filter tasks based on their startedAt time. Matches tasks started after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_started_at: OptionStarOr, + /// Permits to filter tasks based on their startedAt time. Matches tasks started before the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_started_at: OptionStarOr, + /// Permits to filter tasks based on their finishedAt time. Matches tasks finished after the given date. Supports RFC 3339 date format. #[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_after -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub after_finished_at: OptionStarOr, + /// Permits to filter tasks based on their finishedAt time. Matches tasks finished before the given date. Supports RFC 3339 date format. 
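+    // Illustrative usage (assumed values): the same filters apply to both
+    // `DELETE /tasks?uids=1,2,3` and `POST /tasks/cancel?statuses=enqueued,processing`.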
#[deserr(default, error = DeserrQueryParamError, try_from(OptionStarOr) = deserialize_date_before -> InvalidTaskDateError)] + #[param(required = false, value_type = Option, example = json!(["2024-08-08T16:37:09.971Z", "*"]))] pub before_finished_at: OptionStarOr, } @@ -226,6 +301,51 @@ impl Aggregate for TaskFilterAnalytics, Data>, params: AwebQueryParameter, @@ -260,11 +380,8 @@ async fn cancel_tasks( let query = params.into_query(); - let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes( - &index_scheduler.read_txn()?, - &query, - index_scheduler.filters(), - )?; + let (tasks, _) = + index_scheduler.get_task_ids_from_authorized_indexes(&query, index_scheduler.filters())?; let task_cancelation = KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks }; @@ -278,6 +395,51 @@ async fn cancel_tasks( Ok(HttpResponse::Ok().json(task)) } +/// Delete tasks +/// +/// Delete [tasks](https://docs.meilisearch.com/learn/advanced/asynchronous_operations.html) on filter +#[utoipa::path( + delete, + path = "", + tag = "Tasks", + security(("Bearer" = ["tasks.delete", "tasks.*", "*"])), + params(TaskDeletionOrCancelationQuery), + responses( + (status = 200, description = "Task successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( + { + "taskUid": 147, + "indexUid": null, + "status": "enqueued", + "type": "taskDeletion", + "enqueuedAt": "2024-08-08T17:05:55.791772Z" + } + )), + (status = 400, description = "A filter is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Query parameters to filter the tasks to delete are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.", + "code": "missing_task_filters", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_task_filters" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + (status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "Task :taskUid not found.", + "code": "task_not_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors/#task_not_found" + } + )) + ) +)] async fn delete_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, @@ -312,11 +474,8 @@ async fn delete_tasks( let query = params.into_query(); - let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes( - &index_scheduler.read_txn()?, - &query, - index_scheduler.filters(), - )?; + let (tasks, _) = + index_scheduler.get_task_ids_from_authorized_indexes(&query, index_scheduler.filters())?; let task_deletion = KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks }; @@ -329,15 +488,69 @@ async fn delete_tasks( Ok(HttpResponse::Ok().json(task)) } -#[derive(Debug, Serialize)] +#[derive(Debug, Serialize, ToSchema)] pub struct AllTasks { + /// The list of tasks that matched the filter. 
results: Vec, + /// Total number of browsable results using offset/limit parameters for the given resource. total: u64, + /// Limit given for the query. If limit is not provided as a query parameter, this parameter displays the default limit value. limit: u32, + /// The first task uid returned. from: Option, + /// Represents the value to send in from to fetch the next slice of the results. The first item for the next slice starts at this exact number. When the returned value is null, it means that all the data have been browsed in the given order. next: Option, } +/// Get all tasks +/// +/// Get all [tasks](https://docs.meilisearch.com/learn/advanced/asynchronous_operations.html) +#[utoipa::path( + get, + path = "", + tag = "Tasks", + security(("Bearer" = ["tasks.get", "tasks.*", "*"])), + params(TasksFilterQuery), + responses( + (status = 200, description = "Get all tasks", body = AllTasks, content_type = "application/json", example = json!( + { + "results": [ + { + "uid": 144, + "indexUid": "mieli", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "settings": { + "filterableAttributes": [ + "play_count" + ] + } + }, + "error": null, + "duration": "PT0.009330S", + "enqueuedAt": "2024-08-08T09:01:13.348471Z", + "startedAt": "2024-08-08T09:01:13.349442Z", + "finishedAt": "2024-08-08T09:01:13.358772Z" + } + ], + "total": 1, + "limit": 1, + "from": 144, + "next": null + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] async fn get_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, @@ -349,7 +562,7 @@ async fn get_tasks( let query = params.into_query(); let filters = index_scheduler.filters(); - let (tasks, total) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?; + let (tasks, total) = index_scheduler.get_tasks_from_authorized_indexes(&query, filters)?; let mut results: Vec<_> = tasks.iter().map(TaskView::from_task).collect(); // If we were able to fetch the number +1 tasks we asked @@ -362,6 +575,52 @@ async fn get_tasks( Ok(HttpResponse::Ok().json(tasks)) } +/// Get a task +/// +/// Get a [task](https://www.meilisearch.com/docs/learn/async/asynchronous_operations) +#[utoipa::path( + get, + path = "/{taskUid}", + tag = "Tasks", + security(("Bearer" = ["tasks.get", "tasks.*", "*"])), + params(("taskUid", format = UInt32, example = 0, description = "The task identifier", nullable = false)), + responses( + (status = 200, description = "Task successfully retrieved", body = TaskView, content_type = "application/json", example = json!( + { + "uid": 1, + "indexUid": "movies", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 79000, + "indexedDocuments": 79000 + }, + "error": null, + "duration": "PT1S", + "enqueuedAt": "2021-01-01T09:39:00.000000Z", + "startedAt": "2021-01-01T09:39:01.000000Z", + "finishedAt": "2021-01-01T09:39:02.000000Z" + } + )), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.",
+                "code": "missing_authorization_header",
+                "type": "auth",
+                "link": "https://docs.meilisearch.com/errors#missing_authorization_header"
+            }
+        )),
+        (status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
+            {
+                "message": "Task :taskUid not found.",
+                "code": "task_not_found",
+                "type": "invalid_request",
+                "link": "https://docs.meilisearch.com/errors/#task_not_found"
+            }
+        ))
+    )
+)]
 async fn get_task(
     index_scheduler: GuardedData, Data>,
     task_uid: web::Path,
@@ -377,7 +636,7 @@ async fn get_task(
     let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
     let filters = index_scheduler.filters();
-    let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?;
+    let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(&query, filters)?;
 
     if let Some(task) = tasks.first() {
         let task_view = TaskView::from_task(task);
@@ -387,6 +646,76 @@
     }
 }
 
+/// Get a task's documents.
+///
+/// Get a [task's documents file](https://www.meilisearch.com/docs/learn/async/asynchronous_operations).
+#[utoipa::path(
+    get,
+    path = "/{taskUid}/documents",
+    tag = "Tasks",
+    security(("Bearer" = ["tasks.get", "tasks.*", "*"])),
+    params(("taskUid", format = UInt32, example = 0, description = "The task identifier", nullable = false)),
+    responses(
+        (status = 200, description = "The content of the task update", body = serde_json::Value, content_type = "application/x-ndjson"),
+        (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
+            {
+                "message": "The Authorization header is missing. It must use the bearer authorization method.",
+                "code": "missing_authorization_header",
+                "type": "auth",
+                "link": "https://docs.meilisearch.com/errors#missing_authorization_header"
+            }
+        )),
+        (status = 404, description = "The task uid does not exist", body = ResponseError, content_type = "application/json", example = json!(
+            {
+                "message": "Task :taskUid not found.",
+                "code": "task_not_found",
+                "type": "invalid_request",
+                "link": "https://docs.meilisearch.com/errors/#task_not_found"
+            }
+        ))
+    )
+)]
+async fn get_task_documents_file(
+    index_scheduler: GuardedData, Data>,
+    task_uid: web::Path,
+) -> Result {
+    index_scheduler.features().check_get_task_documents_route()?;
+    let task_uid_string = task_uid.into_inner();
+
+    let task_uid: TaskId = match task_uid_string.parse() {
+        Ok(id) => id,
+        Err(_e) => {
+            return Err(index_scheduler::Error::InvalidTaskUid { task_uid: task_uid_string }.into())
+        }
+    };
+
+    let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
+    let filters = index_scheduler.filters();
+    let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(&query, filters)?;
+
+    if let Some(task) = tasks.first() {
+        match task.content_uuid() {
+            Some(uuid) => {
+                let mut tfile = match index_scheduler.queue.update_file(uuid) {
+                    Ok(file) => tokio::fs::File::from_std(file),
+                    Err(file_store::Error::IoError(e)) if e.kind() == ErrorKind::NotFound => {
+                        return Err(index_scheduler::Error::TaskFileNotFound(task_uid).into())
+                    }
+                    Err(e) => return Err(e.into()),
+                };
+                // Yes, that's awful to put everything in memory when we could have streamed it from
+                // disk but it's really (really) complex to do with the current state of async Rust.
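+                // Read the whole update file (the ndjson payload originally sent with the
+                // task) into a String and return it verbatim as `application/x-ndjson`.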
+ let mut content = String::new(); + tfile.read_to_string(&mut content).await?; + Ok(HttpResponse::Ok().content_type("application/x-ndjson").body(content)) + } + None => Err(index_scheduler::Error::TaskFileNotFound(task_uid).into()), + } + } else { + Err(index_scheduler::Error::TaskNotFound(task_uid).into()) + } +} + pub enum DeserializeDateOption { Before, After, @@ -431,356 +760,3 @@ pub fn deserialize_date_before( ) -> std::result::Result, InvalidTaskDateError> { value.try_map(|x| deserialize_date(&x, DeserializeDateOption::Before)) } - -#[cfg(test)] -mod tests { - use deserr::Deserr; - use meili_snap::snapshot; - use meilisearch_types::deserr::DeserrQueryParamError; - use meilisearch_types::error::{Code, ResponseError}; - - use crate::routes::tasks::{TaskDeletionOrCancelationQuery, TasksFilterQuery}; - - fn deserr_query_params(j: &str) -> Result - where - T: Deserr, - { - let value = serde_urlencoded::from_str::(j) - .map_err(|e| ResponseError::from_msg(e.to_string(), Code::BadRequest))?; - - match deserr::deserialize::<_, _, DeserrQueryParamError>(value) { - Ok(data) => Ok(data), - Err(e) => Err(ResponseError::from(e)), - } - } - - #[test] - fn deserialize_task_filter_dates() { - { - let params = "afterEnqueuedAt=2021-12-03&beforeEnqueuedAt=2021-12-03&afterStartedAt=2021-12-03&beforeStartedAt=2021-12-03&afterFinishedAt=2021-12-03&beforeFinishedAt=2021-12-03"; - let query = deserr_query_params::(params).unwrap(); - - snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); - snapshot!(format!("{:?}", query.before_enqueued_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); - snapshot!(format!("{:?}", query.after_started_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); - snapshot!(format!("{:?}", query.before_started_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); - snapshot!(format!("{:?}", query.after_finished_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); - snapshot!(format!("{:?}", query.before_finished_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); - } - { - let params = - "afterEnqueuedAt=2021-12-03T23:45:23Z&beforeEnqueuedAt=2021-12-03T23:45:23Z"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(2021-12-03 23:45:23.0 +00:00:00)"); - snapshot!(format!("{:?}", query.before_enqueued_at), @"Other(2021-12-03 23:45:23.0 +00:00:00)"); - } - { - let params = "afterEnqueuedAt=1997-11-12T09:55:06-06:20"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(1997-11-12 9:55:06.0 -06:20:00)"); - } - { - let params = "afterEnqueuedAt=1997-11-12T09:55:06%2B00:00"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(1997-11-12 9:55:06.0 +00:00:00)"); - } - { - let params = "afterEnqueuedAt=1997-11-12T09:55:06.200000300Z"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(1997-11-12 9:55:06.2000003 +00:00:00)"); - } - { - // Stars are allowed in date fields as well - let params = "afterEnqueuedAt=*&beforeStartedAt=*&afterFinishedAt=*&beforeFinishedAt=*&afterStartedAt=*&beforeEnqueuedAt=*"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query), @"TaskDeletionOrCancelationQuery { uids: None, batch_uids: None, canceled_by: None, types: None, statuses: None, index_uids: None, after_enqueued_at: Star, before_enqueued_at: Star, after_started_at: Star, before_started_at: Star, 
after_finished_at: Star, before_finished_at: Star }"); - } - { - let params = "afterFinishedAt=2021"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `afterFinishedAt`: `2021` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_after_finished_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_after_finished_at" - } - "###); - } - { - let params = "beforeFinishedAt=2021"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `beforeFinishedAt`: `2021` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_before_finished_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_before_finished_at" - } - "###); - } - { - let params = "afterEnqueuedAt=2021-12"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `afterEnqueuedAt`: `2021-12` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_after_enqueued_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_after_enqueued_at" - } - "###); - } - - { - let params = "beforeEnqueuedAt=2021-12-03T23"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `beforeEnqueuedAt`: `2021-12-03T23` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_before_enqueued_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_before_enqueued_at" - } - "###); - } - { - let params = "afterStartedAt=2021-12-03T23:45"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `afterStartedAt`: `2021-12-03T23:45` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_after_started_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_after_started_at" - } - "###); - } - { - let params = "beforeStartedAt=2021-12-03T23:45"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `beforeStartedAt`: `2021-12-03T23:45` is an invalid date-time. 
It should follow the YYYY-MM-DD or RFC 3339 date-time format.", - "code": "invalid_task_before_started_at", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_before_started_at" - } - "###); - } - } - - #[test] - fn deserialize_task_filter_uids() { - { - let params = "uids=78,1,12,73"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.uids), @"List([78, 1, 12, 73])"); - } - { - let params = "uids=1"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.uids), @"List([1])"); - } - { - let params = "uids=cat,*,dog"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `uids[0]`: could not parse `cat` as a positive integer", - "code": "invalid_task_uids", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_uids" - } - "###); - } - { - let params = "uids=78,hello,world"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `uids[1]`: could not parse `hello` as a positive integer", - "code": "invalid_task_uids", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_uids" - } - "###); - } - { - let params = "uids=cat"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `uids`: could not parse `cat` as a positive integer", - "code": "invalid_task_uids", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_uids" - } - "###); - } - } - - #[test] - fn deserialize_task_filter_status() { - { - let params = "statuses=succeeded,failed,enqueued,processing,canceled"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.statuses), @"List([Succeeded, Failed, Enqueued, Processing, Canceled])"); - } - { - let params = "statuses=enqueued"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.statuses), @"List([Enqueued])"); - } - { - let params = "statuses=finished"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `statuses`: `finished` is not a valid task status. 
Available statuses are `enqueued`, `processing`, `succeeded`, `failed`, `canceled`.", - "code": "invalid_task_statuses", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_statuses" - } - "###); - } - } - #[test] - fn deserialize_task_filter_types() { - { - let params = "types=documentAdditionOrUpdate,documentDeletion,settingsUpdate,indexCreation,indexDeletion,indexUpdate,indexSwap,taskCancelation,taskDeletion,dumpCreation,snapshotCreation"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.types), @"List([DocumentAdditionOrUpdate, DocumentDeletion, SettingsUpdate, IndexCreation, IndexDeletion, IndexUpdate, IndexSwap, TaskCancelation, TaskDeletion, DumpCreation, SnapshotCreation])"); - } - { - let params = "types=settingsUpdate"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.types), @"List([SettingsUpdate])"); - } - { - let params = "types=createIndex"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", - "code": "invalid_task_types", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_types" - } - "###); - } - } - #[test] - fn deserialize_task_filter_index_uids() { - { - let params = "indexUids=toto,tata-78"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.index_uids), @r###"List([IndexUid("toto"), IndexUid("tata-78")])"###); - } - { - let params = "indexUids=index_a"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query.index_uids), @r###"List([IndexUid("index_a")])"###); - } - { - let params = "indexUids=1,hĆ©"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `indexUids[1]`: `hĆ©` is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", - "code": "invalid_index_uid", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_index_uid" - } - "###); - } - { - let params = "indexUids=hĆ©"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `indexUids`: `hĆ©` is not a valid index uid. 
Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", - "code": "invalid_index_uid", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_index_uid" - } - "###); - } - } - - #[test] - fn deserialize_task_filter_general() { - { - let params = "from=12&limit=15&indexUids=toto,tata-78&statuses=succeeded,enqueued&afterEnqueuedAt=2012-04-23&uids=1,2,3"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query), @r###"TasksFilterQuery { limit: Param(15), from: Some(Param(12)), reverse: None, batch_uids: None, uids: List([1, 2, 3]), canceled_by: None, types: None, statuses: List([Succeeded, Enqueued]), index_uids: List([IndexUid("toto"), IndexUid("tata-78")]), after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"###); - } - { - // Stars should translate to `None` in the query - // Verify value of the default limit - let params = "indexUids=*&statuses=succeeded,*&afterEnqueuedAt=2012-04-23&uids=1,2,3"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query), @"TasksFilterQuery { limit: Param(20), from: None, reverse: None, batch_uids: None, uids: List([1, 2, 3]), canceled_by: None, types: None, statuses: Star, index_uids: Star, after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); - } - { - // Stars should also translate to `None` in task deletion/cancelation queries - let params = "indexUids=*&statuses=succeeded,*&afterEnqueuedAt=2012-04-23&uids=1,2,3"; - let query = deserr_query_params::(params).unwrap(); - snapshot!(format!("{:?}", query), @"TaskDeletionOrCancelationQuery { uids: List([1, 2, 3]), batch_uids: None, canceled_by: None, types: None, statuses: Star, index_uids: Star, after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); - } - { - // Star in from not allowed - let params = "uids=*&from=*"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Invalid value in parameter `from`: could not parse `*` as a positive integer", - "code": "invalid_task_from", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_task_from" - } - "###); - } - { - // From not allowed in task deletion/cancelation queries - let params = "from=12"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Unknown parameter `from`: expected one of `uids`, `batchUids`, `canceledBy`, `types`, `statuses`, `indexUids`, `afterEnqueuedAt`, `beforeEnqueuedAt`, `afterStartedAt`, `beforeStartedAt`, `afterFinishedAt`, `beforeFinishedAt`", - "code": "bad_request", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#bad_request" - } - "###); - } - { - // Limit not allowed in task deletion/cancelation queries - let params = "limit=12"; - let err = deserr_query_params::(params).unwrap_err(); - snapshot!(meili_snap::json_string!(err), @r###" - { - "message": "Unknown parameter `limit`: expected one of `uids`, `batchUids`, `canceledBy`, `types`, `statuses`, 
`indexUids`, `afterEnqueuedAt`, `beforeEnqueuedAt`, `afterStartedAt`, `beforeStartedAt`, `afterFinishedAt`, `beforeFinishedAt`", - "code": "bad_request", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#bad_request" - } - "###); - } - } - - #[test] - fn deserialize_task_delete_or_cancel_empty() { - { - let params = ""; - let query = deserr_query_params::(params).unwrap(); - assert!(query.is_empty()); - } - { - let params = "statuses=*"; - let query = deserr_query_params::(params).unwrap(); - assert!(!query.is_empty()); - snapshot!(format!("{query:?}"), @"TaskDeletionOrCancelationQuery { uids: None, batch_uids: None, canceled_by: None, types: None, statuses: Star, index_uids: None, after_enqueued_at: None, before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); - } - } -} diff --git a/crates/meilisearch/src/routes/tasks_test.rs b/crates/meilisearch/src/routes/tasks_test.rs new file mode 100644 index 000000000..a17b80c82 --- /dev/null +++ b/crates/meilisearch/src/routes/tasks_test.rs @@ -0,0 +1,352 @@ +#[cfg(test)] +mod tests { + use deserr::Deserr; + use meili_snap::snapshot; + use meilisearch_types::deserr::DeserrQueryParamError; + use meilisearch_types::error::{Code, ResponseError}; + + use crate::routes::tasks::{TaskDeletionOrCancelationQuery, TasksFilterQuery}; + + fn deserr_query_params(j: &str) -> Result + where + T: Deserr, + { + let value = serde_urlencoded::from_str::(j) + .map_err(|e| ResponseError::from_msg(e.to_string(), Code::BadRequest))?; + + match deserr::deserialize::<_, _, DeserrQueryParamError>(value) { + Ok(data) => Ok(data), + Err(e) => Err(ResponseError::from(e)), + } + } + + #[test] + fn deserialize_task_filter_dates() { + { + let params = "afterEnqueuedAt=2021-12-03&beforeEnqueuedAt=2021-12-03&afterStartedAt=2021-12-03&beforeStartedAt=2021-12-03&afterFinishedAt=2021-12-03&beforeFinishedAt=2021-12-03"; + let query = deserr_query_params::(params).unwrap(); + + snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); + snapshot!(format!("{:?}", query.before_enqueued_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); + snapshot!(format!("{:?}", query.after_started_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); + snapshot!(format!("{:?}", query.before_started_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); + snapshot!(format!("{:?}", query.after_finished_at), @"Other(2021-12-04 0:00:00.0 +00:00:00)"); + snapshot!(format!("{:?}", query.before_finished_at), @"Other(2021-12-03 0:00:00.0 +00:00:00)"); + } + { + let params = + "afterEnqueuedAt=2021-12-03T23:45:23Z&beforeEnqueuedAt=2021-12-03T23:45:23Z"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(2021-12-03 23:45:23.0 +00:00:00)"); + snapshot!(format!("{:?}", query.before_enqueued_at), @"Other(2021-12-03 23:45:23.0 +00:00:00)"); + } + { + let params = "afterEnqueuedAt=1997-11-12T09:55:06-06:20"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(1997-11-12 9:55:06.0 -06:20:00)"); + } + { + let params = "afterEnqueuedAt=1997-11-12T09:55:06%2B00:00"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.after_enqueued_at), @"Other(1997-11-12 9:55:06.0 +00:00:00)"); + } + { + let params = "afterEnqueuedAt=1997-11-12T09:55:06.200000300Z"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", 
query.after_enqueued_at), @"Other(1997-11-12 9:55:06.2000003 +00:00:00)"); + } + { + // Stars are allowed in date fields as well + let params = "afterEnqueuedAt=*&beforeStartedAt=*&afterFinishedAt=*&beforeFinishedAt=*&afterStartedAt=*&beforeEnqueuedAt=*"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query), @"TaskDeletionOrCancelationQuery { uids: None, batch_uids: None, canceled_by: None, types: None, statuses: None, index_uids: None, after_enqueued_at: Star, before_enqueued_at: Star, after_started_at: Star, before_started_at: Star, after_finished_at: Star, before_finished_at: Star }"); + } + { + let params = "afterFinishedAt=2021"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `afterFinishedAt`: `2021` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_after_finished_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_after_finished_at" + } + "###); + } + { + let params = "beforeFinishedAt=2021"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `beforeFinishedAt`: `2021` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_before_finished_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_before_finished_at" + } + "###); + } + { + let params = "afterEnqueuedAt=2021-12"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `afterEnqueuedAt`: `2021-12` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_after_enqueued_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_after_enqueued_at" + } + "###); + } + + { + let params = "beforeEnqueuedAt=2021-12-03T23"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `beforeEnqueuedAt`: `2021-12-03T23` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_before_enqueued_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_before_enqueued_at" + } + "###); + } + { + let params = "afterStartedAt=2021-12-03T23:45"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `afterStartedAt`: `2021-12-03T23:45` is an invalid date-time. It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_after_started_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_after_started_at" + } + "###); + } + { + let params = "beforeStartedAt=2021-12-03T23:45"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `beforeStartedAt`: `2021-12-03T23:45` is an invalid date-time. 
It should follow the YYYY-MM-DD or RFC 3339 date-time format.", + "code": "invalid_task_before_started_at", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_before_started_at" + } + "###); + } + } + + #[test] + fn deserialize_task_filter_uids() { + { + let params = "uids=78,1,12,73"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.uids), @"List([78, 1, 12, 73])"); + } + { + let params = "uids=1"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.uids), @"List([1])"); + } + { + let params = "uids=cat,*,dog"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `uids[0]`: could not parse `cat` as a positive integer", + "code": "invalid_task_uids", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_uids" + } + "###); + } + { + let params = "uids=78,hello,world"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `uids[1]`: could not parse `hello` as a positive integer", + "code": "invalid_task_uids", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_uids" + } + "###); + } + { + let params = "uids=cat"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `uids`: could not parse `cat` as a positive integer", + "code": "invalid_task_uids", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_uids" + } + "###); + } + } + + #[test] + fn deserialize_task_filter_status() { + { + let params = "statuses=succeeded,failed,enqueued,processing,canceled"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.statuses), @"List([Succeeded, Failed, Enqueued, Processing, Canceled])"); + } + { + let params = "statuses=enqueued"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.statuses), @"List([Enqueued])"); + } + { + let params = "statuses=finished"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `statuses`: `finished` is not a valid task status. 
Available statuses are `enqueued`, `processing`, `succeeded`, `failed`, `canceled`.", + "code": "invalid_task_statuses", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_statuses" + } + "###); + } + } + #[test] + fn deserialize_task_filter_types() { + { + let params = "types=documentAdditionOrUpdate,documentDeletion,settingsUpdate,indexCreation,indexDeletion,indexUpdate,indexSwap,taskCancelation,taskDeletion,dumpCreation,snapshotCreation"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.types), @"List([DocumentAdditionOrUpdate, DocumentDeletion, SettingsUpdate, IndexCreation, IndexDeletion, IndexUpdate, IndexSwap, TaskCancelation, TaskDeletion, DumpCreation, SnapshotCreation])"); + } + { + let params = "types=settingsUpdate"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.types), @"List([SettingsUpdate])"); + } + { + let params = "types=createIndex"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r#" + { + "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "code": "invalid_task_types", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_types" + } + "#); + } + } + #[test] + fn deserialize_task_filter_index_uids() { + { + let params = "indexUids=toto,tata-78"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.index_uids), @r###"List([IndexUid("toto"), IndexUid("tata-78")])"###); + } + { + let params = "indexUids=index_a"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query.index_uids), @r###"List([IndexUid("index_a")])"###); + } + { + let params = "indexUids=1,hĆ©"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `indexUids[1]`: `hĆ©` is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "code": "invalid_index_uid", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_index_uid" + } + "###); + } + { + let params = "indexUids=hĆ©"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `indexUids`: `hĆ©` is not a valid index uid. 
Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "code": "invalid_index_uid", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_index_uid" + } + "###); + } + } + + #[test] + fn deserialize_task_filter_general() { + { + let params = "from=12&limit=15&indexUids=toto,tata-78&statuses=succeeded,enqueued&afterEnqueuedAt=2012-04-23&uids=1,2,3"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query), @r###"TasksFilterQuery { limit: Param(15), from: Some(Param(12)), reverse: None, batch_uids: None, uids: List([1, 2, 3]), canceled_by: None, types: None, statuses: List([Succeeded, Enqueued]), index_uids: List([IndexUid("toto"), IndexUid("tata-78")]), after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"###); + } + { + // Stars should translate to `None` in the query + // Verify value of the default limit + let params = "indexUids=*&statuses=succeeded,*&afterEnqueuedAt=2012-04-23&uids=1,2,3"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query), @"TasksFilterQuery { limit: Param(20), from: None, reverse: None, batch_uids: None, uids: List([1, 2, 3]), canceled_by: None, types: None, statuses: Star, index_uids: Star, after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); + } + { + // Stars should also translate to `None` in task deletion/cancelation queries + let params = "indexUids=*&statuses=succeeded,*&afterEnqueuedAt=2012-04-23&uids=1,2,3"; + let query = deserr_query_params::(params).unwrap(); + snapshot!(format!("{:?}", query), @"TaskDeletionOrCancelationQuery { uids: List([1, 2, 3]), batch_uids: None, canceled_by: None, types: None, statuses: Star, index_uids: Star, after_enqueued_at: Other(2012-04-24 0:00:00.0 +00:00:00), before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); + } + { + // Star in from not allowed + let params = "uids=*&from=*"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Invalid value in parameter `from`: could not parse `*` as a positive integer", + "code": "invalid_task_from", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_task_from" + } + "###); + } + { + // From not allowed in task deletion/cancelation queries + let params = "from=12"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Unknown parameter `from`: expected one of `uids`, `batchUids`, `canceledBy`, `types`, `statuses`, `indexUids`, `afterEnqueuedAt`, `beforeEnqueuedAt`, `afterStartedAt`, `beforeStartedAt`, `afterFinishedAt`, `beforeFinishedAt`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); + } + { + // Limit not allowed in task deletion/cancelation queries + let params = "limit=12"; + let err = deserr_query_params::(params).unwrap_err(); + snapshot!(meili_snap::json_string!(err), @r###" + { + "message": "Unknown parameter `limit`: expected one of `uids`, `batchUids`, `canceledBy`, `types`, `statuses`, 
`indexUids`, `afterEnqueuedAt`, `beforeEnqueuedAt`, `afterStartedAt`, `beforeStartedAt`, `afterFinishedAt`, `beforeFinishedAt`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); + } + } + + #[test] + fn deserialize_task_delete_or_cancel_empty() { + { + let params = ""; + let query = deserr_query_params::(params).unwrap(); + assert!(query.is_empty()); + } + { + let params = "statuses=*"; + let query = deserr_query_params::(params).unwrap(); + assert!(!query.is_empty()); + snapshot!(format!("{query:?}"), @"TaskDeletionOrCancelationQuery { uids: None, batch_uids: None, canceled_by: None, types: None, statuses: Star, index_uids: None, after_enqueued_at: None, before_enqueued_at: None, after_started_at: None, before_started_at: None, after_finished_at: None, before_finished_at: None }"); + } + } +} diff --git a/crates/meilisearch/src/search/federated.rs b/crates/meilisearch/src/search/federated.rs deleted file mode 100644 index 5279c26bb..000000000 --- a/crates/meilisearch/src/search/federated.rs +++ /dev/null @@ -1,910 +0,0 @@ -use std::cmp::Ordering; -use std::collections::BTreeMap; -use std::fmt; -use std::iter::Zip; -use std::rc::Rc; -use std::str::FromStr as _; -use std::time::Duration; -use std::vec::{IntoIter, Vec}; - -use actix_http::StatusCode; -use index_scheduler::{IndexScheduler, RoFeatures}; -use indexmap::IndexMap; -use meilisearch_types::deserr::DeserrJsonError; -use meilisearch_types::error::deserr_codes::{ - InvalidMultiSearchFacetsByIndex, InvalidMultiSearchMaxValuesPerFacet, - InvalidMultiSearchMergeFacets, InvalidMultiSearchWeight, InvalidSearchLimit, - InvalidSearchOffset, -}; -use meilisearch_types::error::ResponseError; -use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::milli::score_details::{ScoreDetails, ScoreValue}; -use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget}; -use roaring::RoaringBitmap; -use serde::Serialize; - -use super::ranking_rules::{self, RankingRules}; -use super::{ - compute_facet_distribution_stats, prepare_search, AttributesFormat, ComputedFacets, FacetStats, - HitMaker, HitsInfo, RetrieveVectors, SearchHit, SearchKind, SearchQuery, SearchQueryWithIndex, -}; -use crate::error::MeilisearchHttpError; -use crate::routes::indexes::search::search_kind; - -pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0; - -#[derive(Debug, Default, Clone, Copy, PartialEq, deserr::Deserr)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -pub struct FederationOptions { - #[deserr(default, error = DeserrJsonError)] - pub weight: Weight, -} - -#[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] -#[deserr(try_from(f64) = TryFrom::try_from -> InvalidMultiSearchWeight)] -pub struct Weight(f64); - -impl Default for Weight { - fn default() -> Self { - Weight(DEFAULT_FEDERATED_WEIGHT) - } -} - -impl std::convert::TryFrom for Weight { - type Error = InvalidMultiSearchWeight; - - fn try_from(f: f64) -> Result { - if f < 0.0 { - Err(InvalidMultiSearchWeight) - } else { - Ok(Weight(f)) - } - } -} - -impl std::ops::Deref for Weight { - type Target = f64; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -#[derive(Debug, deserr::Deserr)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -pub struct Federation { - #[deserr(default = super::DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] - pub limit: usize, - #[deserr(default = super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] - pub offset: 
usize, - #[deserr(default, error = DeserrJsonError)] - pub facets_by_index: BTreeMap>>, - #[deserr(default, error = DeserrJsonError)] - pub merge_facets: Option, -} - -#[derive(Copy, Clone, Debug, deserr::Deserr, Default)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -pub struct MergeFacets { - #[deserr(default, error = DeserrJsonError)] - pub max_values_per_facet: Option, -} - -#[derive(Debug, deserr::Deserr)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] -pub struct FederatedSearch { - pub queries: Vec, - #[deserr(default)] - pub federation: Option, -} -#[derive(Serialize, Clone)] -#[serde(rename_all = "camelCase")] -pub struct FederatedSearchResult { - pub hits: Vec, - pub processing_time_ms: u128, - #[serde(flatten)] - pub hits_info: HitsInfo, - - #[serde(skip_serializing_if = "Option::is_none")] - pub semantic_hit_count: Option, - - #[serde(skip_serializing_if = "Option::is_none")] - pub facet_distribution: Option>>, - #[serde(skip_serializing_if = "Option::is_none")] - pub facet_stats: Option>, - #[serde(skip_serializing_if = "FederatedFacets::is_empty")] - pub facets_by_index: FederatedFacets, - - // These fields are only used for analytics purposes - #[serde(skip)] - pub degraded: bool, - #[serde(skip)] - pub used_negative_operator: bool, -} - -impl fmt::Debug for FederatedSearchResult { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let FederatedSearchResult { - hits, - processing_time_ms, - hits_info, - semantic_hit_count, - degraded, - used_negative_operator, - facet_distribution, - facet_stats, - facets_by_index, - } = self; - - let mut debug = f.debug_struct("SearchResult"); - // The most important thing when looking at a search result is the time it took to process - debug.field("processing_time_ms", &processing_time_ms); - debug.field("hits", &format!("[{} hits returned]", hits.len())); - debug.field("hits_info", &hits_info); - if *used_negative_operator { - debug.field("used_negative_operator", used_negative_operator); - } - if *degraded { - debug.field("degraded", degraded); - } - if let Some(facet_distribution) = facet_distribution { - debug.field("facet_distribution", &facet_distribution); - } - if let Some(facet_stats) = facet_stats { - debug.field("facet_stats", &facet_stats); - } - if let Some(semantic_hit_count) = semantic_hit_count { - debug.field("semantic_hit_count", &semantic_hit_count); - } - if !facets_by_index.is_empty() { - debug.field("facets_by_index", &facets_by_index); - } - - debug.finish() - } -} - -struct WeightedScore<'a> { - details: &'a [ScoreDetails], - weight: f64, -} - -impl<'a> WeightedScore<'a> { - pub fn new(details: &'a [ScoreDetails], weight: f64) -> Self { - Self { details, weight } - } - - pub fn weighted_global_score(&self) -> f64 { - ScoreDetails::global_score(self.details.iter()) * self.weight - } - - pub fn compare_weighted_global_scores(&self, other: &Self) -> Ordering { - self.weighted_global_score() - .partial_cmp(&other.weighted_global_score()) - // both are numbers, possibly infinite - .unwrap() - } - - pub fn compare(&self, other: &Self) -> Ordering { - let mut left_it = ScoreDetails::score_values(self.details.iter()); - let mut right_it = ScoreDetails::score_values(other.details.iter()); - - loop { - let left = left_it.next(); - let right = right_it.next(); - - match (left, right) { - (None, None) => return Ordering::Equal, - (None, Some(_)) => return Ordering::Less, - (Some(_), None) => return Ordering::Greater, - 
(Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => { - let left = left * self.weight; - let right = right * other.weight; - if (left - right).abs() <= f64::EPSILON { - continue; - } - return left.partial_cmp(&right).unwrap(); - } - (Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => { - match left.partial_cmp(right) { - Some(Ordering::Equal) => continue, - Some(order) => return order, - None => return self.compare_weighted_global_scores(other), - } - } - (Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => { - match left.partial_cmp(right) { - Some(Ordering::Equal) => continue, - Some(order) => return order, - None => { - return self.compare_weighted_global_scores(other); - } - } - } - // not comparable details, use global - (Some(ScoreValue::Score(_)), Some(_)) - | (Some(_), Some(ScoreValue::Score(_))) - | (Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_))) - | (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => { - let left_count = left_it.count(); - let right_count = right_it.count(); - // compare how many remaining groups of rules each side has. - // the group with the most remaining groups wins. - return left_count - .cmp(&right_count) - // breaks ties with the global ranking score - .then_with(|| self.compare_weighted_global_scores(other)); - } - } - } - } -} - -struct QueryByIndex { - query: SearchQuery, - federation_options: FederationOptions, - query_index: usize, -} - -struct SearchResultByQuery<'a> { - documents_ids: Vec, - document_scores: Vec>, - federation_options: FederationOptions, - hit_maker: HitMaker<'a>, - query_index: usize, -} - -struct SearchResultByQueryIter<'a> { - it: Zip, IntoIter>>, - federation_options: FederationOptions, - hit_maker: Rc>, - query_index: usize, -} - -impl<'a> SearchResultByQueryIter<'a> { - fn new( - SearchResultByQuery { - documents_ids, - document_scores, - federation_options, - hit_maker, - query_index, - }: SearchResultByQuery<'a>, - ) -> Self { - let it = documents_ids.into_iter().zip(document_scores); - Self { it, federation_options, hit_maker: Rc::new(hit_maker), query_index } - } -} - -struct SearchResultByQueryIterItem<'a> { - docid: DocumentId, - score: Vec, - federation_options: FederationOptions, - hit_maker: Rc>, - query_index: usize, -} - -fn merge_index_local_results( - results_by_query: Vec>, -) -> impl Iterator + '_ { - itertools::kmerge_by( - results_by_query.into_iter().map(SearchResultByQueryIter::new), - |left: &SearchResultByQueryIterItem, right: &SearchResultByQueryIterItem| { - let left_score = WeightedScore::new(&left.score, *left.federation_options.weight); - let right_score = WeightedScore::new(&right.score, *right.federation_options.weight); - - match left_score.compare(&right_score) { - // the biggest score goes first - Ordering::Greater => true, - // break ties using query index - Ordering::Equal => left.query_index < right.query_index, - Ordering::Less => false, - } - }, - ) -} - -fn merge_index_global_results( - results_by_index: Vec, -) -> impl Iterator { - itertools::kmerge_by( - results_by_index.into_iter().map(|result_by_index| result_by_index.hits.into_iter()), - |left: &SearchHitByIndex, right: &SearchHitByIndex| { - let left_score = WeightedScore::new(&left.score, *left.federation_options.weight); - let right_score = WeightedScore::new(&right.score, *right.federation_options.weight); - - match left_score.compare(&right_score) { - // the biggest score goes first - Ordering::Greater => true, - // break ties using query index - Ordering::Equal => 
left.query_index < right.query_index, - Ordering::Less => false, - } - }, - ) -} - -impl<'a> Iterator for SearchResultByQueryIter<'a> { - type Item = SearchResultByQueryIterItem<'a>; - - fn next(&mut self) -> Option { - let (docid, score) = self.it.next()?; - Some(SearchResultByQueryIterItem { - docid, - score, - federation_options: self.federation_options, - hit_maker: Rc::clone(&self.hit_maker), - query_index: self.query_index, - }) - } -} - -struct SearchHitByIndex { - hit: SearchHit, - score: Vec, - federation_options: FederationOptions, - query_index: usize, -} - -struct SearchResultByIndex { - index: String, - hits: Vec, - estimated_total_hits: usize, - degraded: bool, - used_negative_operator: bool, - facets: Option, -} - -#[derive(Debug, Clone, Default, Serialize)] -pub struct FederatedFacets(pub BTreeMap); - -impl FederatedFacets { - pub fn insert(&mut self, index: String, facets: Option) { - if let Some(facets) = facets { - self.0.insert(index, facets); - } - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - pub fn merge( - self, - MergeFacets { max_values_per_facet }: MergeFacets, - facet_order: BTreeMap, - ) -> Option { - if self.is_empty() { - return None; - } - - let mut distribution: BTreeMap = Default::default(); - let mut stats: BTreeMap = Default::default(); - - for facets_by_index in self.0.into_values() { - for (facet, index_distribution) in facets_by_index.distribution { - match distribution.entry(facet) { - std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(index_distribution); - } - std::collections::btree_map::Entry::Occupied(mut entry) => { - let distribution = entry.get_mut(); - - for (value, index_count) in index_distribution { - distribution - .entry(value) - .and_modify(|count| *count += index_count) - .or_insert(index_count); - } - } - } - } - - for (facet, index_stats) in facets_by_index.stats { - match stats.entry(facet) { - std::collections::btree_map::Entry::Vacant(entry) => { - entry.insert(index_stats); - } - std::collections::btree_map::Entry::Occupied(mut entry) => { - let stats = entry.get_mut(); - - stats.min = f64::min(stats.min, index_stats.min); - stats.max = f64::max(stats.max, index_stats.max); - } - } - } - } - - // fixup order - for (facet, values) in &mut distribution { - let order_by = facet_order.get(facet).map(|(_, order)| *order).unwrap_or_default(); - - match order_by { - OrderBy::Lexicographic => { - values.sort_unstable_by(|left, _, right, _| left.cmp(right)) - } - OrderBy::Count => { - values.sort_unstable_by(|_, left, _, right| { - left.cmp(right) - // biggest first - .reverse() - }) - } - } - - if let Some(max_values_per_facet) = max_values_per_facet { - values.truncate(max_values_per_facet) - }; - } - - Some(ComputedFacets { distribution, stats }) - } -} - -pub fn perform_federated_search( - index_scheduler: &IndexScheduler, - queries: Vec, - mut federation: Federation, - features: RoFeatures, -) -> Result { - let before_search = std::time::Instant::now(); - - // this implementation partition the queries by index to guarantee an important property: - // - all the queries to a particular index use the same read transaction. - // This is an important property, otherwise we cannot guarantee the self-consistency of the results. - - // 1. 
partition queries by index - let mut queries_by_index: BTreeMap> = Default::default(); - for (query_index, federated_query) in queries.into_iter().enumerate() { - if let Some(pagination_field) = federated_query.has_pagination() { - return Err(MeilisearchHttpError::PaginationInFederatedQuery( - query_index, - pagination_field, - ) - .into()); - } - - if let Some(facets) = federated_query.has_facets() { - let facets = facets.to_owned(); - return Err(MeilisearchHttpError::FacetsInFederatedQuery( - query_index, - federated_query.index_uid.into_inner(), - facets, - ) - .into()); - } - - let (index_uid, query, federation_options) = federated_query.into_index_query_federation(); - - queries_by_index.entry(index_uid.into_inner()).or_default().push(QueryByIndex { - query, - federation_options: federation_options.unwrap_or_default(), - query_index, - }) - } - - // 2. perform queries, merge and make hits index by index - let required_hit_count = federation.limit + federation.offset; - - // In step (2), semantic_hit_count will be set to Some(0) if any search kind uses semantic - // Then in step (3), we'll update its value if there is any semantic search - let mut semantic_hit_count = None; - let mut results_by_index = Vec::with_capacity(queries_by_index.len()); - let mut previous_query_data: Option<(RankingRules, usize, String)> = None; - - // remember the order and name of first index for each facet when merging with index settings - // to detect if the order is inconsistent for a facet. - let mut facet_order: Option> = match federation.merge_facets - { - Some(MergeFacets { .. }) => Some(Default::default()), - _ => None, - }; - - for (index_uid, queries) in queries_by_index { - let first_query_index = queries.first().map(|query| query.query_index); - - let index = match index_scheduler.index(&index_uid) { - Ok(index) => index, - Err(err) => { - let mut err = ResponseError::from(err); - // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but - // here the resource not found is not part of the URL. - err.code = StatusCode::BAD_REQUEST; - if let Some(query_index) = first_query_index { - err.message = format!("Inside `.queries[{}]`: {}", query_index, err.message); - } - return Err(err); - } - }; - - // Important: this is the only transaction we'll use for this index during this federated search - let rtxn = index.read_txn()?; - - let criteria = index.criteria(&rtxn)?; - - let dictionary = index.dictionary(&rtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); - let separators = index.allowed_separators(&rtxn)?; - let separators: Option> = - separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); - - // each query gets its individual cutoff - let cutoff = index.search_cutoff(&rtxn)?; - - let mut degraded = false; - let mut used_negative_operator = false; - let mut candidates = RoaringBitmap::new(); - - let facets_by_index = federation.facets_by_index.remove(&index_uid).flatten(); - - // TODO: recover the max size + facets_by_index as return value of this function so as not to ask it for all queries - if let Err(mut error) = - check_facet_order(&mut facet_order, &index_uid, &facets_by_index, &index, &rtxn) - { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {error}{}", - if let Some(query_index) = first_query_index { - format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") - } else { - Default::default() - } - ); - return Err(error); - } - - // 2.1. 
Compute all candidates for each query in the index - let mut results_by_query = Vec::with_capacity(queries.len()); - - for QueryByIndex { query, federation_options, query_index } in queries { - // use an immediately invoked lambda to capture the result without returning from the function - - let res: Result<(), ResponseError> = (|| { - let search_kind = search_kind(&query, index_scheduler, &index, features)?; - - let canonicalization_kind = match (&search_kind, &query.q) { - (SearchKind::SemanticOnly { .. }, _) => { - ranking_rules::CanonicalizationKind::Vector - } - (_, Some(q)) if !q.is_empty() => ranking_rules::CanonicalizationKind::Keyword, - _ => ranking_rules::CanonicalizationKind::Placeholder, - }; - - let sort = if let Some(sort) = &query.sort { - let sorts: Vec<_> = - match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { - Ok(sorts) => sorts, - Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from( - asc_desc_error, - )) - .into()) - } - }; - Some(sorts) - } else { - None - }; - - let ranking_rules = ranking_rules::RankingRules::new( - criteria.clone(), - sort, - query.matching_strategy.into(), - canonicalization_kind, - ); - - if let Some((previous_ranking_rules, previous_query_index, previous_index_uid)) = - previous_query_data.take() - { - if let Err(error) = ranking_rules.is_compatible_with(&previous_ranking_rules) { - return Err(error.to_response_error( - &ranking_rules, - &previous_ranking_rules, - query_index, - previous_query_index, - &index_uid, - &previous_index_uid, - )); - } - previous_query_data = if previous_ranking_rules.constraint_count() - > ranking_rules.constraint_count() - { - Some((previous_ranking_rules, previous_query_index, previous_index_uid)) - } else { - Some((ranking_rules, query_index, index_uid.clone())) - }; - } else { - previous_query_data = Some((ranking_rules, query_index, index_uid.clone())); - } - - match search_kind { - SearchKind::KeywordOnly => {} - _ => semantic_hit_count = Some(0), - } - - let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; - - let time_budget = match cutoff { - Some(cutoff) => TimeBudget::new(Duration::from_millis(cutoff)), - None => TimeBudget::default(), - }; - - let (mut search, _is_finite_pagination, _max_total_hits, _offset) = - prepare_search(&index, &rtxn, &query, &search_kind, time_budget, features)?; - - search.scoring_strategy(milli::score_details::ScoringStrategy::Detailed); - search.offset(0); - search.limit(required_hit_count); - - let (result, _semantic_hit_count) = super::search_from_kind(search_kind, search)?; - let format = AttributesFormat { - attributes_to_retrieve: query.attributes_to_retrieve, - retrieve_vectors, - attributes_to_highlight: query.attributes_to_highlight, - attributes_to_crop: query.attributes_to_crop, - crop_length: query.crop_length, - crop_marker: query.crop_marker, - highlight_pre_tag: query.highlight_pre_tag, - highlight_post_tag: query.highlight_post_tag, - show_matches_position: query.show_matches_position, - sort: query.sort, - show_ranking_score: query.show_ranking_score, - show_ranking_score_details: query.show_ranking_score_details, - locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), - }; - - let milli::SearchResult { - matching_words, - candidates: query_candidates, - documents_ids, - document_scores, - degraded: query_degraded, - used_negative_operator: query_used_negative_operator, - } = result; - - candidates |= query_candidates; - degraded |= query_degraded; - 
used_negative_operator |= query_used_negative_operator; - - let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); - - let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); - - let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder)?; - - results_by_query.push(SearchResultByQuery { - federation_options, - hit_maker, - query_index, - documents_ids, - document_scores, - }); - Ok(()) - })(); - - if let Err(mut error) = res { - error.message = format!("Inside `.queries[{query_index}]`: {}", error.message); - return Err(error); - } - } - // 2.2. merge inside index - let mut documents_seen = RoaringBitmap::new(); - let merged_result: Result, ResponseError> = - merge_index_local_results(results_by_query) - // skip documents we've already seen & mark that we saw the current document - .filter(|SearchResultByQueryIterItem { docid, .. }| documents_seen.insert(*docid)) - .take(required_hit_count) - // 2.3 make hits - .map( - |SearchResultByQueryIterItem { - docid, - score, - federation_options, - hit_maker, - query_index, - }| { - let mut hit = hit_maker.make_hit(docid, &score)?; - let weighted_score = - ScoreDetails::global_score(score.iter()) * (*federation_options.weight); - - let _federation = serde_json::json!( - { - "indexUid": index_uid, - "queriesPosition": query_index, - "weightedRankingScore": weighted_score, - } - ); - hit.document.insert("_federation".to_string(), _federation); - Ok(SearchHitByIndex { hit, score, federation_options, query_index }) - }, - ) - .collect(); - - let merged_result = merged_result?; - - let estimated_total_hits = candidates.len() as usize; - - let facets = facets_by_index - .map(|facets_by_index| { - compute_facet_distribution_stats( - &facets_by_index, - &index, - &rtxn, - candidates, - super::Route::MultiSearch, - ) - }) - .transpose() - .map_err(|mut error| { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {}{}", - error.message, - if let Some(query_index) = first_query_index { - format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") - } else { - Default::default() - } - ); - error - })?; - - results_by_index.push(SearchResultByIndex { - index: index_uid, - hits: merged_result, - estimated_total_hits, - degraded, - used_negative_operator, - facets, - }); - } - - // bonus step, make sure to return an error if an index wants a non-faceted field, even if no query actually uses that index. - for (index_uid, facets) in federation.facets_by_index { - let index = match index_scheduler.index(&index_uid) { - Ok(index) => index, - Err(err) => { - let mut err = ResponseError::from(err); - // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but - // here the resource not found is not part of the URL. 
- err.code = StatusCode::BAD_REQUEST; - err.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", - err.message - ); - return Err(err); - } - }; - - // Important: this is the only transaction we'll use for this index during this federated search - let rtxn = index.read_txn()?; - - if let Err(mut error) = - check_facet_order(&mut facet_order, &index_uid, &facets, &index, &rtxn) - { - error.message = format!( - "Inside `.federation.facetsByIndex.{index_uid}`: {error}\n - Note: index `{index_uid}` is not used in queries", - ); - return Err(error); - } - - if let Some(facets) = facets { - if let Err(mut error) = compute_facet_distribution_stats( - &facets, - &index, - &rtxn, - Default::default(), - super::Route::MultiSearch, - ) { - error.message = - format!("Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", error.message); - return Err(error); - } - } - } - - // 3. merge hits and metadata across indexes - // 3.1 merge metadata - let (estimated_total_hits, degraded, used_negative_operator, facets) = { - let mut estimated_total_hits = 0; - let mut degraded = false; - let mut used_negative_operator = false; - - let mut facets: FederatedFacets = FederatedFacets::default(); - - for SearchResultByIndex { - index, - hits: _, - estimated_total_hits: estimated_total_hits_by_index, - facets: facets_by_index, - degraded: degraded_by_index, - used_negative_operator: used_negative_operator_by_index, - } in &mut results_by_index - { - estimated_total_hits += *estimated_total_hits_by_index; - degraded |= *degraded_by_index; - used_negative_operator |= *used_negative_operator_by_index; - - let facets_by_index = std::mem::take(facets_by_index); - let index = std::mem::take(index); - - facets.insert(index, facets_by_index); - } - - (estimated_total_hits, degraded, used_negative_operator, facets) - }; - - // 3.2 merge hits - let merged_hits: Vec<_> = merge_index_global_results(results_by_index) - .skip(federation.offset) - .take(federation.limit) - .inspect(|hit| { - if let Some(semantic_hit_count) = &mut semantic_hit_count { - if hit.score.iter().any(|score| matches!(&score, ScoreDetails::Vector(_))) { - *semantic_hit_count += 1; - } - } - }) - .map(|hit| hit.hit) - .collect(); - - let (facet_distribution, facet_stats, facets_by_index) = - match federation.merge_facets.zip(facet_order) { - Some((merge_facets, facet_order)) => { - let facets = facets.merge(merge_facets, facet_order); - - let (facet_distribution, facet_stats) = facets - .map(|ComputedFacets { distribution, stats }| (distribution, stats)) - .unzip(); - - (facet_distribution, facet_stats, FederatedFacets::default()) - } - None => (None, None, facets), - }; - - let search_result = FederatedSearchResult { - hits: merged_hits, - processing_time_ms: before_search.elapsed().as_millis(), - hits_info: HitsInfo::OffsetLimit { - limit: federation.limit, - offset: federation.offset, - estimated_total_hits, - }, - semantic_hit_count, - degraded, - used_negative_operator, - facet_distribution, - facet_stats, - facets_by_index, - }; - - Ok(search_result) -} - -fn check_facet_order( - facet_order: &mut Option>, - current_index: &str, - facets_by_index: &Option>, - index: &milli::Index, - rtxn: &milli::heed::RoTxn<'_>, -) -> Result<(), ResponseError> { - if let (Some(facet_order), Some(facets_by_index)) = (facet_order, facets_by_index) { - let index_facet_order = index.sort_facet_values_by(rtxn)?; - for facet in facets_by_index { - 
let index_facet_order = index_facet_order.get(facet); - let (previous_index, previous_facet_order) = facet_order - .entry(facet.to_owned()) - .or_insert_with(|| (current_index.to_owned(), index_facet_order)); - if previous_facet_order != &index_facet_order { - return Err(MeilisearchHttpError::InconsistentFacetOrder { - facet: facet.clone(), - previous_facet_order: *previous_facet_order, - previous_uid: previous_index.clone(), - current_uid: current_index.to_owned(), - index_facet_order, - } - .into()); - } - } - }; - Ok(()) -} diff --git a/crates/meilisearch/src/search/federated/mod.rs b/crates/meilisearch/src/search/federated/mod.rs new file mode 100644 index 000000000..40204c591 --- /dev/null +++ b/crates/meilisearch/src/search/federated/mod.rs @@ -0,0 +1,10 @@ +mod perform; +mod proxy; +mod types; +mod weighted_scores; + +pub use perform::perform_federated_search; +pub use proxy::{PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE}; +pub use types::{ + FederatedSearch, FederatedSearchResult, Federation, FederationOptions, MergeFacets, +}; diff --git a/crates/meilisearch/src/search/federated/perform.rs b/crates/meilisearch/src/search/federated/perform.rs new file mode 100644 index 000000000..5ad64d63c --- /dev/null +++ b/crates/meilisearch/src/search/federated/perform.rs @@ -0,0 +1,1112 @@ +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::iter::Zip; +use std::rc::Rc; +use std::str::FromStr as _; +use std::time::{Duration, Instant}; +use std::vec::{IntoIter, Vec}; + +use actix_http::StatusCode; +use index_scheduler::{IndexScheduler, RoFeatures}; +use itertools::Itertools; +use meilisearch_types::error::ResponseError; +use meilisearch_types::features::{Network, Remote}; +use meilisearch_types::milli::order_by_map::OrderByMap; +use meilisearch_types::milli::score_details::{ScoreDetails, WeightedScoreValue}; +use meilisearch_types::milli::{self, DocumentId, OrderBy, TimeBudget, DEFAULT_VALUES_PER_FACET}; +use roaring::RoaringBitmap; +use tokio::task::JoinHandle; + +use super::super::ranking_rules::{self, RankingRules}; +use super::super::{ + compute_facet_distribution_stats, prepare_search, AttributesFormat, ComputedFacets, HitMaker, + HitsInfo, RetrieveVectors, SearchHit, SearchKind, SearchQuery, SearchQueryWithIndex, +}; +use super::proxy::{proxy_search, ProxySearchError, ProxySearchParams}; +use super::types::{ + FederatedFacets, FederatedSearchResult, Federation, FederationOptions, MergeFacets, Weight, + FEDERATION_HIT, FEDERATION_REMOTE, WEIGHTED_SCORE_VALUES, +}; +use super::weighted_scores; +use crate::error::MeilisearchHttpError; +use crate::routes::indexes::search::search_kind; +use crate::search::federated::types::{INDEX_UID, QUERIES_POSITION, WEIGHTED_RANKING_SCORE}; + +pub async fn perform_federated_search( + index_scheduler: &IndexScheduler, + queries: Vec, + federation: Federation, + features: RoFeatures, + is_proxy: bool, +) -> Result { + if is_proxy { + features.check_network("Performing a remote federated search")?; + } + let before_search = std::time::Instant::now(); + let deadline = before_search + std::time::Duration::from_secs(9); + + let required_hit_count = federation.limit + federation.offset; + + let network = index_scheduler.network(); + + // this implementation partition the queries by index to guarantee an important property: + // - all the queries to a particular index use the same read transaction. + // This is an important property, otherwise we cannot guarantee the self-consistency of the results. + + // 1. 
partition queries by host and index + let mut partitioned_queries = PartitionedQueries::new(); + for (query_index, federated_query) in queries.into_iter().enumerate() { + partitioned_queries.partition(federated_query, query_index, &network, features)? + } + + // 2. perform queries, merge and make hits index by index + // 2.1. start remote queries + let remote_search = + RemoteSearch::start(partitioned_queries.remote_queries_by_host, &federation, deadline); + + // 2.2. concurrently execute local queries + let params = SearchByIndexParams { + index_scheduler, + features, + is_proxy, + network: &network, + has_remote: partitioned_queries.has_remote, + required_hit_count, + }; + let mut search_by_index = SearchByIndex::new( + federation, + partitioned_queries.local_queries_by_index.len(), + params.has_remote, + ); + + for (index_uid, queries) in partitioned_queries.local_queries_by_index { + // note: this is the only place we open `index_uid` + search_by_index.execute(index_uid, queries, ¶ms)?; + } + + // bonus step, make sure to return an error if an index wants a non-faceted field, even if no query actually uses that index. + search_by_index.check_unused_facets(index_scheduler)?; + + let SearchByIndex { + federation, + mut semantic_hit_count, + mut results_by_index, + previous_query_data: _, + facet_order, + } = search_by_index; + + let before_waiting_remote_results = std::time::Instant::now(); + + // 2.3. Wait for proxy search requests to complete + let (mut remote_results, remote_errors) = remote_search.finish().await; + + let after_waiting_remote_results = std::time::Instant::now(); + + // 3. merge hits and metadata across indexes and hosts + // 3.1. merge metadata + let (estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration) = + merge_metadata(&mut results_by_index, &remote_results); + + // 3.2. merge hits + let merged_hits: Vec<_> = merge_index_global_results(results_by_index, &mut remote_results) + .skip(federation.offset) + .take(federation.limit) + .inspect(|hit| { + if let Some(semantic_hit_count) = &mut semantic_hit_count { + if hit.to_score().0.any(|score| matches!(&score, WeightedScoreValue::VectorSort(_))) + { + *semantic_hit_count += 1; + } + } + }) + .map(|hit| hit.hit()) + .collect(); + + // 3.3. 
merge facets + let (facet_distribution, facet_stats, facets_by_index) = + facet_order.merge(federation.merge_facets, remote_results, facets); + + let after_merge = std::time::Instant::now(); + + let local_duration = (before_waiting_remote_results - before_search) + + (after_merge - after_waiting_remote_results); + let max_duration = Duration::max(local_duration, max_remote_duration); + + Ok(FederatedSearchResult { + hits: merged_hits, + processing_time_ms: max_duration.as_millis(), + hits_info: HitsInfo::OffsetLimit { + limit: federation.limit, + offset: federation.offset, + estimated_total_hits, + }, + semantic_hit_count, + degraded, + used_negative_operator, + facet_distribution, + facet_stats, + facets_by_index, + remote_errors: partitioned_queries.has_remote.then_some(remote_errors), + }) +} + +struct QueryByIndex { + query: SearchQuery, + weight: Weight, + query_index: usize, +} + +struct SearchResultByQuery<'a> { + documents_ids: Vec, + document_scores: Vec>, + weight: Weight, + hit_maker: HitMaker<'a>, + query_index: usize, +} + +struct SearchResultByQueryIter<'a> { + it: Zip, IntoIter>>, + weight: Weight, + hit_maker: Rc>, + query_index: usize, +} + +impl<'a> SearchResultByQueryIter<'a> { + fn new( + SearchResultByQuery { + documents_ids, + document_scores, + weight, + hit_maker, + query_index, + }: SearchResultByQuery<'a>, + ) -> Self { + let it = documents_ids.into_iter().zip(document_scores); + Self { it, weight, hit_maker: Rc::new(hit_maker), query_index } + } +} + +struct SearchResultByQueryIterItem<'a> { + docid: DocumentId, + score: Vec, + weight: Weight, + hit_maker: Rc>, + query_index: usize, +} + +fn merge_index_local_results( + results_by_query: Vec>, +) -> impl Iterator + '_ { + itertools::kmerge_by( + results_by_query.into_iter().map(SearchResultByQueryIter::new), + |left: &SearchResultByQueryIterItem, right: &SearchResultByQueryIterItem| { + match weighted_scores::compare( + ScoreDetails::weighted_score_values(left.score.iter(), *left.weight), + ScoreDetails::global_score(left.score.iter()) * *left.weight, + ScoreDetails::weighted_score_values(right.score.iter(), *right.weight), + ScoreDetails::global_score(right.score.iter()) * *right.weight, + ) { + // the biggest score goes first + Ordering::Greater => true, + // break ties using query index + Ordering::Equal => left.query_index < right.query_index, + Ordering::Less => false, + } + }, + ) +} + +fn merge_index_global_results( + results_by_index: Vec, + remote_results: &mut [FederatedSearchResult], +) -> impl Iterator + '_ { + itertools::kmerge_by( + // local results + results_by_index + .into_iter() + .map(|result_by_index| { + either::Either::Left(result_by_index.hits.into_iter().map(MergedSearchHit::Local)) + }) + // remote results + .chain(remote_results.iter_mut().map(|x| either::Either::Right(iter_remote_hits(x)))), + |left: &MergedSearchHit, right: &MergedSearchHit| { + let (left_it, left_weighted_global_score, left_query_index) = left.to_score(); + let (right_it, right_weighted_global_score, right_query_index) = right.to_score(); + + match weighted_scores::compare( + left_it, + left_weighted_global_score, + right_it, + right_weighted_global_score, + ) { + // the biggest score goes first + Ordering::Greater => true, + // break ties using query index + Ordering::Equal => left_query_index < right_query_index, + Ordering::Less => false, + } + }, + ) +} + +enum MergedSearchHit { + Local(SearchHitByIndex), + Remote { + hit: SearchHit, + score: Vec, + global_weighted_score: f64, + query_index: usize, + }, +} + +impl 
MergedSearchHit { + fn remote(mut hit: SearchHit) -> Result { + let federation = hit + .document + .get_mut(FEDERATION_HIT) + .ok_or(ProxySearchError::MissingPathInResponse("._federation"))?; + let federation = match federation.as_object_mut() { + Some(federation) => federation, + None => { + return Err(ProxySearchError::UnexpectedValueInPath { + path: "._federation", + expected_type: "map", + received_value: federation.to_string(), + }); + } + }; + + let global_weighted_score = federation + .get(WEIGHTED_RANKING_SCORE) + .ok_or(ProxySearchError::MissingPathInResponse("._federation.weightedRankingScore"))?; + let global_weighted_score = global_weighted_score.as_f64().ok_or_else(|| { + ProxySearchError::UnexpectedValueInPath { + path: "._federation.weightedRankingScore", + expected_type: "number", + received_value: global_weighted_score.to_string(), + } + })?; + + let score: Vec = + serde_json::from_value(federation.remove(WEIGHTED_SCORE_VALUES).ok_or( + ProxySearchError::MissingPathInResponse("._federation.weightedScoreValues"), + )?) + .map_err(ProxySearchError::CouldNotParseWeightedScoreValues)?; + + let query_index = federation + .get(QUERIES_POSITION) + .ok_or(ProxySearchError::MissingPathInResponse("._federation.queriesPosition"))?; + let query_index = + query_index.as_u64().ok_or_else(|| ProxySearchError::UnexpectedValueInPath { + path: "._federation.queriesPosition", + expected_type: "integer", + received_value: query_index.to_string(), + })? as usize; + + Ok(Self::Remote { hit, score, global_weighted_score, query_index }) + } + + fn hit(self) -> SearchHit { + match self { + MergedSearchHit::Local(search_hit_by_index) => search_hit_by_index.hit, + MergedSearchHit::Remote { hit, .. } => hit, + } + } + + fn to_score(&self) -> (impl Iterator + '_, f64, usize) { + match self { + MergedSearchHit::Local(search_hit_by_index) => ( + either::Left(ScoreDetails::weighted_score_values( + search_hit_by_index.score.iter(), + *search_hit_by_index.weight, + )), + ScoreDetails::global_score(search_hit_by_index.score.iter()) + * *search_hit_by_index.weight, + search_hit_by_index.query_index, + ), + MergedSearchHit::Remote { hit: _, score, global_weighted_score, query_index } => { + let global_weighted_score = *global_weighted_score; + let query_index = *query_index; + (either::Right(score.iter().cloned()), global_weighted_score, query_index) + } + } + } +} + +fn iter_remote_hits( + results_by_host: &mut FederatedSearchResult, +) -> impl Iterator + '_ { + // have a per node registry of failed hits + results_by_host.hits.drain(..).filter_map(|hit| match MergedSearchHit::remote(hit) { + Ok(hit) => Some(hit), + Err(err) => { + tracing::warn!("skipping remote hit due to error: {err}"); + None + } + }) +} + +impl<'a> Iterator for SearchResultByQueryIter<'a> { + type Item = SearchResultByQueryIterItem<'a>; + + fn next(&mut self) -> Option { + let (docid, score) = self.it.next()?; + Some(SearchResultByQueryIterItem { + docid, + score, + weight: self.weight, + hit_maker: Rc::clone(&self.hit_maker), + query_index: self.query_index, + }) + } +} + +struct SearchHitByIndex { + hit: SearchHit, + score: Vec, + weight: Weight, + query_index: usize, +} + +struct SearchResultByIndex { + index: String, + hits: Vec, + estimated_total_hits: usize, + degraded: bool, + used_negative_operator: bool, + facets: Option, +} + +fn merge_metadata( + results_by_index: &mut Vec, + remote_results: &Vec, +) -> (usize, bool, bool, FederatedFacets, Duration) { + let mut estimated_total_hits = 0; + let mut degraded = false; + let mut 
used_negative_operator = false; + let mut facets: FederatedFacets = FederatedFacets::default(); + let mut max_remote_duration = Duration::ZERO; + for SearchResultByIndex { + index, + hits: _, + estimated_total_hits: estimated_total_hits_by_index, + facets: facets_by_index, + degraded: degraded_by_index, + used_negative_operator: used_negative_operator_by_index, + } in results_by_index + { + estimated_total_hits += *estimated_total_hits_by_index; + degraded |= *degraded_by_index; + used_negative_operator |= *used_negative_operator_by_index; + + let facets_by_index = std::mem::take(facets_by_index); + let index = std::mem::take(index); + + facets.insert(index, facets_by_index); + } + for FederatedSearchResult { + hits: _, + processing_time_ms, + hits_info, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + facets_by_index: _, + degraded: degraded_for_host, + used_negative_operator: host_used_negative_operator, + remote_errors: _, + } in remote_results + { + let this_remote_duration = Duration::from_millis(*processing_time_ms as u64); + max_remote_duration = Duration::max(this_remote_duration, max_remote_duration); + estimated_total_hits += match hits_info { + HitsInfo::Pagination { total_hits: estimated_total_hits, .. } + | HitsInfo::OffsetLimit { estimated_total_hits, .. } => estimated_total_hits, + }; + // note that because `degraded` and `used_negative_operator` are #[serde(skip)], + // `degraded_for_host` and `host_used_negative_operator` will always be false. + degraded |= degraded_for_host; + used_negative_operator |= host_used_negative_operator; + } + (estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration) +} + +type LocalQueriesByIndex = BTreeMap>; +type RemoteQueriesByHost = BTreeMap)>; + +struct PartitionedQueries { + local_queries_by_index: LocalQueriesByIndex, + remote_queries_by_host: RemoteQueriesByHost, + has_remote: bool, +} + +impl PartitionedQueries { + fn new() -> PartitionedQueries { + PartitionedQueries { + local_queries_by_index: Default::default(), + remote_queries_by_host: Default::default(), + has_remote: false, + } + } + + fn partition( + &mut self, + federated_query: SearchQueryWithIndex, + query_index: usize, + network: &Network, + features: RoFeatures, + ) -> Result<(), ResponseError> { + if let Some(pagination_field) = federated_query.has_pagination() { + return Err(MeilisearchHttpError::PaginationInFederatedQuery( + query_index, + pagination_field, + ) + .into()); + } + + if let Some(facets) = federated_query.has_facets() { + let facets = facets.to_owned(); + return Err(MeilisearchHttpError::FacetsInFederatedQuery( + query_index, + federated_query.index_uid.into_inner(), + facets, + ) + .into()); + } + + let (index_uid, query, federation_options) = federated_query.into_index_query_federation(); + + let federation_options = federation_options.unwrap_or_default(); + + // local or remote node? 
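// A query stays on this node when it has no `federationOptions.remote`, or when `remote`
// names this very instance (it equals `network.local`). Any other value must match a host
// registered in `network.remotes`: the query is then re-emitted towards that host with
// `remote` cleared (so the receiving instance does not need to recognise that name as its
// own `network.local`) and with an explicit `queryPosition`, so error messages keep
// pointing at the query's index in the original request.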
+ 'local_query: { + let queries_by_index = match federation_options.remote { + None => self.local_queries_by_index.entry(index_uid.into_inner()).or_default(), + Some(remote_name) => { + self.has_remote = true; + features.check_network("Performing a remote federated search")?; + + match &network.local { + Some(local) if local == &remote_name => { + self.local_queries_by_index.entry(index_uid.into_inner()).or_default() + } + _ => { + // node from the network + let Some(remote) = network.remotes.get(&remote_name) else { + return Err(ResponseError::from_msg(format!("Invalid `queries[{query_index}].federation_options.remote`: remote `{remote_name}` is not registered"), + meilisearch_types::error::Code::InvalidMultiSearchRemote)); + }; + let query = SearchQueryWithIndex::from_index_query_federation( + index_uid, + query, + Some(FederationOptions { + weight: federation_options.weight, + // do not pass the `remote` to not require the remote instance to have itself has a local node + remote: None, + // pass an explicit query index + query_position: Some(query_index), + }), + ); + + self.remote_queries_by_host + .entry(remote_name) + .or_insert_with(|| (remote.clone(), Default::default())) + .1 + .push(query); + break 'local_query; + } + } + } + }; + + queries_by_index.push(QueryByIndex { + query, + weight: federation_options.weight, + // override query index here with the one in federation. + // this will fix-up error messages to refer to the global query index of the original request. + query_index: if let Some(query_index) = federation_options.query_position { + features.check_network("Using `federationOptions.queryPosition`")?; + query_index + } else { + query_index + }, + }) + } + Ok(()) + } +} + +struct RemoteSearch { + in_flight_remote_queries: + BTreeMap>>, +} + +impl RemoteSearch { + fn start(queries: RemoteQueriesByHost, federation: &Federation, deadline: Instant) -> Self { + let mut in_flight_remote_queries = BTreeMap::new(); + let client = reqwest::ClientBuilder::new() + .connect_timeout(std::time::Duration::from_millis(200)) + .build() + .unwrap(); + let params = + ProxySearchParams { deadline: Some(deadline), try_count: 3, client: client.clone() }; + for (node_name, (node, queries)) in queries { + // spawn one task per host + in_flight_remote_queries.insert( + node_name, + tokio::spawn({ + let mut proxy_federation = federation.clone(); + // fixup limit and offset to not apply them twice + proxy_federation.limit = federation.limit + federation.offset; + proxy_federation.offset = 0; + // never merge distant facets + proxy_federation.merge_facets = None; + let params = params.clone(); + async move { proxy_search(&node, queries, proxy_federation, ¶ms).await } + }), + ); + } + Self { in_flight_remote_queries } + } + + async fn finish(self) -> (Vec, BTreeMap) { + let mut remote_results = Vec::with_capacity(self.in_flight_remote_queries.len()); + let mut remote_errors: BTreeMap = BTreeMap::new(); + 'remote_queries: for (node_name, handle) in self.in_flight_remote_queries { + match handle.await { + Ok(Ok(mut res)) => { + for hit in &mut res.hits { + let Some(federation) = hit.document.get_mut(FEDERATION_HIT) else { + let error = ProxySearchError::MissingPathInResponse("._federation"); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + }; + let Some(federation) = federation.as_object_mut() else { + let error = ProxySearchError::UnexpectedValueInPath { + path: "._federation", + expected_type: "map", + received_value: federation.to_string(), + }; + 
remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + }; + if !federation.contains_key(WEIGHTED_SCORE_VALUES) { + let error = ProxySearchError::MissingPathInResponse( + "._federation.weightedScoreValues", + ); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + } + + if !federation.contains_key(WEIGHTED_RANKING_SCORE) { + let error = ProxySearchError::MissingPathInResponse( + "._federation.weightedRankingScore", + ); + remote_errors.insert(node_name, error.as_response_error()); + continue 'remote_queries; + } + + federation.insert( + FEDERATION_REMOTE.to_string(), + serde_json::Value::String(node_name.clone()), + ); + } + + remote_results.push(res); + } + Ok(Err(error)) => { + remote_errors.insert(node_name, error.as_response_error()); + } + Err(panic) => match panic.try_into_panic() { + Ok(panic) => { + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + remote_errors.insert( + node_name, + ResponseError::from_msg( + msg.to_string(), + meilisearch_types::error::Code::Internal, + ), + ); + } + Err(_) => tracing::error!("proxy search task was unexpectedly cancelled"), + }, + } + } + (remote_results, remote_errors) + } +} + +struct SearchByIndexParams<'a> { + index_scheduler: &'a IndexScheduler, + required_hit_count: usize, + features: RoFeatures, + is_proxy: bool, + has_remote: bool, + network: &'a Network, +} + +struct SearchByIndex { + federation: Federation, + // During search by index, semantic_hit_count will be set to Some(0) if any search kind uses semantic + // Then when merging, we'll update its value if there is any semantic hit + semantic_hit_count: Option, + results_by_index: Vec, + previous_query_data: Option<(RankingRules, usize, String)>, + // remember the order and name of first index for each facet when merging with index settings + // to detect if the order is inconsistent for a facet. + facet_order: FacetOrder, +} + +impl SearchByIndex { + fn new(federation: Federation, index_count: usize, has_remote: bool) -> Self { + SearchByIndex { + facet_order: match (federation.merge_facets, has_remote) { + (None, true) => FacetOrder::ByIndex(Default::default()), + (None, false) => FacetOrder::None, + (Some(_), _) => FacetOrder::ByFacet(Default::default()), + }, + federation, + semantic_hit_count: None, + results_by_index: Vec::with_capacity(index_count), + previous_query_data: None, + } + } + + fn execute( + &mut self, + index_uid: String, + queries: Vec, + params: &SearchByIndexParams<'_>, + ) -> Result<(), ResponseError> { + let first_query_index = queries.first().map(|query| query.query_index); + let index = match params.index_scheduler.index(&index_uid) { + Ok(index) => index, + Err(err) => { + let mut err = ResponseError::from(err); + // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but + // here the resource not found is not part of the URL. 
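// (A 404 would suggest that the route itself does not exist; since the missing index is
// only referenced in the request body, a 400 "bad request" describes the failure better.)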
+ err.code = StatusCode::BAD_REQUEST; + if let Some(query_index) = first_query_index { + err.message = format!("Inside `.queries[{}]`: {}", query_index, err.message); + } + return Err(err); + } + }; + let rtxn = index.read_txn()?; + let criteria = index.criteria(&rtxn)?; + let dictionary = index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let separators = index.allowed_separators(&rtxn)?; + let separators: Option> = + separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); + let cutoff = index.search_cutoff(&rtxn)?; + let mut degraded = false; + let mut used_negative_operator = false; + let mut candidates = RoaringBitmap::new(); + let facets_by_index = self.federation.facets_by_index.remove(&index_uid).flatten(); + if let Err(mut error) = + self.facet_order.check_facet_order(&index_uid, &facets_by_index, &index, &rtxn) + { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {error}{}", + if let Some(query_index) = first_query_index { + format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") + } else { + Default::default() + } + ); + return Err(error); + } + let mut results_by_query = Vec::with_capacity(queries.len()); + for QueryByIndex { query, weight, query_index } in queries { + // use an immediately invoked lambda to capture the result without returning from the function + + let res: Result<(), ResponseError> = (|| { + let search_kind = + search_kind(&query, params.index_scheduler, index_uid.to_string(), &index)?; + + let canonicalization_kind = match (&search_kind, &query.q) { + (SearchKind::SemanticOnly { .. }, _) => { + ranking_rules::CanonicalizationKind::Vector + } + (_, Some(q)) if !q.is_empty() => ranking_rules::CanonicalizationKind::Keyword, + _ => ranking_rules::CanonicalizationKind::Placeholder, + }; + + let sort = if let Some(sort) = &query.sort { + let sorts: Vec<_> = + match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { + Ok(sorts) => sorts, + Err(asc_desc_error) => { + return Err(milli::Error::from(milli::SortError::from( + asc_desc_error, + )) + .into()) + } + }; + Some(sorts) + } else { + None + }; + + let ranking_rules = ranking_rules::RankingRules::new( + criteria.clone(), + sort, + query.matching_strategy.into(), + canonicalization_kind, + ); + + if let Some((previous_ranking_rules, previous_query_index, previous_index_uid)) = + self.previous_query_data.take() + { + if let Err(error) = ranking_rules.is_compatible_with(&previous_ranking_rules) { + return Err(error.to_response_error( + &ranking_rules, + &previous_ranking_rules, + query_index, + previous_query_index, + &index_uid, + &previous_index_uid, + )); + } + self.previous_query_data = if previous_ranking_rules.constraint_count() + > ranking_rules.constraint_count() + { + Some((previous_ranking_rules, previous_query_index, previous_index_uid)) + } else { + Some((ranking_rules, query_index, index_uid.clone())) + }; + } else { + self.previous_query_data = + Some((ranking_rules, query_index, index_uid.clone())); + } + + match search_kind { + SearchKind::KeywordOnly => {} + _ => self.semantic_hit_count = Some(0), + } + + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors); + + let time_budget = match cutoff { + Some(cutoff) => TimeBudget::new(Duration::from_millis(cutoff)), + None => TimeBudget::default(), + }; + + let (mut search, _is_finite_pagination, _max_total_hits, _offset) = prepare_search( + &index, + &rtxn, + &query, + &search_kind, + 
time_budget, + params.features, + )?; + + search.scoring_strategy(milli::score_details::ScoringStrategy::Detailed); + search.offset(0); + search.limit(params.required_hit_count); + + let (result, _semantic_hit_count) = + super::super::search_from_kind(index_uid.to_string(), search_kind, search)?; + let format = AttributesFormat { + attributes_to_retrieve: query.attributes_to_retrieve, + retrieve_vectors, + attributes_to_highlight: query.attributes_to_highlight, + attributes_to_crop: query.attributes_to_crop, + crop_length: query.crop_length, + crop_marker: query.crop_marker, + highlight_pre_tag: query.highlight_pre_tag, + highlight_post_tag: query.highlight_post_tag, + show_matches_position: query.show_matches_position, + sort: query.sort, + show_ranking_score: query.show_ranking_score, + show_ranking_score_details: query.show_ranking_score_details, + locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), + }; + + let milli::SearchResult { + matching_words, + candidates: query_candidates, + documents_ids, + document_scores, + degraded: query_degraded, + used_negative_operator: query_used_negative_operator, + } = result; + + candidates |= query_candidates; + degraded |= query_degraded; + used_negative_operator |= query_used_negative_operator; + + let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); + + let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); + + let hit_maker = + HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| { + MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())) + })?; + + results_by_query.push(SearchResultByQuery { + weight, + hit_maker, + query_index, + documents_ids, + document_scores, + }); + Ok(()) + })(); + + if let Err(mut error) = res { + error.message = format!("Inside `.queries[{query_index}]`: {}", error.message); + return Err(error); + } + } + let mut documents_seen = RoaringBitmap::new(); + let merged_result: Result, ResponseError> = + merge_index_local_results(results_by_query) + // skip documents we've already seen & mark that we saw the current document + .filter(|SearchResultByQueryIterItem { docid, .. 
}| documents_seen.insert(*docid)) + .take(params.required_hit_count) + // 2.3 make hits + .map( + |SearchResultByQueryIterItem { + docid, + score, + weight, + hit_maker, + query_index, + }| { + let mut hit = hit_maker.make_hit(docid, &score)?; + let weighted_score = ScoreDetails::global_score(score.iter()) * (*weight); + + let mut _federation = serde_json::json!( + { + INDEX_UID: index_uid, + QUERIES_POSITION: query_index, + WEIGHTED_RANKING_SCORE: weighted_score, + } + ); + if params.has_remote && !params.is_proxy { + _federation.as_object_mut().unwrap().insert( + FEDERATION_REMOTE.to_string(), + params.network.local.clone().into(), + ); + } + if params.is_proxy { + _federation.as_object_mut().unwrap().insert( + WEIGHTED_SCORE_VALUES.to_string(), + serde_json::json!(ScoreDetails::weighted_score_values( + score.iter(), + *weight + ) + .collect_vec()), + ); + } + hit.document.insert(FEDERATION_HIT.to_string(), _federation); + Ok(SearchHitByIndex { hit, score, weight, query_index }) + }, + ) + .collect(); + let merged_result = merged_result?; + let estimated_total_hits = candidates.len() as usize; + let facets = facets_by_index + .map(|facets_by_index| { + compute_facet_distribution_stats( + &facets_by_index, + &index, + &rtxn, + candidates, + super::super::Route::MultiSearch, + ) + }) + .transpose() + .map_err(|mut error| { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {}{}", + error.message, + if let Some(query_index) = first_query_index { + format!("\n - Note: index `{index_uid}` used in `.queries[{query_index}]`") + } else { + Default::default() + } + ); + error + })?; + self.results_by_index.push(SearchResultByIndex { + index: index_uid, + hits: merged_result, + estimated_total_hits, + degraded, + used_negative_operator, + facets, + }); + Ok(()) + } + + fn check_unused_facets( + &mut self, + index_scheduler: &IndexScheduler, + ) -> Result<(), ResponseError> { + for (index_uid, facets) in std::mem::take(&mut self.federation.facets_by_index) { + let index = match index_scheduler.index(&index_uid) { + Ok(index) => index, + Err(err) => { + let mut err = ResponseError::from(err); + // Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but + // here the resource not found is not part of the URL. 
+ err.code = StatusCode::BAD_REQUEST; + err.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", + err.message + ); + return Err(err); + } + }; + + // Important: this is the only transaction we'll use for this index during this federated search + let rtxn = index.read_txn()?; + + if let Err(mut error) = + self.facet_order.check_facet_order(&index_uid, &facets, &index, &rtxn) + { + error.message = format!( + "Inside `.federation.facetsByIndex.{index_uid}`: {error}\n - Note: index `{index_uid}` is not used in queries", + ); + return Err(error); + } + + if let Some(facets) = facets { + if let Err(mut error) = compute_facet_distribution_stats( + &facets, + &index, + &rtxn, + Default::default(), + super::super::Route::MultiSearch, + ) { + error.message = + format!("Inside `.federation.facetsByIndex.{index_uid}`: {}\n - Note: index `{index_uid}` is not used in queries", error.message); + return Err(error); + } + } + } + Ok(()) + } +} + +enum FacetOrder { + /// The order is stored by facet to be able to merge facets regardless of index of origin + /// + /// - key: facet name + /// - value: (first_index_name, first_index_order) + /// + /// We store the name of the first index where the facet is present as well as its order, + /// so that if encountering the same facet in a different index we can compare the order and send + /// a readable error. + ByFacet(BTreeMap), + /// The order is stored by index to be able to merge facets regardless of the remote of origin. + /// + /// This variant is only used when `is_remote = true`, and always used in that case. + /// + /// - key: index name + /// - value: (order_by_map, max_values_per_facet) + /// + /// We store a map of the order per facet for that index, as well as the max values per facet. + /// Both are retrieved from the settings of the local version of the index. + /// + /// It is not possible to have an index only existing in the remotes, because as of now all indexes that appear + /// in `federation.facetsByIndex` must exist on all hosts. + ByIndex(BTreeMap), + /// Do not merge facets. Used when `federation.mergeFacets = null` and `!has_remote` + None, +} + +type FacetDistributions = BTreeMap>; +type FacetStats = BTreeMap; + +impl FacetOrder { + fn check_facet_order( + &mut self, + current_index: &str, + facets_by_index: &Option>, + index: &milli::Index, + rtxn: &milli::heed::RoTxn<'_>, + ) -> Result<(), ResponseError> { + match self { + FacetOrder::ByFacet(facet_order) => { + if let Some(facets_by_index) = facets_by_index { + let index_facet_order = index.sort_facet_values_by(rtxn)?; + for facet in facets_by_index { + let index_facet_order = index_facet_order.get(facet); + let (previous_index, previous_facet_order) = facet_order + .entry(facet.to_owned()) + .or_insert_with(|| (current_index.to_owned(), index_facet_order)); + if previous_facet_order != &index_facet_order { + return Err(MeilisearchHttpError::InconsistentFacetOrder { + facet: facet.clone(), + previous_facet_order: *previous_facet_order, + previous_uid: previous_index.clone(), + current_uid: current_index.to_owned(), + index_facet_order, + } + .into()); + } + } + } + } + FacetOrder::ByIndex(order_by_index) => { + let max_values_per_facet = index + .max_values_per_facet(rtxn)? 
+ .map(|x| x as usize) + .unwrap_or(DEFAULT_VALUES_PER_FACET); + order_by_index.insert( + current_index.to_owned(), + (index.sort_facet_values_by(rtxn)?, max_values_per_facet), + ); + } + FacetOrder::None => {} + } + Ok(()) + } + + fn merge( + self, + merge_facets: Option, + remote_results: Vec, + mut facets: FederatedFacets, + ) -> (Option, Option, FederatedFacets) { + let (facet_distribution, facet_stats, facets_by_index) = match (self, merge_facets) { + (FacetOrder::ByFacet(facet_order), Some(merge_facets)) => { + for remote_facets_by_index in + remote_results.into_iter().map(|result| result.facets_by_index) + { + facets.append(remote_facets_by_index); + } + let facets = facets.merge(merge_facets, facet_order); + + let (facet_distribution, facet_stats) = facets + .map(|ComputedFacets { distribution, stats }| (distribution, stats)) + .unzip(); + + (facet_distribution, facet_stats, FederatedFacets::default()) + } + (FacetOrder::ByIndex(facet_order), _) => { + for remote_facets_by_index in + remote_results.into_iter().map(|result| result.facets_by_index) + { + facets.append(remote_facets_by_index); + } + facets.sort_and_truncate(facet_order); + (None, None, facets) + } + _ => (None, None, facets), + }; + (facet_distribution, facet_stats, facets_by_index) + } +} diff --git a/crates/meilisearch/src/search/federated/proxy.rs b/crates/meilisearch/src/search/federated/proxy.rs new file mode 100644 index 000000000..bf954693c --- /dev/null +++ b/crates/meilisearch/src/search/federated/proxy.rs @@ -0,0 +1,267 @@ +pub use error::ProxySearchError; +use error::ReqwestErrorWithoutUrl; +use meilisearch_types::features::Remote; +use rand::Rng as _; +use reqwest::{Client, Response, StatusCode}; +use serde::de::DeserializeOwned; +use serde_json::Value; + +use super::types::{FederatedSearch, FederatedSearchResult, Federation}; +use crate::search::SearchQueryWithIndex; + +pub const PROXY_SEARCH_HEADER: &str = "Meili-Proxy-Search"; +pub const PROXY_SEARCH_HEADER_VALUE: &str = "true"; + +mod error { + use meilisearch_types::error::ResponseError; + use reqwest::StatusCode; + + #[derive(Debug, thiserror::Error)] + pub enum ProxySearchError { + #[error("{0}")] + CouldNotSendRequest(ReqwestErrorWithoutUrl), + #[error("could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action")] + AuthenticationError, + #[error( + "could not parse response from the remote host as a federated search response{}\n - hint: check that the remote instance is a Meilisearch instance running the same version", + response_from_remote(response) + )] + CouldNotParseResponse { response: Result }, + #[error("remote host responded with code {}{}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", status_code.as_u16(), response_from_remote(response))] + BadRequest { status_code: StatusCode, response: Result }, + #[error("remote host did not answer before the deadline")] + Timeout, + #[error("remote hit does not contain `{0}`\n - hint: check that the remote instance is a Meilisearch instance running the same version")] + MissingPathInResponse(&'static str), + #[error("remote host responded with code {}{}", status_code.as_u16(), response_from_remote(response))] + RemoteError { status_code: StatusCode, response: Result }, + #[error("remote hit contains an unexpected value at path `{path}`: expected {expected_type}, received 
`{received_value}`\n - hint: check that the remote instance is a Meilisearch instance running the same version")] + UnexpectedValueInPath { + path: &'static str, + expected_type: &'static str, + received_value: String, + }, + #[error("could not parse weighted score values in the remote hit: {0}")] + CouldNotParseWeightedScoreValues(serde_json::Error), + } + + impl ProxySearchError { + pub fn as_response_error(&self) -> ResponseError { + use meilisearch_types::error::Code; + let message = self.to_string(); + let code = match self { + ProxySearchError::CouldNotSendRequest(_) => Code::RemoteCouldNotSendRequest, + ProxySearchError::AuthenticationError => Code::RemoteInvalidApiKey, + ProxySearchError::BadRequest { .. } => Code::RemoteBadRequest, + ProxySearchError::Timeout => Code::RemoteTimeout, + ProxySearchError::RemoteError { .. } => Code::RemoteRemoteError, + ProxySearchError::CouldNotParseResponse { .. } + | ProxySearchError::MissingPathInResponse(_) + | ProxySearchError::UnexpectedValueInPath { .. } + | ProxySearchError::CouldNotParseWeightedScoreValues(_) => Code::RemoteBadResponse, + }; + ResponseError::from_msg(message, code) + } + } + + #[derive(Debug, thiserror::Error)] + #[error(transparent)] + pub struct ReqwestErrorWithoutUrl(reqwest::Error); + impl ReqwestErrorWithoutUrl { + pub fn new(inner: reqwest::Error) -> Self { + Self(inner.without_url()) + } + } + + fn response_from_remote(response: &Result) -> String { + match response { + Ok(response) => { + format!(":\n - response from remote: {}", response) + } + Err(error) => { + format!(":\n - additionally, could not retrieve response from remote: {error}") + } + } + } +} + +#[derive(Clone)] +pub struct ProxySearchParams { + pub deadline: Option, + pub try_count: u32, + pub client: reqwest::Client, +} + +/// Performs a federated search on a remote host and returns the results +pub async fn proxy_search( + node: &Remote, + queries: Vec, + federation: Federation, + params: &ProxySearchParams, +) -> Result { + let url = format!("{}/multi-search", node.url); + + let federated = FederatedSearch { queries, federation: Some(federation) }; + + let search_api_key = node.search_api_key.as_deref(); + + let max_deadline = std::time::Instant::now() + std::time::Duration::from_secs(5); + + let deadline = if let Some(deadline) = params.deadline { + std::time::Instant::min(deadline, max_deadline) + } else { + max_deadline + }; + + for i in 0..params.try_count { + match try_proxy_search(&url, search_api_key, &federated, ¶ms.client, deadline).await { + Ok(response) => return Ok(response), + Err(retry) => { + let duration = retry.into_duration(i)?; + tokio::time::sleep(duration).await; + } + } + } + try_proxy_search(&url, search_api_key, &federated, ¶ms.client, deadline) + .await + .map_err(Retry::into_error) +} + +async fn try_proxy_search( + url: &str, + search_api_key: Option<&str>, + federated: &FederatedSearch, + client: &Client, + deadline: std::time::Instant, +) -> Result { + let timeout = deadline.saturating_duration_since(std::time::Instant::now()); + + let request = client.post(url).json(&federated).timeout(timeout); + let request = if let Some(search_api_key) = search_api_key { + request.bearer_auth(search_api_key) + } else { + request + }; + let request = request.header(PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE); + + let response = request.send().await; + let response = match response { + Ok(response) => response, + Err(error) if error.is_timeout() => return Err(Retry::give_up(ProxySearchError::Timeout)), + Err(error) => { + return 
Err(Retry::retry_later(ProxySearchError::CouldNotSendRequest( + ReqwestErrorWithoutUrl::new(error), + ))) + } + }; + + match response.status() { + status_code if status_code.is_success() => (), + StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { + return Err(Retry::give_up(ProxySearchError::AuthenticationError)) + } + status_code if status_code.is_client_error() => { + let response = parse_error(response).await; + return Err(Retry::give_up(ProxySearchError::BadRequest { status_code, response })); + } + status_code if status_code.is_server_error() => { + let response = parse_error(response).await; + return Err(Retry::retry_later(ProxySearchError::RemoteError { + status_code, + response, + })); + } + status_code => { + tracing::warn!( + status_code = status_code.as_u16(), + "remote replied with unexpected status code" + ); + } + } + + let response = match parse_response(response).await { + Ok(response) => response, + Err(response) => { + return Err(Retry::retry_later(ProxySearchError::CouldNotParseResponse { response })) + } + }; + + Ok(response) +} + +/// Always parse the body of the response of a failed request as JSON. +async fn parse_error(response: Response) -> Result { + let bytes = match response.bytes().await { + Ok(bytes) => bytes, + Err(error) => return Err(ReqwestErrorWithoutUrl::new(error)), + }; + + Ok(parse_bytes_as_error(&bytes)) +} + +fn parse_bytes_as_error(bytes: &[u8]) -> String { + match serde_json::from_slice::(bytes) { + Ok(value) => value.to_string(), + Err(_) => String::from_utf8_lossy(bytes).into_owned(), + } +} + +async fn parse_response( + response: Response, +) -> Result> { + let bytes = match response.bytes().await { + Ok(bytes) => bytes, + Err(error) => return Err(Err(ReqwestErrorWithoutUrl::new(error))), + }; + + match serde_json::from_slice::(&bytes) { + Ok(value) => Ok(value), + Err(_) => Err(Ok(parse_bytes_as_error(&bytes))), + } +} + +pub struct Retry { + error: ProxySearchError, + strategy: RetryStrategy, +} + +pub enum RetryStrategy { + GiveUp, + Retry, +} + +impl Retry { + pub fn give_up(error: ProxySearchError) -> Self { + Self { error, strategy: RetryStrategy::GiveUp } + } + + pub fn retry_later(error: ProxySearchError) -> Self { + Self { error, strategy: RetryStrategy::Retry } + } + + pub fn into_duration(self, attempt: u32) -> Result { + match self.strategy { + RetryStrategy::GiveUp => Err(self.error), + RetryStrategy::Retry => { + let retry_duration = std::time::Duration::from_nanos((10u64).pow(attempt)); + let retry_duration = retry_duration.min(std::time::Duration::from_millis(100)); // don't wait more than 100ms + + // randomly up to double the retry duration + let retry_duration = retry_duration + + rand::thread_rng().gen_range(std::time::Duration::ZERO..retry_duration); + + tracing::warn!( + "Attempt #{}, failed with {}, retrying after {}ms.", + attempt, + self.error, + retry_duration.as_millis() + ); + Ok(retry_duration) + } + } + } + + pub fn into_error(self) -> ProxySearchError { + self.error + } +} diff --git a/crates/meilisearch/src/search/federated/types.rs b/crates/meilisearch/src/search/federated/types.rs new file mode 100644 index 000000000..3cf28c815 --- /dev/null +++ b/crates/meilisearch/src/search/federated/types.rs @@ -0,0 +1,321 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::fmt; +use std::vec::Vec; + +use indexmap::IndexMap; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::{ + InvalidMultiSearchFacetsByIndex, 
InvalidMultiSearchMaxValuesPerFacet, + InvalidMultiSearchMergeFacets, InvalidMultiSearchQueryPosition, InvalidMultiSearchRemote, + InvalidMultiSearchWeight, InvalidSearchLimit, InvalidSearchOffset, +}; +use meilisearch_types::error::ResponseError; +use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::milli::order_by_map::OrderByMap; +use meilisearch_types::milli::OrderBy; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +use super::super::{ComputedFacets, FacetStats, HitsInfo, SearchHit, SearchQueryWithIndex}; + +pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0; + +// fields in the response +pub const FEDERATION_HIT: &str = "_federation"; +pub const INDEX_UID: &str = "indexUid"; +pub const QUERIES_POSITION: &str = "queriesPosition"; +pub const WEIGHTED_RANKING_SCORE: &str = "weightedRankingScore"; +pub const WEIGHTED_SCORE_VALUES: &str = "weightedScoreValues"; +pub const FEDERATION_REMOTE: &str = "remote"; + +#[derive(Debug, Default, Clone, PartialEq, Serialize, deserr::Deserr, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +pub struct FederationOptions { + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = f64)] + pub weight: Weight, + + #[deserr(default, error = DeserrJsonError)] + pub remote: Option, + + #[deserr(default, error = DeserrJsonError)] + pub query_position: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, deserr::Deserr)] +#[deserr(try_from(f64) = TryFrom::try_from -> InvalidMultiSearchWeight)] +pub struct Weight(f64); + +impl Default for Weight { + fn default() -> Self { + Weight(DEFAULT_FEDERATED_WEIGHT) + } +} + +impl std::convert::TryFrom for Weight { + type Error = InvalidMultiSearchWeight; + + fn try_from(f: f64) -> Result { + if f < 0.0 { + Err(InvalidMultiSearchWeight) + } else { + Ok(Weight(f)) + } + } +} + +impl std::ops::Deref for Weight { + type Target = f64; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +#[derive(Debug, Clone, deserr::Deserr, Serialize, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct Federation { + #[deserr(default = super::super::DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] + pub limit: usize, + #[deserr(default = super::super::DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] + pub offset: usize, + #[deserr(default, error = DeserrJsonError)] + pub facets_by_index: BTreeMap>>, + #[deserr(default, error = DeserrJsonError)] + pub merge_facets: Option, +} + +#[derive(Copy, Clone, Debug, deserr::Deserr, Serialize, Default, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct MergeFacets { + #[deserr(default, error = DeserrJsonError)] + pub max_values_per_facet: Option, +} + +#[derive(Debug, deserr::Deserr, Serialize, ToSchema)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[schema(rename_all = "camelCase")] +#[serde(rename_all = "camelCase")] +pub struct FederatedSearch { + pub queries: Vec, + #[deserr(default)] + pub federation: Option, +} + +#[derive(Serialize, Deserialize, Clone, ToSchema)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct FederatedSearchResult { + pub hits: Vec, + pub processing_time_ms: u128, + #[serde(flatten)] + pub hits_info: HitsInfo, + + 
#[serde(default, skip_serializing_if = "Option::is_none")] + pub semantic_hit_count: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + #[schema(value_type = Option>>)] + pub facet_distribution: Option>>, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub facet_stats: Option>, + #[serde(default, skip_serializing_if = "FederatedFacets::is_empty")] + pub facets_by_index: FederatedFacets, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub remote_errors: Option>, + + // These fields are only used for analytics purposes + #[serde(skip)] + pub degraded: bool, + #[serde(skip)] + pub used_negative_operator: bool, +} + +impl fmt::Debug for FederatedSearchResult { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let FederatedSearchResult { + hits, + processing_time_ms, + hits_info, + semantic_hit_count, + degraded, + used_negative_operator, + facet_distribution, + facet_stats, + facets_by_index, + remote_errors, + } = self; + + let mut debug = f.debug_struct("SearchResult"); + // The most important thing when looking at a search result is the time it took to process + debug.field("processing_time_ms", &processing_time_ms); + debug.field("hits", &format!("[{} hits returned]", hits.len())); + debug.field("hits_info", &hits_info); + if *used_negative_operator { + debug.field("used_negative_operator", used_negative_operator); + } + if *degraded { + debug.field("degraded", degraded); + } + if let Some(facet_distribution) = facet_distribution { + debug.field("facet_distribution", &facet_distribution); + } + if let Some(facet_stats) = facet_stats { + debug.field("facet_stats", &facet_stats); + } + if let Some(semantic_hit_count) = semantic_hit_count { + debug.field("semantic_hit_count", &semantic_hit_count); + } + if !facets_by_index.is_empty() { + debug.field("facets_by_index", &facets_by_index); + } + if let Some(remote_errors) = remote_errors { + debug.field("remote_errors", &remote_errors); + } + + debug.finish() + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] +pub struct FederatedFacets(pub BTreeMap); + +impl FederatedFacets { + pub fn insert(&mut self, index: String, facets: Option) { + if let Some(facets) = facets { + self.0.insert(index, facets); + } + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn merge( + self, + MergeFacets { max_values_per_facet }: MergeFacets, + facet_order: BTreeMap, + ) -> Option { + if self.is_empty() { + return None; + } + + let mut distribution: BTreeMap = Default::default(); + let mut stats: BTreeMap = Default::default(); + + for facets_by_index in self.0.into_values() { + for (facet, index_distribution) in facets_by_index.distribution { + match distribution.entry(facet) { + Entry::Vacant(entry) => { + entry.insert(index_distribution); + } + Entry::Occupied(mut entry) => { + let distribution = entry.get_mut(); + + for (value, index_count) in index_distribution { + distribution + .entry(value) + .and_modify(|count| *count += index_count) + .or_insert(index_count); + } + } + } + } + + for (facet, index_stats) in facets_by_index.stats { + match stats.entry(facet) { + Entry::Vacant(entry) => { + entry.insert(index_stats); + } + Entry::Occupied(mut entry) => { + let stats = entry.get_mut(); + + stats.min = f64::min(stats.min, index_stats.min); + stats.max = f64::max(stats.max, index_stats.max); + } + } + } + } + + // fixup order + for (facet, values) in &mut distribution { + let order_by = facet_order.get(facet).map(|(_, order)| 
*order).unwrap_or_default(); + + match order_by { + OrderBy::Lexicographic => { + values.sort_unstable_by(|left, _, right, _| left.cmp(right)) + } + OrderBy::Count => { + values.sort_unstable_by(|_, left, _, right| { + left.cmp(right) + // biggest first + .reverse() + }) + } + } + + if let Some(max_values_per_facet) = max_values_per_facet { + values.truncate(max_values_per_facet) + }; + } + + Some(ComputedFacets { distribution, stats }) + } + + pub(crate) fn append(&mut self, FederatedFacets(remote_facets_by_index): FederatedFacets) { + for (index, remote_facets) in remote_facets_by_index { + let merged_facets = self.0.entry(index).or_default(); + + for (remote_facet, remote_stats) in remote_facets.stats { + match merged_facets.stats.entry(remote_facet) { + Entry::Vacant(vacant_entry) => { + vacant_entry.insert(remote_stats); + } + Entry::Occupied(mut occupied_entry) => { + let stats = occupied_entry.get_mut(); + stats.min = f64::min(stats.min, remote_stats.min); + stats.max = f64::max(stats.max, remote_stats.max); + } + } + } + + for (remote_facet, remote_values) in remote_facets.distribution { + let merged_facet = merged_facets.distribution.entry(remote_facet).or_default(); + for (remote_value, remote_count) in remote_values { + let count = merged_facet.entry(remote_value).or_default(); + *count += remote_count; + } + } + } + } + + pub fn sort_and_truncate(&mut self, facet_order: BTreeMap) { + for (index, facets) in &mut self.0 { + let Some((order_by, max_values_per_facet)) = facet_order.get(index) else { + continue; + }; + for (facet, values) in &mut facets.distribution { + match order_by.get(facet) { + OrderBy::Lexicographic => { + values.sort_unstable_by(|left, _, right, _| left.cmp(right)) + } + OrderBy::Count => { + values.sort_unstable_by(|_, left, _, right| { + left.cmp(right) + // biggest first + .reverse() + }) + } + } + values.truncate(*max_values_per_facet); + } + } + } +} diff --git a/crates/meilisearch/src/search/federated/weighted_scores.rs b/crates/meilisearch/src/search/federated/weighted_scores.rs new file mode 100644 index 000000000..899940a31 --- /dev/null +++ b/crates/meilisearch/src/search/federated/weighted_scores.rs @@ -0,0 +1,88 @@ +use std::cmp::Ordering; + +use meilisearch_types::milli::score_details::{self, WeightedScoreValue}; + +pub fn compare( + mut left_it: impl Iterator, + left_weighted_global_score: f64, + mut right_it: impl Iterator, + right_weighted_global_score: f64, +) -> Ordering { + loop { + let left = left_it.next(); + let right = right_it.next(); + + match (left, right) { + (None, None) => return Ordering::Equal, + (None, Some(_)) => return Ordering::Less, + (Some(_), None) => return Ordering::Greater, + ( + Some( + WeightedScoreValue::WeightedScore(left) | WeightedScoreValue::VectorSort(left), + ), + Some( + WeightedScoreValue::WeightedScore(right) + | WeightedScoreValue::VectorSort(right), + ), + ) => { + if (left - right).abs() <= f64::EPSILON { + continue; + } + return left.partial_cmp(&right).unwrap(); + } + ( + Some(WeightedScoreValue::Sort { asc: left_asc, value: left }), + Some(WeightedScoreValue::Sort { asc: right_asc, value: right }), + ) => { + if left_asc != right_asc { + return left_weighted_global_score + .partial_cmp(&right_weighted_global_score) + .unwrap(); + } + match score_details::compare_sort_values(left_asc, &left, &right) { + Ordering::Equal => continue, + order => return order, + } + } + ( + Some(WeightedScoreValue::GeoSort { asc: left_asc, distance: left }), + Some(WeightedScoreValue::GeoSort { asc: right_asc, distance: 
right }), + ) => { + if left_asc != right_asc { + continue; + } + match (left, right) { + (None, None) => continue, + (None, Some(_)) => return Ordering::Less, + (Some(_), None) => return Ordering::Greater, + (Some(left), Some(right)) => { + if (left - right).abs() <= f64::EPSILON { + continue; + } + return left.partial_cmp(&right).unwrap(); + } + } + } + // not comparable details, use global + (Some(WeightedScoreValue::WeightedScore(_)), Some(_)) + | (Some(_), Some(WeightedScoreValue::WeightedScore(_))) + | (Some(WeightedScoreValue::VectorSort(_)), Some(_)) + | (Some(_), Some(WeightedScoreValue::VectorSort(_))) + | (Some(WeightedScoreValue::GeoSort { .. }), Some(WeightedScoreValue::Sort { .. })) + | (Some(WeightedScoreValue::Sort { .. }), Some(WeightedScoreValue::GeoSort { .. })) => { + let left_count = left_it.count(); + let right_count = right_it.count(); + // compare how many remaining groups of rules each side has. + // the group with the most remaining groups wins. + return left_count + .cmp(&right_count) + // breaks ties with the global ranking score + .then_with(|| { + left_weighted_global_score + .partial_cmp(&right_weighted_global_score) + .unwrap() + }); + } + } + } +} diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 7e185e951..1dd16c474 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -19,7 +19,9 @@ use meilisearch_types::locales::Locale; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; -use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; +use meilisearch_types::milli::{ + FacetValueHit, InternalError, OrderBy, PatternMatch, SearchForFacetValues, TimeBudget, +}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; use meilisearch_types::{milli, Document}; use milli::tokenizer::{Language, TokenizerBuilder}; @@ -28,13 +30,19 @@ use milli::{ MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; use regex::Regex; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; +#[cfg(test)] +mod mod_test; +use utoipa::ToSchema; use crate::error::MeilisearchHttpError; mod federated; -pub use federated::{perform_federated_search, FederatedSearch, Federation, FederationOptions}; +pub use federated::{ + perform_federated_search, FederatedSearch, FederatedSearchResult, Federation, + FederationOptions, MergeFacets, PROXY_SEARCH_HEADER, PROXY_SEARCH_HEADER_VALUE, +}; mod ranking_rules; @@ -48,18 +56,20 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "".to_string(); pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "".to_string(); pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5); -#[derive(Clone, Default, PartialEq, Deserr)] +#[derive(Clone, Default, PartialEq, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct SearchQuery { #[deserr(default, error = DeserrJsonError)] pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] + #[schema(default = DEFAULT_SEARCH_OFFSET)] pub offset: usize, #[deserr(default = DEFAULT_SEARCH_LIMIT(), 
error = DeserrJsonError)] + #[schema(default = DEFAULT_SEARCH_LIMIT)] pub limit: usize, #[deserr(default, error = DeserrJsonError)] pub page: Option, @@ -71,15 +81,16 @@ pub struct SearchQuery { pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, - #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] + #[deserr(error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] + #[schema(default = DEFAULT_CROP_LENGTH)] pub crop_length: usize, #[deserr(default, error = DeserrJsonError)] pub attributes_to_highlight: Option>, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub show_matches_position: bool, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub show_ranking_score: bool, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub show_ranking_score_details: bool, #[deserr(default, error = DeserrJsonError)] pub filter: Option, @@ -89,26 +100,28 @@ pub struct SearchQuery { pub distinct: Option, #[deserr(default, error = DeserrJsonError)] pub facets: Option>, - #[deserr(default, error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_PRE_TAG())] + #[deserr(error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_PRE_TAG())] + #[schema(default = DEFAULT_HIGHLIGHT_PRE_TAG)] pub highlight_pre_tag: String, - #[deserr(default, error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_POST_TAG())] + #[deserr(error = DeserrJsonError, default = DEFAULT_HIGHLIGHT_POST_TAG())] + #[schema(default = DEFAULT_HIGHLIGHT_POST_TAG)] pub highlight_post_tag: String, - #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_MARKER())] + #[deserr(error = DeserrJsonError, default = DEFAULT_CROP_MARKER())] + #[schema(default = DEFAULT_CROP_MARKER)] pub crop_marker: String, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub matching_strategy: MatchingStrategy, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub attributes_to_search_on: Option>, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub ranking_score_threshold: Option, - #[deserr(default, error = DeserrJsonError, default)] + #[deserr(default, error = DeserrJsonError)] pub locales: Option>, } -#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[derive(Debug, Clone, Copy, PartialEq, Deserr, ToSchema, Serialize)] #[deserr(try_from(f64) = TryFrom::try_from -> InvalidSearchRankingScoreThreshold)] pub struct RankingScoreThreshold(f64); - impl std::convert::TryFrom for RankingScoreThreshold { type Error = InvalidSearchRankingScoreThreshold; @@ -262,12 +275,15 @@ impl fmt::Debug for SearchQuery { } } -#[derive(Debug, Clone, Default, PartialEq, Deserr)] -#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[derive(Debug, Clone, Default, PartialEq, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] pub struct HybridQuery { #[deserr(default, error = DeserrJsonError, default)] + #[schema(value_type = f32, default)] + #[serde(default)] pub semantic_ratio: SemanticRatio, - #[deserr(error = DeserrJsonError)] + #[deserr(error = DeserrJsonError)] pub embedder: String, } @@ -281,39 +297,63 @@ pub enum SearchKind { impl SearchKind { pub(crate) fn semantic( index_scheduler: 
&index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, ) -> Result { - let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + let (embedder_name, embedder, quantized) = Self::embedder( + index_scheduler, + index_uid, + index, + embedder_name, + vector_len, + Route::Search, + )?; Ok(Self::SemanticOnly { embedder_name, embedder, quantized }) } pub(crate) fn hybrid( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, semantic_ratio: f32, vector_len: Option, ) -> Result { - let (embedder_name, embedder, quantized) = - Self::embedder(index_scheduler, index, embedder_name, vector_len)?; + let (embedder_name, embedder, quantized) = Self::embedder( + index_scheduler, + index_uid, + index, + embedder_name, + vector_len, + Route::Search, + )?; Ok(Self::Hybrid { embedder_name, embedder, quantized, semantic_ratio }) } pub(crate) fn embedder( index_scheduler: &index_scheduler::IndexScheduler, + index_uid: String, index: &Index, embedder_name: &str, vector_len: Option, + route: Route, ) -> Result<(String, Arc, bool), ResponseError> { - let embedder_configs = index.embedding_configs(&index.read_txn()?)?; - let embedders = index_scheduler.embedders(embedder_configs)?; + let rtxn = index.read_txn()?; + let embedder_configs = index.embedding_configs(&rtxn)?; + let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; let (embedder, _, quantized) = embedders .get(embedder_name) - .ok_or(milli::UserError::InvalidEmbedder(embedder_name.to_owned())) + .ok_or(match route { + Route::Search | Route::MultiSearch => { + milli::UserError::InvalidSearchEmbedder(embedder_name.to_owned()) + } + Route::Similar => { + milli::UserError::InvalidSimilarEmbedder(embedder_name.to_owned()) + } + }) .map_err(milli::Error::from)?; if let Some(vector_len) = vector_len { @@ -332,7 +372,7 @@ impl SearchKind { } } -#[derive(Debug, Clone, Copy, PartialEq, Deserr)] +#[derive(Debug, Clone, Copy, PartialEq, Deserr, Serialize)] #[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)] pub struct SemanticRatio(f32); @@ -374,8 +414,10 @@ impl SearchQuery { // This struct contains the fields of `SearchQuery` inline. // This is because neither deserr nor serde support `flatten` when using `deny_unknown_fields. // The `From` implementation ensures both structs remain up to date. 
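As the comment above notes, `SearchQueryWithIndex` repeats the fields of `SearchQuery` instead of flattening them. A minimal sketch of that pattern, using hypothetical `Inner`/`WithIndex` types rather than the real structs from this patch, shows why the `From` implementation keeps the two definitions in lockstep:

// Illustrative only: `Inner` and `WithIndex` stand in for `SearchQuery`
// and `SearchQueryWithIndex`; they are not part of this patch.
struct Inner {
    q: Option<String>,
    limit: usize,
}

struct WithIndex {
    index_uid: String,
    // fields are repeated inline because `#[serde(flatten)]` cannot be
    // combined with `deny_unknown_fields`
    q: Option<String>,
    limit: usize,
}

impl From<(String, Inner)> for WithIndex {
    fn from((index_uid, inner): (String, Inner)) -> Self {
        // exhaustive destructuring: adding a field to `Inner` fails to compile
        // here until the new field is forwarded, keeping both structs in sync
        let Inner { q, limit } = inner;
        WithIndex { index_uid, q, limit }
    }
}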
-#[derive(Debug, Clone, PartialEq, Deserr)] +#[derive(Debug, Clone, Serialize, PartialEq, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct SearchQueryWithIndex { #[deserr(error = DeserrJsonError, missing_field_error = DeserrJsonError::missing_index_uid)] pub index_uid: IndexUid, @@ -383,7 +425,7 @@ pub struct SearchQueryWithIndex { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] pub offset: Option, @@ -455,6 +497,72 @@ impl SearchQueryWithIndex { self.facets.as_deref().filter(|v| !v.is_empty()) } + pub fn from_index_query_federation( + index_uid: IndexUid, + query: SearchQuery, + federation_options: Option, + ) -> Self { + let SearchQuery { + q, + vector, + hybrid, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve, + retrieve_vectors, + attributes_to_crop, + crop_length, + attributes_to_highlight, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + ranking_score_threshold, + locales, + } = query; + + SearchQueryWithIndex { + index_uid, + q, + vector, + hybrid, + offset: if offset == DEFAULT_SEARCH_OFFSET() { None } else { Some(offset) }, + limit: if limit == DEFAULT_SEARCH_LIMIT() { None } else { Some(limit) }, + page, + hits_per_page, + attributes_to_retrieve, + retrieve_vectors, + attributes_to_crop, + crop_length, + attributes_to_highlight, + show_ranking_score, + show_ranking_score_details, + show_matches_position, + filter, + sort, + distinct, + facets, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + ranking_score_threshold, + locales, + federation_options, + } + } + pub fn into_index_query_federation(self) -> (IndexUid, SearchQuery, Option) { let SearchQueryWithIndex { index_uid, @@ -523,18 +631,19 @@ impl SearchQueryWithIndex { } } -#[derive(Debug, Clone, PartialEq, Deserr)] +#[derive(Debug, Clone, PartialEq, Deserr, ToSchema)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] pub struct SimilarQuery { #[deserr(error = DeserrJsonError)] - pub id: ExternalDocumentId, + #[schema(value_type = String)] + pub id: serde_json::Value, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] pub offset: usize, #[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError)] pub limit: usize, #[deserr(default, error = DeserrJsonError)] pub filter: Option, - #[deserr(error = DeserrJsonError)] + #[deserr(error = DeserrJsonError)] pub embedder: String, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, @@ -545,11 +654,11 @@ pub struct SimilarQuery { #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score_details: bool, #[deserr(default, error = DeserrJsonError, default)] + #[schema(value_type = f64)] pub ranking_score_threshold: Option, } -#[derive(Debug, Clone, PartialEq, Deserr)] -#[deserr(try_from(Value) = TryFrom::try_from -> InvalidSimilarId)] +#[derive(Debug, Clone, PartialEq)] pub struct ExternalDocumentId(String); impl AsRef for ExternalDocumentId { @@ -565,7 +674,7 @@ impl ExternalDocumentId { } impl TryFrom for 
ExternalDocumentId { - type Error = InvalidSimilarId; + type Error = milli::UserError; fn try_from(value: String) -> Result { serde_json::Value::String(value).try_into() @@ -573,15 +682,16 @@ impl TryFrom for ExternalDocumentId { } impl TryFrom for ExternalDocumentId { - type Error = InvalidSimilarId; + type Error = milli::UserError; fn try_from(value: Value) -> Result { - Ok(Self(milli::documents::validate_document_id_value(value).map_err(|_| InvalidSimilarId)?)) + Ok(Self(milli::documents::validate_document_id_value(value)?)) } } -#[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, ToSchema, Serialize)] #[deserr(rename_all = camelCase)] +#[serde(rename_all = "camelCase")] pub enum MatchingStrategy { /// Remove query words from last to first Last, @@ -627,22 +737,25 @@ impl From for OrderBy { } } -#[derive(Debug, Clone, Serialize, PartialEq)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, ToSchema)] pub struct SearchHit { #[serde(flatten)] + #[schema(additional_properties, inline, value_type = HashMap)] pub document: Document, - #[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")] + #[serde(default, rename = "_formatted", skip_serializing_if = "Document::is_empty")] + #[schema(additional_properties, value_type = HashMap)] pub formatted: Document, - #[serde(rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_matchesPosition", skip_serializing_if = "Option::is_none")] pub matches_position: Option, - #[serde(rename = "_rankingScore", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_rankingScore", skip_serializing_if = "Option::is_none")] pub ranking_score: Option, - #[serde(rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] + #[serde(default, rename = "_rankingScoreDetails", skip_serializing_if = "Option::is_none")] pub ranking_score_details: Option>, } -#[derive(Serialize, Clone, PartialEq)] +#[derive(Serialize, Clone, PartialEq, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct SearchResult { pub hits: Vec, pub query: String, @@ -650,6 +763,7 @@ pub struct SearchResult { #[serde(flatten)] pub hits_info: HitsInfo, #[serde(skip_serializing_if = "Option::is_none")] + #[schema(value_type = Option>)] pub facet_distribution: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option>, @@ -704,7 +818,7 @@ impl fmt::Debug for SearchResult { } } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Debug, Clone, PartialEq, ToSchema)] #[serde(rename_all = "camelCase")] pub struct SimilarResult { pub hits: Vec, @@ -714,24 +828,27 @@ pub struct SimilarResult { pub hits_info: HitsInfo, } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Debug, Clone, PartialEq, ToSchema)] #[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] pub struct SearchResultWithIndex { pub index_uid: String, #[serde(flatten)] pub result: SearchResult, } -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, ToSchema)] #[serde(untagged)] pub enum HitsInfo { #[serde(rename_all = "camelCase")] + #[schema(rename_all = "camelCase")] Pagination { hits_per_page: usize, page: usize, total_pages: usize, total_hits: usize }, #[serde(rename_all = "camelCase")] + #[schema(rename_all = "camelCase")] OffsetLimit { limit: usize, offset: usize, estimated_total_hits: usize }, } 
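`HitsInfo` above is an untagged enum, so the variant name never appears in the response body; which shape the client receives depends only on the fields present. A small, self-contained sketch of that behaviour (the variants mirror the ones above, but the snippet is illustrative and assumes the `serde` and `serde_json` crates):

use serde::Serialize;

#[derive(Serialize)]
#[serde(untagged)]
enum HitsInfoSketch {
    #[serde(rename_all = "camelCase")]
    Pagination { hits_per_page: usize, page: usize, total_pages: usize, total_hits: usize },
    #[serde(rename_all = "camelCase")]
    OffsetLimit { limit: usize, offset: usize, estimated_total_hits: usize },
}

fn main() {
    let exhaustive =
        HitsInfoSketch::Pagination { hits_per_page: 20, page: 1, total_pages: 3, total_hits: 42 };
    let estimated = HitsInfoSketch::OffsetLimit { limit: 20, offset: 0, estimated_total_hits: 42 };
    // prints {"hitsPerPage":20,"page":1,"totalPages":3,"totalHits":42}
    println!("{}", serde_json::to_string(&exhaustive).unwrap());
    // prints {"limit":20,"offset":0,"estimatedTotalHits":42}
    println!("{}", serde_json::to_string(&estimated).unwrap());
}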
-#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, ToSchema)] pub struct FacetStats { pub min: f64, pub max: f64, @@ -799,7 +916,7 @@ fn prepare_search<'t>( let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); embedder - .embed_one(query.q.clone().unwrap(), Some(deadline)) + .embed_search(query.q.as_ref().unwrap(), Some(deadline)) .map_err(milli::vector::Error::from) .map_err(milli::Error::from)? } @@ -890,6 +1007,7 @@ fn prepare_search<'t>( } pub fn perform_search( + index_uid: String, index: &Index, query: SearchQuery, search_kind: SearchKind, @@ -916,7 +1034,7 @@ pub fn perform_search( used_negative_operator, }, semantic_hit_count, - ) = search_from_kind(search_kind, search)?; + ) = search_from_kind(index_uid, search_kind, search)?; let SearchQuery { q, @@ -1013,15 +1131,17 @@ pub fn perform_search( Ok(result) } -#[derive(Debug, Clone, Default, Serialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] pub struct ComputedFacets { + #[schema(value_type = BTreeMap>)] pub distribution: BTreeMap>, pub stats: BTreeMap, } -enum Route { +pub enum Route { Search, MultiSearch, + Similar, } fn compute_facet_distribution_stats>( @@ -1069,17 +1189,27 @@ fn compute_facet_distribution_stats>( } pub fn search_from_kind( + index_uid: String, search_kind: SearchKind, search: milli::Search<'_>, ) -> Result<(milli::SearchResult, Option), MeilisearchHttpError> { let (milli_result, semantic_hit_count) = match &search_kind { - SearchKind::KeywordOnly => (search.execute()?, None), + SearchKind::KeywordOnly => { + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; + (results, None) + } SearchKind::SemanticOnly { .. } => { - let results = search.execute()?; + let results = search + .execute() + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?; let semantic_hit_count = results.document_scores.len() as u32; (results, Some(semantic_hit_count)) } - SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?, + SearchKind::Hybrid { semantic_ratio, .. 
} => search + .execute_hybrid(*semantic_ratio) + .map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid)))?, }; Ok((milli_result, semantic_hit_count)) } @@ -1102,10 +1232,6 @@ struct AttributesFormat { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum RetrieveVectors { - /// Do not touch the `_vectors` field - /// - /// this is the behavior when the vectorStore feature is disabled - Ignore, /// Remove the `_vectors` field /// /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false` @@ -1117,15 +1243,11 @@ pub enum RetrieveVectors { } impl RetrieveVectors { - pub fn new( - retrieve_vector: bool, - features: index_scheduler::RoFeatures, - ) -> Result { - match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) { - (true, Ok(())) => Ok(Self::Retrieve), - (true, Err(error)) => Err(error), - (false, Ok(())) => Ok(Self::Hide), - (false, Err(_)) => Ok(Self::Ignore), + pub fn new(retrieve_vector: bool) -> Self { + if retrieve_vector { + Self::Retrieve + } else { + Self::Hide } } } @@ -1181,7 +1303,7 @@ impl<'a> HitMaker<'a> { rtxn: &'a RoTxn<'a>, format: AttributesFormat, mut formatter_builder: MatcherBuilder<'a>, - ) -> Result { + ) -> milli::Result { formatter_builder.crop_marker(format.crop_marker); formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); @@ -1191,8 +1313,7 @@ impl<'a> HitMaker<'a> { .displayed_fields_ids(rtxn)? .map(|fields| fields.into_iter().collect::>()); - let vectors_fid = - fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let vectors_fid = fields_ids_map.id(milli::constants::RESERVED_VECTORS_FIELD_NAME); let vectors_is_hidden = match (&displayed_ids, vectors_fid) { // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid @@ -1201,8 +1322,7 @@ impl<'a> HitMaker<'a> { (Some(_), None) => { // unwrap as otherwise we'd go to the first one let displayed_names = index.displayed_fields(rtxn)?.unwrap(); - !displayed_names - .contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME) + !displayed_names.contains(&milli::constants::RESERVED_VECTORS_FIELD_NAME) } // displayed_ids is a finit list, so hide if `_vectors` is not part of it (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), @@ -1276,11 +1396,7 @@ impl<'a> HitMaker<'a> { }) } - pub fn make_hit( - &self, - id: u32, - score: &[ScoreDetails], - ) -> Result { + pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result { let (_, obkv) = self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; @@ -1323,7 +1439,10 @@ impl<'a> HitMaker<'a> { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; - vectors.insert(name, serde_json::to_value(embeddings)?); + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, + ); } document.insert("_vectors".into(), vectors.into()); } @@ -1369,7 +1488,7 @@ fn make_hits<'a>( format: AttributesFormat, matching_words: milli::MatchingWords, documents_ids_scores: impl Iterator)> + 'a, -) -> Result, MeilisearchHttpError> { +) -> milli::Result> { let mut documents = Vec::new(); let dictionary = index.dictionary(rtxn)?; @@ -1407,17 +1526,25 @@ pub fn perform_facet_search( None => TimeBudget::default(), }; + if !index.facet_search(&rtxn)? 
{ + return Err(ResponseError::from_msg( + "The facet search is disabled for this index".to_string(), + Code::FacetSearchDisabled, + )); + } + // In the faceted search context, we want to use the intersection between the locales provided by the user // and the locales of the facet string. // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale. // If the user does not provide locales, we use the locales of the facet string. let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - let localized_attributes_locales = - localized_attributes.into_iter().find(|attr| attr.match_str(&facet_name)); + let localized_attributes_locales = localized_attributes + .into_iter() + .find(|attr| attr.match_str(&facet_name) == PatternMatch::Match); let locales = localized_attributes_locales.map(|attr| { attr.locales .into_iter() - .filter(|locale| locales.as_ref().map_or(true, |locales| locales.contains(locale))) + .filter(|locale| locales.as_ref().is_none_or(|locales| locales.contains(locale))) .collect() }); @@ -1471,6 +1598,11 @@ pub fn perform_similar( ranking_score_threshold, } = query; + let id: ExternalDocumentId = id.try_into().map_err(|error| { + let msg = format!("Invalid value at `.id`: {error}"); + ResponseError::from_msg(msg, Code::InvalidSimilarId) + })?; + // using let-else rather than `?` so that the borrow checker identifies we're always returning here, // preventing a use-after-move let Some(internal_id) = index.external_documents_ids().get(&rtxn, &id)? else { @@ -1557,7 +1689,7 @@ pub fn perform_similar( Ok(result) } -fn insert_geo_distance(sorts: &[String], document: &mut Document) { +pub fn insert_geo_distance(sorts: &[String], document: &mut Document) { lazy_static::lazy_static! { static ref GEO_REGEX: Regex = Regex::new(r"_geoPoint\(\s*([[:digit:].\-]+)\s*,\s*([[:digit:].\-]+)\s*\)").unwrap(); @@ -1690,12 +1822,12 @@ fn make_document( displayed_attributes: &BTreeSet, field_ids_map: &FieldsIdsMap, obkv: &obkv::KvReaderU16, -) -> Result { +) -> milli::Result { let mut document = serde_json::Map::new(); // recreate the original json for (key, value) in obkv.iter() { - let value = serde_json::from_slice(value)?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; let key = field_ids_map.name(key).expect("Missing field name").to_string(); document.insert(key, value); @@ -1720,7 +1852,7 @@ fn format_fields( displayable_ids: &BTreeSet, locales: Option<&[Language]>, localized_attributes: &[LocalizedAttributesRule], -) -> Result<(Option, Document), MeilisearchHttpError> { +) -> milli::Result<(Option, Document)> { let mut matches_position = compute_matches.then(BTreeMap::new); let mut document = document.clone(); @@ -1759,7 +1891,7 @@ fn format_fields( let locales = locales.or_else(|| { localized_attributes .iter() - .find(|rule| rule.match_str(key)) + .find(|rule| rule.match_str(key) == PatternMatch::Match) .map(LocalizedAttributesRule::locales) }); @@ -1898,121 +2030,5 @@ fn parse_filter_array(arr: &[Value]) -> Result, MeilisearchHttpEr } } - Ok(Filter::from_array(ands)?) 
-} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_insert_geo_distance() { - let value: Document = serde_json::from_str( - r#"{ - "_geo": { - "lat": 50.629973371633746, - "lng": 3.0569447399419567 - }, - "city": "Lille", - "id": "1" - }"#, - ) - .unwrap(); - - let sorters = &["_geoPoint(50.629973371633746,3.0569447399419567):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = &["_geoPoint(50.629973371633746, 3.0569447399419567):asc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = - &["_geoPoint( 50.629973371633746 , 3.0569447399419567 ):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let sorters = &[ - "prix:asc", - "villeneuve:desc", - "_geoPoint(50.629973371633746, 3.0569447399419567):asc", - "ubu:asc", - ] - .map(|s| s.to_string()); - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - // only the first geoPoint is used to compute the distance - let sorters = &[ - "chien:desc", - "_geoPoint(50.629973371633746, 3.0569447399419567):asc", - "pangolin:desc", - "_geoPoint(100.0, -80.0):asc", - "chat:asc", - ] - .map(|s| s.to_string()); - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - // there was no _geoPoint so nothing is inserted in the document - let sorters = &["chien:asc".to_string()]; - let mut document = value; - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), None); - } - - #[test] - fn test_insert_geo_distance_with_coords_as_string() { - let value: Document = serde_json::from_str( - r#"{ - "_geo": { - "lat": "50", - "lng": 3 - } - }"#, - ) - .unwrap(); - - let sorters = &["_geoPoint(50,3):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let value: Document = serde_json::from_str( - r#"{ - "_geo": { - "lat": "50", - "lng": "3" - }, - "id": "1" - }"#, - ) - .unwrap(); - - let sorters = &["_geoPoint(50,3):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - - let value: Document = serde_json::from_str( - r#"{ - "_geo": { - "lat": 50, - "lng": "3" - }, - "id": "1" - }"#, - ) - .unwrap(); - - let sorters = &["_geoPoint(50,3):desc".to_string()]; - let mut document = value.clone(); - insert_geo_distance(sorters, &mut document); - assert_eq!(document.get("_geoDistance"), Some(&json!(0))); - } + Filter::from_array(ands).map_err(|e| MeilisearchHttpError::from_milli(e, None)) } diff --git a/crates/meilisearch/src/search/mod_test.rs b/crates/meilisearch/src/search/mod_test.rs new file mode 100644 index 000000000..d65255801 --- /dev/null +++ b/crates/meilisearch/src/search/mod_test.rs @@ -0,0 +1,114 @@ +use meilisearch_types::Document; +use serde_json::json; + +use crate::search::insert_geo_distance; + +#[test] +fn test_insert_geo_distance() { + let value: Document = serde_json::from_str( + r#"{ + "_geo": { + "lat": 50.629973371633746, + "lng": 
3.0569447399419567 + }, + "city": "Lille", + "id": "1" + }"#, + ) + .unwrap(); + + let sorters = &["_geoPoint(50.629973371633746,3.0569447399419567):desc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + let sorters = &["_geoPoint(50.629973371633746, 3.0569447399419567):asc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + let sorters = &["_geoPoint( 50.629973371633746 , 3.0569447399419567 ):desc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + let sorters = &[ + "prix:asc", + "villeneuve:desc", + "_geoPoint(50.629973371633746, 3.0569447399419567):asc", + "ubu:asc", + ] + .map(|s| s.to_string()); + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + // only the first geoPoint is used to compute the distance + let sorters = &[ + "chien:desc", + "_geoPoint(50.629973371633746, 3.0569447399419567):asc", + "pangolin:desc", + "_geoPoint(100.0, -80.0):asc", + "chat:asc", + ] + .map(|s| s.to_string()); + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + // there was no _geoPoint so nothing is inserted in the document + let sorters = &["chien:asc".to_string()]; + let mut document = value; + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), None); +} + +#[test] +fn test_insert_geo_distance_with_coords_as_string() { + let value: Document = serde_json::from_str( + r#"{ + "_geo": { + "lat": "50", + "lng": 3 + } + }"#, + ) + .unwrap(); + + let sorters = &["_geoPoint(50,3):desc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + let value: Document = serde_json::from_str( + r#"{ + "_geo": { + "lat": "50", + "lng": "3" + }, + "id": "1" + }"#, + ) + .unwrap(); + + let sorters = &["_geoPoint(50,3):desc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); + + let value: Document = serde_json::from_str( + r#"{ + "_geo": { + "lat": 50, + "lng": "3" + }, + "id": "1" + }"#, + ) + .unwrap(); + + let sorters = &["_geoPoint(50,3):desc".to_string()]; + let mut document = value.clone(); + insert_geo_distance(sorters, &mut document); + assert_eq!(document.get("_geoDistance"), Some(&json!(0))); +} diff --git a/crates/meilisearch/src/upgrade/mod.rs b/crates/meilisearch/src/upgrade/mod.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/crates/meilisearch/src/upgrade/mod.rs @@ -0,0 +1 @@ + diff --git a/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump b/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump new file mode 100644 index 000000000..a1816b79a Binary files /dev/null and b/crates/meilisearch/tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump differ diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index a465b022a..4de5c01ae 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs 
@@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, `dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, `dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `*`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/authorization.rs b/crates/meilisearch/tests/auth/authorization.rs index cb2510663..5ac922ef5 100644 --- a/crates/meilisearch/tests/auth/authorization.rs +++ b/crates/meilisearch/tests/auth/authorization.rs @@ -68,6 +68,8 @@ pub static AUTHORIZATIONS: Lazy hashset!{"keys.get", "*"}, ("GET", "/experimental-features") => hashset!{"experimental.get", "*"}, ("PATCH", "/experimental-features") => hashset!{"experimental.update", "*"}, + ("GET", "/network") => hashset!{"network.get", "*"}, + ("PATCH", "/network") => hashset!{"network.update", "*"}, }; authorizations diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index 4ba9874bc..42891acad 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, `dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `*`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `search`, `documents.add`, `documents.get`, `documents.delete`, `documents.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `indexes.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `tasks.*`, `settings.get`, `settings.update`, `settings.*`, `stats.get`, `stats.*`, `metrics.get`, `metrics.*`, 
`dumps.create`, `dumps.*`, `snapshots.create`, `snapshots.*`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `*`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/tenant_token.rs b/crates/meilisearch/tests/auth/tenant_token.rs index 2e3b228d3..a3f89e70b 100644 --- a/crates/meilisearch/tests/auth/tenant_token.rs +++ b/crates/meilisearch/tests/auth/tenant_token.rs @@ -99,12 +99,12 @@ macro_rules! compute_authorized_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; - index + let (task1,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task1.uid()).await.succeeded(); + let (task2,_status_code) = index .update_settings(json!({"filterableAttributes": ["color"]})) .await; - index.wait_task(1).await; + index.wait_task(task2.uid()).await.succeeded(); drop(index); for key_content in ACCEPTED_KEYS.iter() { @@ -146,8 +146,8 @@ macro_rules! compute_forbidden_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); for key_content in $parent_keys.iter() { diff --git a/crates/meilisearch/tests/auth/tenant_token_multi_search.rs b/crates/meilisearch/tests/auth/tenant_token_multi_search.rs index e994aa3bc..9059299f3 100644 --- a/crates/meilisearch/tests/auth/tenant_token_multi_search.rs +++ b/crates/meilisearch/tests/auth/tenant_token_multi_search.rs @@ -267,22 +267,22 @@ macro_rules! compute_authorized_single_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; - index + let (add_task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(add_task.uid()).await.succeeded(); + let (update_task,_status_code) = index .update_settings(json!({"filterableAttributes": ["color"]})) .await; - index.wait_task(1).await; + index.wait_task(update_task.uid()).await.succeeded(); drop(index); let index = server.index("products"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; - index + let (add_task2,_status_code) = index.add_documents(documents, None).await; + index.wait_task(add_task2.uid()).await.succeeded(); + let (update_task2,_status_code) = index .update_settings(json!({"filterableAttributes": ["doggos"]})) .await; - index.wait_task(3).await; + index.wait_task(update_task2.uid()).await.succeeded(); drop(index); @@ -338,22 +338,22 @@ macro_rules! 
compute_authorized_multiple_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["color"]})) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); let index = server.index("products"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["doggos"]})) .await; - index.wait_task(3).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); @@ -422,22 +422,22 @@ macro_rules! compute_forbidden_single_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["color"]})) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); let index = server.index("products"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["doggos"]})) .await; - index.wait_task(3).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); assert_eq!($parent_keys.len(), $failed_query_indexes.len(), "keys != query_indexes"); @@ -498,22 +498,22 @@ macro_rules! 
compute_forbidden_multiple_search { server.use_admin_key("MASTER_KEY").await; let index = server.index("sales"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["color"]})) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); let index = server.index("products"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; - index + let (task,_status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task,_status_code) = index .update_settings(json!({"filterableAttributes": ["doggos"]})) .await; - index.wait_task(3).await; + index.wait_task(task.uid()).await.succeeded(); drop(index); assert_eq!($parent_keys.len(), $failed_query_indexes.len(), "keys != query_indexes"); diff --git a/crates/meilisearch/tests/batches/errors.rs b/crates/meilisearch/tests/batches/errors.rs index 2c3484bc1..7f5fedb6a 100644 --- a/crates/meilisearch/tests/batches/errors.rs +++ b/crates/meilisearch/tests/batches/errors.rs @@ -42,7 +42,7 @@ async fn batch_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 799aa3df7..468963631 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -10,8 +10,8 @@ use crate::json; async fn error_get_unexisting_batch_status() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _coder) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_batch(1).await; let expected_response = json!({ @@ -29,8 +29,8 @@ async fn error_get_unexisting_batch_status() { async fn get_batch_status() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); let (_response, code) = index.get_batch(0).await; assert_eq!(code, 200); } @@ -39,11 +39,10 @@ async fn get_batch_status() { async fn list_batches() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); let (response, code) = index.list_batches().await; assert_eq!(code, 200); assert_eq!( @@ -96,11 +95,12 @@ async fn list_batches_pagination_and_reverse() { async fn list_batches_with_star_filters() { let server = Server::new().await; let index = server.index("test"); - let (batch, _code) = index.create(None).await; - index.wait_task(batch.uid()).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let index = server.index("test"); + let (task, _code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.service.get("/batches?indexUids=test").await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 2); @@ -141,25 +141,21 @@ async fn list_batches_with_star_filters() { async fn list_batches_status_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); let (response, code) = index.filtered_batches(&[], &["succeeded"], &[]).await; assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 1); - // We can't be sure that the update isn't already processed so 
we can't test this - // let (response, code) = index.filtered_batches(&[], &["processing"]).await; - // assert_eq!(code, 200, "{}", response); - // assert_eq!(response["results"].as_array().unwrap().len(), 1); - - index.wait_task(1).await; - let (response, code) = index.filtered_batches(&[], &["succeeded"], &[]).await; assert_eq!(code, 200, "{}", response); + assert_eq!(response["results"].as_array().unwrap().len(), 1); + + let (response, code) = index.filtered_batches(&[], &["succeeded", "failed"], &[]).await; + assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 2); } @@ -167,31 +163,30 @@ async fn list_batches_status_filtered() { async fn list_batches_type_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; - + let (task, _) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.delete().await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.filtered_batches(&["indexCreation"], &[], &[]).await; assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 1); let (response, code) = - index.filtered_batches(&["indexCreation", "documentAdditionOrUpdate"], &[], &[]).await; + index.filtered_batches(&["indexCreation", "IndexDeletion"], &[], &[]).await; assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 2); + + let (response, code) = index.filtered_batches(&["indexCreation"], &[], &[]).await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["results"].as_array().unwrap().len(), 1); } #[actix_rt::test] async fn list_batches_invalid_canceled_by_filter() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.filtered_batches(&[], &[], &["0"]).await; assert_eq!(code, 200, "{}", response); @@ -202,11 +197,10 @@ async fn list_batches_invalid_canceled_by_filter() { async fn list_batches_status_and_type_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index.update(Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.filtered_batches(&["indexCreation"], &["failed"], &[]).await; assert_eq!(code, 200, "{}", response); @@ -214,7 +208,7 @@ async fn list_batches_status_and_type_filtered() { let (response, code) = index .filtered_batches( - &["indexCreation", "documentAdditionOrUpdate"], + &["indexCreation", "IndexUpdate"], &["succeeded", "processing", "enqueued"], &[], ) @@ -224,7 +218,7 @@ async fn list_batches_status_and_type_filtered() { } #[actix_rt::test] -async fn get_batch_filter_error() { +async fn list_batch_filter_error() { let server = Server::new().await; let (response, code) = 
server.batches_filter("lol=pied").await; @@ -276,14 +270,23 @@ async fn get_batch_filter_error() { async fn test_summarized_document_addition_or_update() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), None).await; - index.wait_task(0).await; + let (task, _status_code) = + index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), None).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -298,22 +301,33 @@ async fn test_summarized_document_addition_or_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]", + "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); - index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), Some("id")).await; - index.wait_task(1).await; + let (task, _status_code) = + index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 1, + "progress": null, "details": { "receivedDocuments": 1, "indexedDocuments": 1 @@ -328,27 +342,37 @@ async fn test_summarized_document_addition_or_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]", + "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] async fn test_summarized_delete_documents_by_batch() { let server = Server::new().await; let index = server.index("test"); - index.delete_batch(vec![1, 2, 3]).await; - index.wait_task(0).await; + let (task, _status_code) = index.delete_batch(vec![1, 2, 3]).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "providedIds": 3, "deletedDocuments": 0 @@ -363,23 +387,32 @@ async fn test_summarized_delete_documents_by_batch() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); 
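The `assert_json_snapshot!` calls in these tests pass a redaction map so that run-dependent fields (durations, dates, progress traces) compare stably. A hedged, stand-alone illustration of that mechanism with a made-up value; the selectors mirror the ones used in this patch, but the value and inline snapshot below are illustrative, not taken from a real batch:

// Redactions replace volatile values before the JSON is compared
// to the inline snapshot.
let value = serde_json::json!({
    "duration": "PT0.187S",
    "stats": { "progressTrace": ["processing tasks", "writing to database"] },
    "uid": 0,
});
insta::assert_json_snapshot!(value,
    { ".duration" => "[duration]", ".stats.progressTrace" => "[progressTrace]" },
    @r###"
{
  "duration": "[duration]",
  "stats": {
    "progressTrace": "[progressTrace]"
  },
  "uid": 0
}
"###);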
index.create(None).await; - index.delete_batch(vec![42]).await; - index.wait_task(2).await; + let (task, _status_code) = index.delete_batch(vec![42]).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(2).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -394,13 +427,14 @@ async fn test_summarized_delete_documents_by_batch() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] @@ -408,14 +442,23 @@ async fn test_summarized_delete_documents_by_filter() { let server = Server::new().await; let index = server.index("test"); - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(0).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -431,23 +474,33 @@ async fn test_summarized_delete_documents_by_filter() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); index.create(None).await; - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(2).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(2).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 2, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -463,23 +516,33 @@ async fn test_summarized_delete_documents_by_filter() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); index.update_settings(json!({ "filterableAttributes": ["doggo"] })).await; - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(4).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + 
index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(4).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, @r#" { "uid": 4, + "progress": null, "details": { "providedIds": 0, "deletedDocuments": 0, @@ -495,7 +558,8 @@ async fn test_summarized_delete_documents_by_filter() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -508,14 +572,23 @@ async fn test_summarized_delete_documents_by_filter() { async fn test_summarized_delete_document_by_id() { let server = Server::new().await; let index = server.index("test"); - index.delete_document(1).await; - index.wait_task(0).await; + let (task, _status_code) = index.delete_document(1).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, + { + ".uid" => "[uid]", + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, @r#" { - "uid": 0, + "uid": "[uid]", + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -530,7 +603,8 @@ async fn test_summarized_delete_document_by_id() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -539,14 +613,22 @@ async fn test_summarized_delete_document_by_id() { "#); index.create(None).await; - index.delete_document(42).await; - index.wait_task(2).await; + let (task, _status_code) = index.delete_document(42).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(2).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, @r#" { "uid": 2, + "progress": null, "details": { "providedIds": 1, "deletedDocuments": 0 @@ -561,7 +643,8 @@ async fn test_summarized_delete_document_by_id() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -586,14 +669,22 @@ async fn test_summarized_settings_update() { } "###); - index.update_settings(json!({ "displayedAttributes": ["doggos", "name"], "filterableAttributes": ["age", "nb_paw_pads"], "sortableAttributes": ["iq"] })).await; - index.wait_task(0).await; + let (task,_status_code) = index.update_settings(json!({ "displayedAttributes": ["doggos", "name"], "filterableAttributes": ["age", "nb_paw_pads"], "sortableAttributes": ["iq"] })).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => 
"[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "displayedAttributes": [ "doggos", @@ -617,27 +708,36 @@ async fn test_summarized_settings_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] async fn test_summarized_index_creation() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -649,22 +749,31 @@ async fn test_summarized_index_creation() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); - index.create(Some("doggos")).await; - index.wait_task(1).await; + let (task, _status_code) = index.create(Some("doggos")).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 1, + "progress": null, "details": { "primaryKey": "doggos" }, @@ -678,13 +787,14 @@ async fn test_summarized_index_creation() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] @@ -692,7 +802,7 @@ async fn test_summarized_index_deletion() { let server = Server::new().await; let index = server.index("test"); let (ret, _code) = index.delete().await; - let batch = index.wait_task(ret.uid()).await; + let batch = index.wait_task(ret.uid()).await.failed(); snapshot!(batch, @r###" { @@ -723,7 +833,7 @@ async fn test_summarized_index_deletion() { // both batches may get autobatched and the deleted documents count will be wrong. 
let (ret, _code) = index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), Some("id")).await; - let batch = index.wait_task(ret.uid()).await; + let batch = index.wait_task(ret.uid()).await.succeeded(); snapshot!(batch, @r###" { @@ -746,7 +856,7 @@ async fn test_summarized_index_deletion() { "###); let (ret, _code) = index.delete().await; - let batch = index.wait_task(ret.uid()).await; + let batch = index.wait_task(ret.uid()).await.succeeded(); snapshot!(batch, @r###" { @@ -769,7 +879,7 @@ async fn test_summarized_index_deletion() { // What happens when you delete an index that doesn't exists. let (ret, _code) = index.delete().await; - let batch = index.wait_task(ret.uid()).await; + let batch = index.wait_task(ret.uid()).await.failed(); snapshot!(batch, @r###" { @@ -801,14 +911,22 @@ async fn test_summarized_index_update() { let server = Server::new().await; let index = server.index("test"); // If the index doesn't exist yet, we should get errors with or without the primary key. - index.update(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.update(None).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -820,22 +938,31 @@ async fn test_summarized_index_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); - index.update(Some("bones")).await; - index.wait_task(1).await; + let (task, _status_code) = index.update(Some("bones")).await; + index.wait_task(task.uid()).await.failed(); let (batch, _) = index.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 1, + "progress": null, "details": { "primaryKey": "bones" }, @@ -849,25 +976,34 @@ async fn test_summarized_index_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); // And run the same two tests once the index do exists. 
index.create(None).await; - index.update(None).await; - index.wait_task(3).await; + let (task, _status_code) = index.update(None).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(3).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, @r#" { "uid": 3, + "progress": null, "details": {}, "stats": { "totalNbTasks": 1, @@ -879,7 +1015,8 @@ async fn test_summarized_index_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -887,14 +1024,22 @@ async fn test_summarized_index_update() { } "#); - index.update(Some("bones")).await; - index.wait_task(4).await; + let (task, _status_code) = index.update(Some("bones")).await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(4).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 4, + "progress": null, "details": { "primaryKey": "bones" }, @@ -908,30 +1053,39 @@ async fn test_summarized_index_update() { }, "indexUids": { "test": 1 - } + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] async fn test_summarized_index_swap() { let server = Server::new().await; - server + let (task, _status_code) = server .index_swap(json!([ { "indexes": ["doggos", "cattos"] } ])) .await; - server.wait_task(0).await; + server.wait_task(task.uid()).await.failed(); let (batch, _) = server.get_batch(0).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "swaps": [ { @@ -950,53 +1104,56 @@ async fn test_summarized_index_swap() { "types": { "indexSwap": 1 }, - "indexUids": {} + "indexUids": {}, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); server.index("doggos").create(None).await; - server.index("cattos").create(None).await; + let (task, _status_code) = server.index("cattos").create(None).await; server .index_swap(json!([ { "indexes": ["doggos", "cattos"] } ])) .await; - server.wait_task(3).await; - let (batch, _) = server.get_batch(3).await; + server.wait_task(task.uid()).await.succeeded(); + let (batch, _) = server.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => 
"[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { - "uid": 3, - "details": { - "swaps": [ - { - "indexes": [ - "doggos", - "cattos" - ] - } - ] - }, + "uid": 1, + "progress": null, + "details": {}, "stats": { "totalNbTasks": 1, "status": { "succeeded": 1 }, "types": { - "indexSwap": 1 + "indexCreation": 1 }, - "indexUids": {} + "indexUids": { + "doggos": 1 + }, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] @@ -1004,16 +1161,24 @@ async fn test_summarized_batch_cancelation() { let server = Server::new().await; let index = server.index("doggos"); // to avoid being flaky we're only going to cancel an already finished batch :( - index.create(None).await; - index.wait_task(0).await; - server.cancel_tasks("uids=0").await; - index.wait_task(1).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = server.cancel_tasks("uids=0").await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "canceledTasks": 0, @@ -1027,13 +1192,14 @@ async fn test_summarized_batch_cancelation() { "types": { "taskCancelation": 1 }, - "indexUids": {} + "indexUids": {}, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] @@ -1041,16 +1207,24 @@ async fn test_summarized_batch_deletion() { let server = Server::new().await; let index = server.index("doggos"); // to avoid being flaky we're only going to delete an already finished batch :( - index.create(None).await; - index.wait_task(0).await; - server.delete_tasks("uids=0").await; - index.wait_task(1).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = server.delete_tasks("uids=0").await; + index.wait_task(task.uid()).await.succeeded(); let (batch, _) = index.get_batch(1).await; assert_json_snapshot!(batch, - { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 1, + "progress": null, "details": { "matchedTasks": 1, "deletedTasks": 1, @@ -1064,26 +1238,36 @@ async fn test_summarized_batch_deletion() { "types": { "taskDeletion": 1 }, - "indexUids": {} + "indexUids": {}, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } #[actix_web::test] async fn test_summarized_dump_creation() { let server = Server::new().await; - server.create_dump().await; - server.wait_task(0).await; + let (task, _status_code) = 
server.create_dump().await; + server.wait_task(task.uid()).await; let (batch, _) = server.get_batch(0).await; assert_json_snapshot!(batch, - { ".details.dumpUid" => "[dumpUid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, - @r#" + { + ".details.dumpUid" => "[dumpUid]", + ".duration" => "[duration]", + ".enqueuedAt" => "[date]", + ".startedAt" => "[date]", + ".finishedAt" => "[date]", + ".stats.progressTrace" => "[progressTrace]", + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + }, + @r###" { "uid": 0, + "progress": null, "details": { "dumpUid": "[dumpUid]" }, @@ -1095,11 +1279,12 @@ async fn test_summarized_dump_creation() { "types": { "dumpCreation": 1 }, - "indexUids": {} + "indexUids": {}, + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); } diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs index b8c1d2824..09a7d623c 100644 --- a/crates/meilisearch/tests/common/index.rs +++ b/crates/meilisearch/tests/common/index.rs @@ -259,7 +259,7 @@ impl<'a> Index<'a, Owned> { } } -impl<'a> Index<'a, Shared> { +impl Index<'_, Shared> { /// You cannot modify the content of a shared index, thus the delete_document_by_filter call /// must fail. If the task successfully enqueue itself, we'll wait for the task to finishes, /// and if it succeed the function will panic. @@ -411,7 +411,7 @@ impl Index<'_, State> { self.service.get(url).await } - pub async fn get_document_by_filter(&self, payload: Value) -> (Value, StatusCode) { + pub async fn fetch_documents(&self, payload: Value) -> (Value, StatusCode) { let url = format!("/indexes/{}/documents/fetch", urlencode(self.uid.as_ref())); self.service.post(url, payload).await } diff --git a/crates/meilisearch/tests/common/mod.rs b/crates/meilisearch/tests/common/mod.rs index 3aae2fe80..4d57a6163 100644 --- a/crates/meilisearch/tests/common/mod.rs +++ b/crates/meilisearch/tests/common/mod.rs @@ -34,6 +34,10 @@ impl Value { } } + pub fn has_uid(&self) -> bool { + self["uid"].as_u64().is_some() || self["taskUid"].as_u64().is_some() + } + /// Return `true` if the `status` field is set to `succeeded`. /// Panic if the `status` field doesn't exists. #[track_caller] @@ -46,11 +50,30 @@ impl Value { // Panic if the json doesn't contain the `status` field set to "succeeded" #[track_caller] - pub fn succeeded(&self) -> &Self { + pub fn succeeded(&self) -> Self { if !self.is_success() { panic!("Called succeeded on {}", serde_json::to_string_pretty(&self.0).unwrap()); } - self + self.clone() + } + + /// Return `true` if the `status` field is set to `failed`. + /// Panic if the `status` field doesn't exists. 
+ #[track_caller] + pub fn is_fail(&self) -> bool { + if !self["status"].is_string() { + panic!("Called `is_fail` on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self["status"] == serde_json::Value::String(String::from("failed")) + } + + // Panic if the json doesn't contain the `status` field set to "succeeded" + #[track_caller] + pub fn failed(&self) -> Self { + if !self.is_fail() { + panic!("Called failed on {}", serde_json::to_string_pretty(&self.0).unwrap()); + } + self.clone() } } @@ -407,7 +430,7 @@ pub async fn shared_index_with_test_set() -> &'static Index<'static, Shared> { ) .await; assert_eq!(code, 202); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index }) .await diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 49214d646..7e30c5d17 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -88,6 +88,10 @@ impl Server { self.service.api_key = Some(api_key.as_ref().to_string()); } + pub fn clear_api_key(&mut self) { + self.service.api_key = None; + } + /// Fetch and use the default admin key for nexts http requests. pub async fn use_admin_key(&mut self, master_key: impl AsRef) { self.use_api_key(master_key); @@ -121,6 +125,12 @@ impl Server { self.service.post("/indexes", body).await } + pub async fn delete_index(&self, uid: impl AsRef) -> (Value, StatusCode) { + let url = format!("/indexes/{}", urlencoding::encode(uid.as_ref())); + let (value, code) = self.service.delete(url).await; + (value, code) + } + pub fn index_with_encoder(&self, uid: impl AsRef, encoder: Encoder) -> Index<'_> { Index { uid: uid.as_ref().to_string(), @@ -159,10 +169,18 @@ impl Server { self.service.get("/tasks").await } + pub async fn batches(&self) -> (Value, StatusCode) { + self.service.get("/batches").await + } + pub async fn set_features(&self, value: Value) -> (Value, StatusCode) { self.service.patch("/experimental-features", value).await } + pub async fn set_network(&self, value: Value) -> (Value, StatusCode) { + self.service.patch("/network", value).await + } + pub async fn get_metrics(&self) -> (Value, StatusCode) { self.service.get("/metrics").await } @@ -222,6 +240,26 @@ impl Server { (value, code) } + pub async fn list_indexes( + &self, + offset: Option, + limit: Option, + ) -> (Value, StatusCode) { + let (offset, limit) = ( + offset.map(|offset| format!("offset={offset}")), + limit.map(|limit| format!("limit={limit}")), + ); + let query_parameter = offset + .as_ref() + .zip(limit.as_ref()) + .map(|(offset, limit)| format!("{offset}&{limit}")) + .or_else(|| offset.xor(limit)); + if let Some(query_parameter) = query_parameter { + self.service.get(format!("/indexes?{query_parameter}")).await + } else { + self.service.get("/indexes").await + } + } pub async fn update_raw_index_fail( &self, uid: impl AsRef, @@ -361,7 +399,18 @@ impl Server { pub async fn wait_task(&self, update_id: u64) -> Value { // try several times to get status, or panic to not wait forever let url = format!("/tasks/{}", update_id); - for _ in 0..100 { + // Increase timeout for vector-related tests + let max_attempts = if url.contains("/tasks/") { + if update_id > 1000 { + 400 // 200 seconds for vector tests + } else { + 100 // 50 seconds for other tests + } + } else { + 100 // 50 seconds for other tests + }; + + for _ in 0..max_attempts { let (response, status_code) = self.service.get(&url).await; assert_eq!(200, status_code, "response: {}", response); @@ -388,6 
+437,10 @@ impl Server { pub async fn get_features(&self) -> (Value, StatusCode) { self.service.get("/experimental-features").await } + + pub async fn get_network(&self) -> (Value, StatusCode) { + self.service.get("/network").await + } } pub fn default_settings(dir: impl AsRef) -> Opt { diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index eebc5dc63..ad8bae19f 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -980,7 +980,7 @@ async fn add_documents_no_index_creation() { snapshot!(code, @"202 Accepted"); assert_eq!(response["taskUid"], 0); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get_task(0).await; snapshot!(code, @"200 OK"); @@ -1059,9 +1059,9 @@ async fn document_addition_with_primary_key() { } "###); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); - let (response, code) = index.get_task(0).await; + let (response, code) = index.get_task(response.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1169,7 +1169,7 @@ async fn replace_document() { } "###); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -1178,12 +1178,12 @@ async fn replace_document() { } ]); - let (_response, code) = index.add_documents(documents, None).await; + let (task, code) = index.add_documents(documents, None).await; snapshot!(code,@"202 Accepted"); - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); - let (response, code) = index.get_task(1).await; + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1220,9 +1220,89 @@ async fn replace_document() { #[actix_rt::test] async fn add_no_documents() { let server = Server::new().await; - let index = server.index("test"); - let (_response, code) = index.add_documents(json!([]), None).await; + let index = server.index("kefir"); + let (task, code) = index.add_documents(json!([]), None).await; snapshot!(code, @"202 Accepted"); + let task = server.wait_task(task.uid()).await; + let task = task.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 0, + "indexedDocuments": 0 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (task, _code) = index.add_documents(json!([]), Some("kefkef")).await; + let task = server.wait_task(task.uid()).await; + let task = task.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 0, + "indexedDocuments": 0 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (task, _code) = index.add_documents(json!([{ "kefkef": 1 }]), None).await; + let task = 
server.wait_task(task.uid()).await; + let task = task.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + let (documents, _status) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + snapshot!(documents, @r#" + { + "results": [ + { + "kefkef": 1 + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "#); } #[actix_rt::test] @@ -1264,15 +1344,18 @@ async fn error_add_documents_bad_document_id() { let server = Server::new().await; let index = server.index("test"); index.create(Some("docid")).await; + + // unsupported characters + let documents = json!([ { "docid": "foo & bar", "content": "foobar" } ]); - index.add_documents(documents, None).await; - index.wait_task(1).await; - let (response, code) = index.get_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1288,7 +1371,81 @@ async fn error_add_documents_bad_document_id() { "indexedDocuments": 0 }, "error": { - "message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Document identifier `\"foo & bar\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // More than 512 bytes + let documents = json!([ + { + "docid": "a".repeat(600), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await.failed(); + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 2, + "batchUid": 2, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_id" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Exactly 512 bytes + let documents = json!([ + { + "docid": "a".repeat(512), + "content": "foobar" + } + ]); + let (value, _code) = index.add_documents(documents, None).await; + index.wait_task(value.uid()).await.failed(); + let (response, code) = index.get_task(value.uid()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 3, + "batchUid": 3, + "indexUid": "test", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_document_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_id" @@ -1312,9 +1469,9 @@ async fn error_add_documents_missing_document_id() { "content": "foobar" } ]); - index.add_documents(documents, None).await; - index.wait_task(1).await; - let (response, code) = index.get_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1362,7 +1519,7 @@ async fn error_document_field_limit_reached_in_one_document() { let (response, code) = index.update_documents(documents, Some("id")).await; snapshot!(code, @"202 Accepted"); - let response = index.wait_task(response.uid()).await; + let response = index.wait_task(response.uid()).await.failed(); snapshot!(code, @"202 Accepted"); // Documents without a primary key are not accepted. snapshot!(response, @@ -1624,8 +1781,8 @@ async fn add_documents_with_geo_field() { }, ]); - index.add_documents(documents, None).await; - let response = index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + let response = index.wait_task(task.uid()).await; snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" { @@ -1646,6 +1803,275 @@ async fn add_documents_with_geo_field() { "finishedAt": "[date]" } "###); + + let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "results": [ + { + "id": "1" + }, + { + "id": "2", + "_geo": null + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + } + }, + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", + "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); +} + +#[actix_rt::test] +async fn update_documents_with_geo_field() { + let server = Server::new().await; + let index = server.index("doggo"); + index.update_settings(json!({"sortableAttributes": ["_geo"]})).await; + + let documents = json!([ + { + "id": "1", + }, + { + "id": "2", + "_geo": null, + }, + { + "id": "3", + "_geo": { "lat": 1, "lng": 1 }, + }, + { + "id": "4", + "_geo": { "lat": "1", "lng": "1" }, + }, + ]); + + let (task, _status_code) = 
index.add_documents(documents, None).await; + let response = index.wait_task(task.uid()).await; + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 1, + "batchUid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", + "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); + + let updated_documents = json!([{ + "id": "3", + "doggo": "kefir", + }]); + let (task, _status_code) = index.update_documents(updated_documents, None).await; + let response = index.wait_task(task.uid()).await; + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "uid": 2, + "batchUid": 2, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; + + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), + @r###" + { + "results": [ + { + "id": "1" + }, + { + "id": "2", + "_geo": null + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "doggo": "kefir" + }, + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) + .await; + snapshot!(code, @"200 OK"); + // the search response should not have changed: we are expecting docs 4 and 3 first as they have geo + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), + @r###" + { + "hits": [ + { + "id": "4", + "_geo": { + "lat": "1", + "lng": "1" + }, + "_geoDistance": 5522018 + }, + { + "id": "3", + "_geo": { + "lat": 1, + "lng": 1 + }, + "doggo": "kefir", + "_geoDistance": 5522018 + }, + { + "id": "1" + }, + { + "id": "2", + "_geo": null + } + ], + "query": "", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "###); } #[actix_rt::test] @@ -1663,9 +2089,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(2).await; - let (response, code) = index.get_task(2).await; + let (task, _status_code) = 
index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1681,7 +2107,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1701,9 +2127,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(3).await; - let (response, code) = index.get_task(3).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1719,7 +2145,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1739,9 +2165,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(4).await; - let (response, code) = index.get_task(4).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1757,7 +2183,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.", + "message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. 
Was expecting `_geo.lat` and `_geo.lng` fields.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1777,9 +2203,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(5).await; - let (response, code) = index.get_task(5).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1795,7 +2221,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1815,9 +2241,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(6).await; - let (response, code) = index.get_task(6).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1833,7 +2259,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1853,9 +2279,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(7).await; - let (response, code) = index.get_task(7).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1871,7 +2297,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. 
Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1891,9 +2317,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(8).await; - let (response, code) = index.get_task(8).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1909,7 +2335,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1929,9 +2355,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(9).await; - let (response, code) = index.get_task(9).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1947,7 +2373,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -1967,9 +2393,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(10).await; - let (response, code) = index.get_task(10).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -1985,7 +2411,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.", + "message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. 
Was expecting a `_geo.lng` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2005,9 +2431,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(11).await; - let (response, code) = index.get_task(11).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -2023,7 +2449,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", + "message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2043,9 +2469,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(12).await; - let (response, code) = index.get_task(12).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -2061,7 +2487,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. 
Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2081,9 +2507,9 @@ async fn add_documents_invalid_geo_field() { } ]); - index.add_documents(documents, None).await; - index.wait_task(13).await; - let (response, code) = index.get_task(13).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" @@ -2099,7 +2525,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", + "message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2123,7 +2549,7 @@ async fn add_documents_invalid_geo_field() { let (response, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - let response = index.wait_task(response.uid()).await; + let response = index.wait_task(response.uid()).await.failed(); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" { @@ -2138,7 +2564,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2160,7 +2586,7 @@ async fn add_documents_invalid_geo_field() { let (response, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - let response = index.wait_task(response.uid()).await; + let response = index.wait_task(response.uid()).await.failed(); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" { @@ -2175,7 +2601,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"12\"`. 
Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2197,7 +2623,7 @@ async fn add_documents_invalid_geo_field() { let (response, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - let response = index.wait_task(response.uid()).await; + let response = index.wait_task(response.uid()).await.failed(); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @r###" { @@ -2212,7 +2638,7 @@ async fn add_documents_invalid_geo_field() { "indexedDocuments": 0 }, "error": { - "message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", + "message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2241,7 +2667,7 @@ async fn add_invalid_geo_and_then_settings() { ]); let (ret, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - let ret = index.wait_task(ret.uid()).await; + let ret = index.wait_task(ret.uid()).await.succeeded(); snapshot!(ret, @r###" { "uid": "[uid]", @@ -2264,7 +2690,7 @@ async fn add_invalid_geo_and_then_settings() { let (ret, code) = index.update_settings(json!({ "sortableAttributes": ["_geo"] })).await; snapshot!(code, @"202 Accepted"); - let ret = index.wait_task(ret.uid()).await; + let ret = index.wait_task(ret.uid()).await.failed(); snapshot!(ret, @r###" { "uid": "[uid]", @@ -2279,7 +2705,7 @@ async fn add_invalid_geo_and_then_settings() { ] }, "error": { - "message": "Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.", + "message": "Index `test`: Could not parse latitude in the document with the id: `\"11\"`. 
Was expecting a finite number but instead got `null`.", "code": "invalid_document_geo_field", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_geo_field" @@ -2331,9 +2757,9 @@ async fn error_primary_key_inference() { } ]); - index.add_documents(documents, None).await; - index.wait_task(0).await; - let (response, code) = index.get_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; assert_eq!(code, 200); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @@ -2372,9 +2798,9 @@ async fn error_primary_key_inference() { } ]); - index.add_documents(documents, None).await; - index.wait_task(1).await; - let (response, code) = index.get_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.failed(); + let (response, code) = index.get_task(task.uid()).await; assert_eq!(code, 200); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @@ -2411,9 +2837,9 @@ async fn error_primary_key_inference() { } ]); - index.add_documents(documents, None).await; - index.wait_task(2).await; - let (response, code) = index.get_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (response, code) = index.get_task(task.uid()).await; assert_eq!(code, 200); snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @@ -2450,14 +2876,14 @@ async fn add_documents_with_primary_key_twice() { } ]); - index.add_documents(documents.clone(), Some("title")).await; - index.wait_task(0).await; - let (response, _code) = index.get_task(0).await; + let (task, _status_code) = index.add_documents(documents.clone(), Some("title")).await; + index.wait_task(task.uid()).await.succeeded(); + let (response, _code) = index.get_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); - index.add_documents(documents, Some("title")).await; - index.wait_task(1).await; - let (response, _code) = index.get_task(1).await; + let (task, _status_code) = index.add_documents(documents, Some("title")).await; + index.wait_task(task.uid()).await.succeeded(); + let (response, _code) = index.get_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); } diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index c50823183..4dfe2cc79 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -7,10 +7,10 @@ use crate::json; async fn delete_one_document_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - let (_response, code) = index.delete_document(0).await; + let (task, code) = index.delete_document(0).await; assert_eq!(code, 202); - let response = index.wait_task(0).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); } @@ -22,7 +22,7 @@ async fn delete_one_unexisting_document() { index.create(None).await; let (response, code) = index.delete_document(0).await; assert_eq!(code, 202, "{}", response); - let 
update = index.wait_task(0).await; + let update = index.wait_task(response.uid()).await; assert_eq!(update["status"], "succeeded"); } @@ -30,11 +30,12 @@ async fn delete_one_unexisting_document() { async fn delete_one_document() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(json!([{ "id": 0, "content": "foobar" }]), None).await; - index.wait_task(0).await; - let (_response, code) = server.index("test").delete_document(0).await; - assert_eq!(code, 202); - index.wait_task(1).await; + let (task, _status_code) = + index.add_documents(json!([{ "id": 0, "content": "foobar" }]), None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, status_code) = server.index("test").delete_document(0).await; + assert_eq!(status_code, 202); + index.wait_task(task.uid()).await.succeeded(); let (_response, code) = index.get_document(0, None).await; assert_eq!(code, 404); @@ -44,10 +45,10 @@ async fn delete_one_document() { async fn clear_all_documents_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - let (_response, code) = index.clear_all_documents().await; + let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - let response = index.wait_task(0).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); } @@ -56,17 +57,17 @@ async fn clear_all_documents_unexisting_index() { async fn clear_all_documents() { let server = Server::new().await; let index = server.index("test"); - index + let (task, _status_code) = index .add_documents( json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }]), None, ) .await; - index.wait_task(0).await; - let (_response, code) = index.clear_all_documents().await; + index.wait_task(task.uid()).await.succeeded(); + let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - let _update = index.wait_task(1).await; + let _update = index.wait_task(task.uid()).await; let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert!(response["results"].as_array().unwrap().is_empty()); @@ -76,12 +77,12 @@ async fn clear_all_documents() { async fn clear_all_documents_empty_index() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - - let (_response, code) = index.clear_all_documents().await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, code) = index.clear_all_documents().await; assert_eq!(code, 202); - let _update = index.wait_task(0).await; + let _update = index.wait_task(task.uid()).await; let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert!(response["results"].as_array().unwrap().is_empty()); @@ -91,7 +92,7 @@ async fn clear_all_documents_empty_index() { async fn error_delete_batch_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = index.delete_batch(vec![]).await; + let (task, code) = index.delete_batch(vec![]).await; let expected_response = json!({ "message": "Index `test` not found.", "code": "index_not_found", @@ -100,7 +101,7 @@ async fn error_delete_batch_unexisting_index() { }); assert_eq!(code, 202); - let response = index.wait_task(0).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); assert_eq!(response["error"], expected_response); 
@@ -110,12 +111,12 @@ async fn error_delete_batch_unexisting_index() { async fn delete_batch() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, "content": "foobar" }]), Some("id")).await; - index.wait_task(0).await; - let (_response, code) = index.delete_batch(vec![1, 0]).await; + let (task,_status_code) = index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, "content": "foobar" }]), Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, code) = index.delete_batch(vec![1, 0]).await; assert_eq!(code, 202); - let _update = index.wait_task(1).await; + let _update = index.wait_task(task.uid()).await; let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 1); @@ -126,12 +127,12 @@ async fn delete_batch() { async fn delete_no_document_batch() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, "content": "foobar" }]), Some("id")).await; - index.wait_task(0).await; + let (task,_status_code) = index.add_documents(json!([{ "id": 1, "content": "foobar" }, { "id": 0, "content": "foobar" }, { "id": 3, "content": "foobar" }]), Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); let (_response, code) = index.delete_batch(vec![]).await; assert_eq!(code, 202, "{}", _response); - let _update = index.wait_task(1).await; + let _update = index.wait_task(_response.uid()).await; let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 3); @@ -142,7 +143,7 @@ async fn delete_document_by_filter() { let server = Server::new().await; let index = server.index("doggo"); index.update_settings_filterable_attributes(json!(["color"])).await; - index + let (task, _status_code) = index .add_documents( json!([ { "id": 0, "color": "red" }, @@ -153,13 +154,17 @@ async fn delete_document_by_filter() { Some("id"), ) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); let (stats, _) = index.stats().await; snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 4, + "rawDocumentDbSize": 42, + "avgDocumentSize": 10, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 3, "id": 4 @@ -180,7 +185,7 @@ async fn delete_document_by_filter() { } "###); - let response = index.wait_task(2).await; + let response = index.wait_task(response.uid()).await; snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": 2, @@ -206,7 +211,11 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 16, + "avgDocumentSize": 8, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 2 @@ -246,7 +255,7 @@ async fn delete_document_by_filter() { } "###); - let response = index.wait_task(3).await; + let response = index.wait_task(response.uid()).await; snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => 
"[date]", ".duration" => "[duration]" }), @r###" { "uid": 3, @@ -272,7 +281,11 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 12, + "avgDocumentSize": 12, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 1 @@ -302,7 +315,7 @@ async fn delete_document_by_complex_filter() { let server = Server::new().await; let index = server.index("doggo"); index.update_settings_filterable_attributes(json!(["color"])).await; - index + let (task, _status_code) = index .add_documents( json!([ { "id": 0, "color": "red" }, @@ -314,7 +327,7 @@ async fn delete_document_by_complex_filter() { Some("id"), ) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index .delete_document_by_filter( json!({ "filter": ["color != red", "color != green", "color EXISTS"] }), @@ -331,7 +344,7 @@ async fn delete_document_by_complex_filter() { } "###); - let response = index.wait_task(2).await; + let response = index.wait_task(response.uid()).await; snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": 2, @@ -390,7 +403,7 @@ async fn delete_document_by_complex_filter() { } "###); - let response = index.wait_task(3).await; + let response = index.wait_task(response.uid()).await; snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###" { "uid": 3, diff --git a/crates/meilisearch/tests/documents/errors.rs b/crates/meilisearch/tests/documents/errors.rs index c90b9ed49..afca9498b 100644 --- a/crates/meilisearch/tests/documents/errors.rs +++ b/crates/meilisearch/tests/documents/errors.rs @@ -604,7 +604,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `EMPTY_INDEX`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -636,7 +636,7 @@ async fn delete_document_by_filter() { "originalFilter": "\"catto = jorts\"" }, "error": { - "message": "Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts", + "message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. 
Available filterable attribute patterns are: `id`, `title`.\n1:6 catto = jorts", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -667,7 +667,7 @@ async fn fetch_document_by_filter() { .await; index.wait_task(task.uid()).await.succeeded(); - let (response, code) = index.get_document_by_filter(json!(null)).await; + let (response, code) = index.fetch_documents(json!(null)).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -678,7 +678,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "offset": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "offset": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -689,7 +689,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "limit": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "limit": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -700,7 +700,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "fields": "doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "fields": "doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -711,7 +711,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "filter": true })).await; + let (response, code) = index.fetch_documents(json!({ "filter": true })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -722,7 +722,7 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = index.get_document_by_filter(json!({ "filter": "cool doggo" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "cool doggo" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { @@ -733,12 +733,11 @@ async fn fetch_document_by_filter() { } "###); - let (response, code) = - index.get_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "doggo = bernese" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Attribute `doggo` is not filterable. Available filterable attributes are: `color`.\n1:6 doggo = bernese", + "message": "Attribute `doggo` is not filterable. Available filterable attribute patterns are: `color`.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -760,19 +759,9 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); - let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; - snapshot!(response, @r###" - { - "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); // FETCH ALL DOCUMENTS BY POST - let (response, _code) = - index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + let (response, _code) = index.fetch_documents(json!({ "retrieveVectors": "tamo" })).await; snapshot!(response, @r###" { "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", @@ -781,15 +770,6 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); - let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; - snapshot!(response, @r###" - { - "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); // GET A SINGLE DOCUMENT let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; @@ -801,13 +781,4 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); - let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; - snapshot!(response, @r###" - { - "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); } diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index d66d36c56..f87a18b9f 100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -371,7 +371,7 @@ async fn get_document_by_filter() { .await; index.wait_task(task.uid()).await.succeeded(); - let (response, code) = index.get_document_by_filter(json!({})).await; + let (response, code) = index.fetch_documents(json!({})).await; let (response2, code2) = index.get_all_documents_raw("").await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" @@ -401,7 +401,7 @@ async fn get_document_by_filter() { assert_eq!(code, code2); assert_eq!(response, response2); - let (response, code) = index.get_document_by_filter(json!({ "filter": "color = blue" })).await; + let (response, code) = index.fetch_documents(json!({ "filter": "color = blue" })).await; let (response2, code2) = index.get_all_documents_raw("?filter=color=blue").await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" @@ -424,9 +424,8 @@ async fn get_document_by_filter() { assert_eq!(code, code2); assert_eq!(response, response2); - let (response, code) = index - .get_document_by_filter(json!({ "offset": 1, "limit": 1, "filter": "color != blue" })) - .await; + let (response, code) = + index.fetch_documents(json!({ "offset": 1, "limit": 1, "filter": "color != blue" })).await; let (response2, code2) = index.get_all_documents_raw("?filter=color!=blue&offset=1&limit=1").await; snapshot!(code, @"200 OK"); @@ -446,9 +445,7 @@ async fn 
get_document_by_filter() { assert_eq!(response, response2); let (response, code) = index - .get_document_by_filter( - json!({ "limit": 1, "filter": "color != blue", "fields": ["color"] }), - ) + .fetch_documents(json!({ "limit": 1, "filter": "color != blue", "fields": ["color"] })) .await; let (response2, code2) = index.get_all_documents_raw("?limit=1&filter=color!=blue&fields=color").await; @@ -471,7 +468,7 @@ async fn get_document_by_filter() { // Now testing more complex filter that the get route can't represent let (response, code) = - index.get_document_by_filter(json!({ "filter": [["color = blue", "color = red"]] })).await; + index.fetch_documents(json!({ "filter": [["color = blue", "color = red"]] })).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { @@ -495,9 +492,8 @@ async fn get_document_by_filter() { } "###); - let (response, code) = index - .get_document_by_filter(json!({ "filter": [["color != blue"], "color EXISTS"] })) - .await; + let (response, code) = + index.fetch_documents(json!({ "filter": [["color != blue"], "color EXISTS"] })).await; snapshot!(code, @"200 OK"); snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" { @@ -514,21 +510,330 @@ async fn get_document_by_filter() { "###); } +#[actix_rt::test] +async fn get_document_by_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index + .fetch_documents(json!({ + "ids": ["0", 1, 2, 3] + })) + .await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,1,2,3").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 1, + "color": "blue" + }, + { + "id": 2, + "color": "blue" + }, + { + "id": 3 + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index.fetch_documents(json!({ "ids": [2, "1"] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=2,1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 1, + "color": "blue" + }, + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = + index.fetch_documents(json!({ "offset": 1, "limit": 1, "ids": ["0", 0, 3] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=3,0&offset=1&limit=1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 3 + } + ], + "offset": 1, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = + index.fetch_documents(json!({ "limit": 1, "ids": [0, 3], "fields": ["color"] })).await; + let (response2, code2) = index.get_all_documents_raw("?limit=1&ids=0,3&fields=color").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "color": "red" + } + ], + 
"offset": 0, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + // Now testing more complex requests that the get route can't represent + + let (response, code) = index.fetch_documents(json!({ "ids": [] })).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); +} + +#[actix_rt::test] +async fn get_document_invalid_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index.fetch_documents(json!({"ids": ["0", "illegal/docid"] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,illegal/docid").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "message": "In `.ids[1]`: Document identifier `\"illegal/docid\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", + "code": "invalid_document_ids", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_ids" + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); +} + +#[actix_rt::test] +async fn get_document_not_found_ids() { + let server = Server::new_shared(); + let index = server.unique_index(); + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = index.fetch_documents(json!({"ids": ["0", 3, 42] })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=0,3,42").await; + // the document with id 42 is not in the results since it doesn't exist + // however, no error is raised + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 3 + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); +} + +#[actix_rt::test] +async fn get_document_by_ids_and_filter() { + let server = Server::new_shared(); + let index = server.unique_index(); + index.update_settings_filterable_attributes(json!(["color"])).await; + let (task, _code) = index + .add_documents( + json!([ + { "id": 0, "color": "red" }, + { "id": 1, "color": "blue" }, + { "id": 2, "color": "blue" }, + { "id": 3 }, + ]), + Some("id"), + ) + .await; + index.wait_task(task.uid()).await.succeeded(); + + let (response, code) = + index.fetch_documents(json!({"ids": [2], "filter": "color = blue" })).await; + let (response2, code2) = index.get_all_documents_raw("?ids=2&filter=color=blue").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index + 
.fetch_documents( + json!({ "offset": 1, "limit": 1, "ids": [0, 1, 2, 3], "filter": "color != blue" }), + ) + .await; + let (response2, code2) = + index.get_all_documents_raw("?ids=0,1,2,3&filter=color!=blue&offset=1&limit=1").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 3 + } + ], + "offset": 1, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + let (response, code) = index + .fetch_documents(json!({ "limit": 1, "ids": [0, 1, 2,3], "filter": "color != blue", "fields": ["color"] })) + .await; + let (response2, code2) = + index.get_all_documents_raw("?ids=0,1,2,3&limit=1&filter=color!=blue&fields=color").await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "color": "red" + } + ], + "offset": 0, + "limit": 1, + "total": 2 + } + "###); + assert_eq!(code, code2); + assert_eq!(response, response2); + + // Now testing more complex filter that the get route can't represent + + let (response, code) = index + .fetch_documents(json!({ "ids": [0, "2"], "filter": [["color = blue", "color = red"]] })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [ + { + "id": 0, + "color": "red" + }, + { + "id": 2, + "color": "blue" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (response, code) = index + .fetch_documents(json!({ "filter": [["color != blue"], "color EXISTS"], "ids": [1, 2, 3] })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); +} + #[actix_rt::test] async fn get_document_with_vectors() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs index c0703e81b..aaf529ce5 100644 --- a/crates/meilisearch/tests/documents/update_documents.rs +++ b/crates/meilisearch/tests/documents/update_documents.rs @@ -172,7 +172,7 @@ async fn error_update_documents_bad_document_id() { assert_eq!( response["error"]["message"], json!( - r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes."# + r#"Document identifier `"foo & bar"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes."# ) ); assert_eq!(response["error"]["code"], json!("invalid_document_id")); diff --git a/crates/meilisearch/tests/dumps/data.rs b/crates/meilisearch/tests/dumps/data.rs index d353aaf1d..cb46aa41f 100644 --- a/crates/meilisearch/tests/dumps/data.rs +++ b/crates/meilisearch/tests/dumps/data.rs @@ -22,6 +22,7 @@ pub enum GetDump { TestV5, TestV6WithExperimental, + TestV6WithBatchesAndEnqueuedTasks, } impl GetDump { @@ -74,6 +75,10 @@ impl GetDump { "tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump" ) .into(), + GetDump::TestV6WithBatchesAndEnqueuedTasks => { + exist_relative_path!("tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump") + .into() + } } } } diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index c7d157b00..ff0b027cb 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -27,9 +27,26 @@ async fn import_dump_v1_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -78,8 +95,11 @@ async fn import_dump_v1_movie_raw() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -169,7 +189,11 @@ async fn import_dump_v1_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "genres": 53, "id": 53, @@ -241,8 +265,11 @@ async fn import_dump_v1_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -327,9 +354,26 @@ async fn import_dump_v1_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -390,8 +434,11 @@ async fn import_dump_v1_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - 
"localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -474,9 +521,26 @@ async fn import_dump_v2_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -525,8 +589,11 @@ async fn import_dump_v2_movie_raw() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -611,9 +678,26 @@ async fn import_dump_v2_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -672,8 +756,11 @@ async fn import_dump_v2_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -758,9 +845,26 @@ async fn import_dump_v2_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -818,8 +922,11 @@ async fn import_dump_v2_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -902,9 +1009,26 @@ async fn import_dump_v3_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 
53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -953,8 +1077,11 @@ async fn import_dump_v3_movie_raw() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1039,9 +1166,26 @@ async fn import_dump_v3_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1100,8 +1244,11 @@ async fn import_dump_v3_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1186,9 +1333,26 @@ async fn import_dump_v3_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1246,8 +1410,11 @@ async fn import_dump_v3_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1330,9 +1497,26 @@ async fn import_dump_v4_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1381,8 +1565,11 @@ async fn import_dump_v4_movie_raw() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + 
"localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1467,9 +1654,26 @@ async fn import_dump_v4_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "avgDocumentSize": 414, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1528,8 +1732,11 @@ async fn import_dump_v4_movie_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1614,9 +1821,26 @@ async fn import_dump_v4_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "rawDocumentDbSize": 8606, + "avgDocumentSize": 162, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1674,8 +1898,11 @@ async fn import_dump_v4_rubygems_with_settings() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "### ); @@ -1762,33 +1989,37 @@ async fn import_dump_v5() { server.wait_task(task["uid"].as_u64().unwrap()).await; } - let expected_stats = json!({ - "numberOfDocuments": 10, - "isIndexing": false, - "fieldDistribution": { - "cast": 10, - "director": 10, - "genres": 10, - "id": 10, - "overview": 10, - "popularity": 10, - "poster_path": 10, - "producer": 10, - "production_companies": 10, - "release_date": 10, - "tagline": 10, - "title": 10, - "vote_average": 10, - "vote_count": 10 - } - }); - let index1 = server.index("test"); let index2 = server.index("test2"); let (stats, code) = index1.stats().await; snapshot!(code, @"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (docs, code) = index2.get_all_documents(GetAllDocumentsOptions::default()).await; snapshot!(code, @"200 OK"); @@ -1799,7 +2030,34 @@ async fn import_dump_v5() { let (stats, code) = index2.stats().await; snapshot!(code, 
@"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 10, + "rawDocumentDbSize": 6782, + "avgDocumentSize": 678, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (keys, code) = server.list_api_keys("").await; snapshot!(code, @"200 OK"); @@ -1869,11 +2127,13 @@ async fn import_dump_v6_containing_experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": false, "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -1921,8 +2181,11 @@ async fn import_dump_v6_containing_experimental_features() { "pagination": { "maxTotalHits": 1000 }, + "embedders": {}, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); @@ -1954,6 +2217,70 @@ async fn import_dump_v6_containing_experimental_features() { .await; } +#[actix_rt::test] +async fn import_dump_v6_containing_batches_and_enqueued_tasks() { + let temp = tempfile::tempdir().unwrap(); + + let options = Opt { + import_dump: Some(GetDump::TestV6WithBatchesAndEnqueuedTasks.path()), + ..default_settings(temp.path()) + }; + let mut server = Server::new_auth_with_options(options, temp).await; + server.use_api_key("MASTER_KEY"); + server.wait_task(2).await.succeeded(); + let (tasks, _) = server.tasks().await; + snapshot!(json_string!(tasks, { ".results[1].startedAt" => "[date]", ".results[1].finishedAt" => "[date]", ".results[1].duration" => "[date]" }), name: "tasks"); + let (batches, _) = server.batches().await; + snapshot!(json_string!(batches, { + ".results[0].startedAt" => "[date]", + ".results[0].finishedAt" => "[date]", + ".results[0].duration" => "[date]", + ".results[0].stats.progressTrace" => "[progressTrace]", + ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]", + }), name: "batches"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + assert_eq!(indexes["results"].as_array().unwrap().len(), 1); + assert_eq!(indexes["results"][0]["uid"], json!("kefir")); + assert_eq!(indexes["results"][0]["primaryKey"], json!("id")); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false + } + "###); + + let index = server.index("kefir"); + let (documents, _) = index.get_all_documents_raw("").await; + snapshot!(documents, @r#" + { + "results": [ + { + "id": 1, + "dog": "kefir" + }, + { + "id": 2, + "dog": "intel" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "#); +} + // In this test we must generate the dump ourselves to ensure the // `user provided` vectors are well set #[actix_rt::test] @@ -1962,16 +2289,7 @@ async fn 
generate_and_import_dump_containing_vectors() { let temp = tempfile::tempdir().unwrap(); let mut opt = default_settings(temp.path()); let server = Server::new_with_options(opt.clone()).await.unwrap(); - let (code, _) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); + let index = server.index("pets"); let (response, code) = index .update_settings(json!( @@ -2037,11 +2355,13 @@ async fn generate_and_import_dump_containing_vectors() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": true, "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -2097,12 +2417,15 @@ async fn generate_and_import_dump_containing_vectors() { "source": "huggingFace", "model": "sentence-transformers/all-MiniLM-L6-v2", "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "useModel", "documentTemplate": "{{doc.doggo}}", "documentTemplateMaxBytes": 400 } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } "###); diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap new file mode 100644 index 000000000..b38340ef6 --- /dev/null +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -0,0 +1,79 @@ +--- +source: crates/meilisearch/tests/dumps/mod.rs +--- +{ + "results": [ + { + "uid": 2, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + }, + "progressTrace": "[progressTrace]", + "writeChannelCongestion": "[writeChannelCongestion]" + }, + "duration": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 1, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.144827890S", + "startedAt": "2025-02-04T10:15:21.275640274Z", + "finishedAt": "2025-02-04T10:15:21.420468164Z" + }, + { + "uid": 0, + "progress": null, + "details": {}, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexCreation": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.032902186S", + "startedAt": "2025-02-04T10:14:43.559526162Z", + "finishedAt": "2025-02-04T10:14:43.592428348Z" + } + ], + "total": 3, + "limit": 20, + "from": 2, + "next": null +} diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap new file mode 100644 index 000000000..99dc06f24 --- /dev/null +++ 
b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/tasks.snap @@ -0,0 +1,78 @@ +--- +source: crates/meilisearch/tests/dumps/mod.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 3, + "batchUid": null, + "indexUid": null, + "status": "succeeded", + "type": "dumpCreation", + "canceledBy": null, + "details": { + "dumpUid": null + }, + "error": null, + "duration": "PT0.000629059S", + "enqueuedAt": "2025-02-04T10:22:31.318175268Z", + "startedAt": "2025-02-04T10:22:31.331701375Z", + "finishedAt": "2025-02-04T10:22:31.332330434Z" + }, + { + "uid": 2, + "batchUid": 2, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[date]", + "enqueuedAt": "2025-02-04T10:15:49.212484063Z", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 1, + "batchUid": null, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.144827890S", + "enqueuedAt": "2025-02-04T10:15:21.258630973Z", + "startedAt": "2025-02-04T10:15:21.275640274Z", + "finishedAt": "2025-02-04T10:15:21.420468164Z" + }, + { + "uid": 0, + "batchUid": null, + "indexUid": "kefir", + "status": "succeeded", + "type": "indexCreation", + "canceledBy": null, + "details": { + "primaryKey": null + }, + "error": null, + "duration": "PT0.032902186S", + "enqueuedAt": "2025-02-04T10:14:43.550379968Z", + "startedAt": "2025-02-04T10:14:43.559526162Z", + "finishedAt": "2025-02-04T10:14:43.592428348Z" + } + ], + "total": 4, + "limit": 20, + "from": 3, + "next": null +} diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index 2473a345a..34cd40e38 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -7,7 +7,7 @@ use crate::json; /// Feature name to test against. /// This will have to be changed by a different one when that feature is stabilized. /// All tests that need to set a feature can make use of this constant. 
-const FEATURE_NAME: &str = "vectorStore"; +const FEATURE_NAME: &str = "metrics"; #[actix_rt::test] async fn experimental_features() { @@ -18,11 +18,13 @@ async fn experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": false, "metrics": false, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -31,11 +33,13 @@ async fn experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": true, - "metrics": false, + "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -44,11 +48,13 @@ async fn experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": true, - "metrics": false, + "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -58,11 +64,13 @@ async fn experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": true, - "metrics": false, + "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -72,11 +80,13 @@ async fn experimental_features() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": true, - "metrics": false, + "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); } @@ -93,11 +103,13 @@ async fn experimental_feature_metrics() { meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "vectorStore": false, "metrics": true, "logsRoute": false, "editDocumentsByFunction": false, - "containsFilter": false + "containsFilter": false, + "network": false, + "getTaskDocumentsRoute": false, + "compositeEmbedders": false } "###); @@ -132,14 +144,6 @@ async fn experimental_feature_metrics() { let (response, code) = server.get_metrics().await; meili_snap::snapshot!(code, @"200 OK"); meili_snap::snapshot!(response, @"null"); - - // startup without flag respects persisted metrics value - let disable_metrics = - Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) }; - let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap(); - let (response, code) = server_no_flag.get_metrics().await; - meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(response, @"null"); } #[actix_rt::test] @@ -152,7 +156,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`", 
+ "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" @@ -165,7 +169,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Invalid value type at `.vectorStore`: expected a boolean, but found a positive integer: `42`", + "message": "Invalid value type at `.metrics`: expected a boolean, but found a positive integer: `42`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" @@ -178,7 +182,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Invalid value type at `.vectorStore`: expected a boolean, but found a string: `\"true\"`", + "message": "Invalid value type at `.metrics`: expected a boolean, but found a string: `\"true\"`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/crates/meilisearch/tests/index/create_index.rs b/crates/meilisearch/tests/index/create_index.rs index 9b9fbd039..e8efd14e2 100644 --- a/crates/meilisearch/tests/index/create_index.rs +++ b/crates/meilisearch/tests/index/create_index.rs @@ -115,7 +115,7 @@ async fn create_index_with_primary_key() { assert_eq!(response["status"], "enqueued"); - let response = index.wait_task(response.uid()).await; + let response = index.wait_task(response.uid()).await.succeeded(); assert_eq!(response["status"], "succeeded"); assert_eq!(response["type"], "indexCreation"); @@ -130,8 +130,7 @@ async fn create_index_with_invalid_primary_key() { let index = server.unique_index(); let (response, code) = index.add_documents(documents, Some("title")).await; assert_eq!(code, 202); - - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.failed(); let (response, code) = index.get().await; assert_eq!(code, 200); @@ -141,8 +140,7 @@ async fn create_index_with_invalid_primary_key() { let (response, code) = index.add_documents(documents, Some("id")).await; assert_eq!(code, 202); - - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.failed(); let (response, code) = index.get().await; assert_eq!(code, 200); diff --git a/crates/meilisearch/tests/index/get_index.rs b/crates/meilisearch/tests/index/get_index.rs index ce08251be..a436b649b 100644 --- a/crates/meilisearch/tests/index/get_index.rs +++ b/crates/meilisearch/tests/index/get_index.rs @@ -1,22 +1,22 @@ +use crate::json; use meili_snap::{json_string, snapshot}; -use serde_json::{json, Value}; +use serde_json::Value; -use crate::common::Server; +use crate::common::{shared_does_not_exists_index, Server}; #[actix_rt::test] async fn create_and_get_index() { - let server = Server::new().await; - let index = server.index("test"); - let (_, code) = index.create(None).await; + let server = Server::new_shared(); + let index = server.unique_index(); + let (response, code) = index.create(None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.get().await; assert_eq!(code, 200); - assert_eq!(response["uid"], "test"); assert!(response.get("createdAt").is_some()); assert!(response.get("updatedAt").is_some()); 
assert_eq!(response["createdAt"], response["updatedAt"]); @@ -26,13 +26,12 @@ async fn create_and_get_index() { #[actix_rt::test] async fn error_get_unexisting_index() { - let server = Server::new().await; - let index = server.index("test"); + let index = shared_does_not_exists_index().await; let (response, code) = index.get().await; let expected_response = json!({ - "message": "Index `test` not found.", + "message": "Index `DOES_NOT_EXISTS` not found.", "code": "index_not_found", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#index_not_found" @@ -55,9 +54,9 @@ async fn no_index_return_empty_list() { async fn list_multiple_indexes() { let server = Server::new().await; server.index("test").create(None).await; - server.index("test1").create(Some("key")).await; + let (task, _status_code) = server.index("test1").create(Some("key")).await; - server.index("test").wait_task(1).await; + server.index("test").wait_task(task.uid()).await.succeeded(); let (response, code) = server.list_indexes(None, None).await; assert_eq!(code, 200); @@ -73,8 +72,8 @@ async fn get_and_paginate_indexes() { let server = Server::new().await; const NB_INDEXES: usize = 50; for i in 0..NB_INDEXES { - server.index(&format!("test_{i:02}")).create(None).await; - server.index(&format!("test_{i:02}")).wait_task(i as u64).await; + server.index(format!("test_{i:02}")).create(None).await; + server.index(format!("test_{i:02}")).wait_task(i as u64).await; } // basic @@ -179,14 +178,14 @@ async fn get_and_paginate_indexes() { #[actix_rt::test] async fn get_invalid_index_uid() { - let server = Server::new().await; - let index = server.index("this is not a valid index name"); - let (response, code) = index.get().await; + let server = Server::new_shared(); + let (response, code) = + server.create_index_fail(json!({ "uid": "this is not a valid index name" })).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "`this is not a valid index name` is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.uid`: `this is not a valid index name` is not a valid index uid. 
Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes.", "code": "invalid_index_uid", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_index_uid" diff --git a/crates/meilisearch/tests/index/stats.rs b/crates/meilisearch/tests/index/stats.rs index d0b0a56b9..291cb0ce0 100644 --- a/crates/meilisearch/tests/index/stats.rs +++ b/crates/meilisearch/tests/index/stats.rs @@ -5,11 +5,11 @@ use crate::json; async fn stats() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = index.create(Some("id")).await; + let (task, code) = index.create(Some("id")).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.stats().await; @@ -33,7 +33,7 @@ async fn stats() { assert_eq!(code, 202); assert_eq!(response["taskUid"], 1); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.stats().await; diff --git a/crates/meilisearch/tests/index/update_index.rs b/crates/meilisearch/tests/index/update_index.rs index 36ec27306..a9b02e7d4 100644 --- a/crates/meilisearch/tests/index/update_index.rs +++ b/crates/meilisearch/tests/index/update_index.rs @@ -13,9 +13,9 @@ async fn update_primary_key() { assert_eq!(code, 202); - index.update(Some("primary")).await; + let (task, _status_code) = index.update(Some("primary")).await; - let response = index.wait_task(1).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); @@ -46,9 +46,9 @@ async fn create_and_update_with_different_encoding() { assert_eq!(code, 202); let index = server.index_with_encoder("test", Encoder::Brotli); - index.update(Some("primary")).await; + let (task, _status_code) = index.update(Some("primary")).await; - let response = index.wait_task(1).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); } @@ -57,17 +57,17 @@ async fn create_and_update_with_different_encoding() { async fn update_nothing() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = index.create(None).await; + let (task1, code) = index.create(None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(task1.uid()).await.succeeded(); - let (_, code) = index.update(None).await; + let (task2, code) = index.update(None).await; assert_eq!(code, 202); - let response = index.wait_task(1).await; + let response = index.wait_task(task2.uid()).await; assert_eq!(response["status"], "succeeded"); } @@ -88,14 +88,14 @@ async fn error_update_existing_primary_key() { ]); index.add_documents(documents, None).await; - let (_, code) = index.update(Some("primary")).await; + let (task, code) = index.update(Some("primary")).await; assert_eq!(code, 202); - let response = index.wait_task(2).await; + let response = index.wait_task(task.uid()).await; let expected_response = json!({ - "message": "Index already has a primary key: `id`.", + "message": "Index `test`: Index already has a primary key: `id`.", "code": "index_primary_key_already_exists", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#index_primary_key_already_exists" @@ -107,11 +107,11 @@ async fn error_update_existing_primary_key() { #[actix_rt::test] async fn error_update_unexisting_index() { let server = Server::new().await; - let (_, code) = server.index("test").update(None).await; 
+    let (task, code) = server.index("test").update(None).await;
     assert_eq!(code, 202);
-    let response = server.index("test").wait_task(0).await;
+    let response = server.index("test").wait_task(task.uid()).await;
     let expected_response = json!({
         "message": "Index `test` not found.",
diff --git a/crates/meilisearch/tests/integration.rs b/crates/meilisearch/tests/integration.rs
index 85deb9cdf..927eb4617 100644
--- a/crates/meilisearch/tests/integration.rs
+++ b/crates/meilisearch/tests/integration.rs
@@ -7,6 +7,7 @@ mod dumps;
 mod features;
 mod index;
 mod logs;
+mod network;
 mod search;
 mod settings;
 mod similar;
@@ -14,6 +15,7 @@ mod snapshot;
 mod stats;
 mod swap_indexes;
 mod tasks;
+mod upgrade;
 mod vector;
 // Tests are isolated by features in different modules to allow better readability, test
diff --git a/crates/meilisearch/tests/logs/mod.rs b/crates/meilisearch/tests/logs/mod.rs
index 26482b561..e4dc50a9c 100644
--- a/crates/meilisearch/tests/logs/mod.rs
+++ b/crates/meilisearch/tests/logs/mod.rs
@@ -94,7 +94,7 @@ async fn basic_test_log_stream_route() {
       "enqueuedAt": "[date]"
     }
     "###);
-    server.wait_task(ret.uid()).await;
+    server.wait_task(ret.uid()).await.succeeded();
     let req = actix_web::test::TestRequest::delete().uri("/logs/stream");
     let req = req.to_request();
diff --git a/crates/meilisearch/tests/network/mod.rs b/crates/meilisearch/tests/network/mod.rs
new file mode 100644
index 000000000..1c3661a06
--- /dev/null
+++ b/crates/meilisearch/tests/network/mod.rs
@@ -0,0 +1,606 @@
+use serde_json::Value::Null;
+
+use crate::common::Server;
+use crate::json;
+
+#[actix_rt::test]
+async fn error_network_not_enabled() {
+    let server = Server::new().await;
+
+    let (response, code) = server.get_network().await;
+
+    meili_snap::snapshot!(code, @"400 Bad Request");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "message": "Using the /network route requires enabling the `network` experimental feature. See https://github.com/orgs/meilisearch/discussions/805",
+      "code": "feature_not_enabled",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#feature_not_enabled"
+    }
+    "###);
+
+    let (response, code) = server.set_network(json!({"self": "myself"})).await;
+
+    meili_snap::snapshot!(code, @"400 Bad Request");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "message": "Using the /network route requires enabling the `network` experimental feature.
See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} + +#[actix_rt::test] +async fn errors_on_param() { + let server = Server::new().await; + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + // non-existing param + let (response, code) = server.set_network(json!({"selfie": "myself"})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Unknown field `selfie`: expected one of `remotes`, `self`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + } + "###); + + // self not a string + let (response, code) = server.set_network(json!({"self": 42})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.self`: expected a string, but found a positive integer: `42`", + "code": "invalid_network_self", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_self" + } + "###); + + // remotes not an object + let (response, code) = server.set_network(json!({"remotes": 42})).await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes`: expected an object, but found a positive integer: `42`", + "code": "invalid_network_remotes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_remotes" + } + "###); + + // new remote without url + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "searchApiKey": "http://localhost:7700" + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Missing field `.remotes.new.url`", + "code": "missing_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_network_url" + } + "###); + + // remote with url not a string + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "url": 7700 + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes.new.url`: expected a string, but found a positive integer: `7700`", + "code": "invalid_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_url" + } + "###); + + // remote with non-existing param + let (response, code) = server + .set_network(json!({"remotes": { + "new": { + "url": "http://localhost:7700", + "doggo": "Intel the Beagle" + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Unknown field `doggo` inside `.remotes.new`: expected one of `url`, `searchApiKey`", + "code": "invalid_network_remotes", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_remotes" + } + "###); + + // remote with non-string searchApiKey + let (response, code) = server + 
.set_network(json!({"remotes": { + "new": { + "url": "http://localhost:7700", + "searchApiKey": 1204664602099962445u64, + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Invalid value type at `.remotes.new.searchApiKey`: expected a string, but found a positive integer: `1204664602099962445`", + "code": "invalid_network_search_api_key", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_search_api_key" + } + "###); + + // setting `null` on URL a posteriori + let (response, code) = server + .set_network(json!({"remotes": { + "kefir": { + "url": "http://localhost:7700", + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": { + "kefir": { + "url": "http://localhost:7700", + "searchApiKey": null + } + } + } + "###); + let (response, code) = server + .set_network(json!({"remotes": { + "kefir": { + "url": Null, + } + }})) + .await; + + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Field `.remotes.kefir.url` cannot be set to `null`", + "code": "invalid_network_url", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_network_url" + } + "###); +} + +#[actix_rt::test] +async fn auth() { + let mut server = Server::new_auth().await; + server.use_api_key("MASTER_KEY"); + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + let (get_network_key, code) = server + .add_api_key(json!({ + "actions": ["network.get"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let get_network_key = get_network_key["key"].clone(); + + let (update_network_key, code) = server + .add_api_key(json!({ + "actions": ["network.update"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let update_network_key = update_network_key["key"].clone(); + + let (search_api_key, code) = server + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key = search_api_key["key"].clone(); + + // try with master key + let (response, code) = server + .set_network(json!({ + "self": "master" + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "master", + "remotes": {} + } + "###); + + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "master", + "remotes": {} +} +"###); + + // try get with get permission + server.use_api_key(get_network_key.as_str().unwrap()); + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "master", + "remotes": {} +} +"###); + + // try update with update permission + server.use_api_key(update_network_key.as_str().unwrap()); + + let (response, code) = server + .set_network(json!({ + "self": 
"api_key" + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" +{ + "self": "api_key", + "remotes": {} +} +"###); + + // try with the other's permission + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + + server.use_api_key(get_network_key.as_str().unwrap()); + let (response, code) = server + .set_network(json!({ + "self": "get_api_key" + })) + .await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + // try either with bad permission + server.use_api_key(search_api_key.as_str().unwrap()); + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); + + let (response, code) = server + .set_network(json!({ + "self": "get_api_key" + })) + .await; + + meili_snap::snapshot!(code, @"403 Forbidden"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "The provided API key is invalid.", + "code": "invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#invalid_api_key" + } + "###); +} + +#[actix_rt::test] +async fn get_and_set_network() { + let server = Server::new().await; + + let (response, code) = server.set_features(json!({"network": true})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["network"]), @r#"true"#); + + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": {} + } + "###); + + // adding self + let (response, code) = server.set_network(json!({"self": "myself"})).await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": {} + } + "###); + + // adding remotes + let (response, code) = server + .set_network(json!({"remotes": { + "myself": { + "url": "http://localhost:7700" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "foo" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": "http://localhost:7700", + "searchApiKey": null + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "foo" + } + } + } + "###); + + // partially updating one remote + let (response, code) = server + .set_network(json!({"remotes": { + "thy": { + "searchApiKey": "bar" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": "http://localhost:7700", + "searchApiKey": 
null + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // adding one remote + let (response, code) = server + .set_network(json!({"remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + } + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "myself": { + "url": "http://localhost:7700", + "searchApiKey": null + }, + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // deleting one remote + let (response, code) = server + .set_network(json!({"remotes": { + "myself": Null, + }})) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "myself", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // removing self + let (response, code) = server.set_network(json!({"self": Null})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": null, + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // setting self again + let (response, code) = server.set_network(json!({"self": "thy"})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // doing nothing + let (response, code) = server.set_network(json!({})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // still doing nothing + let (response, code) = server.set_network(json!({"remotes": {}})).await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // good time to check GET + let (response, code) = server.get_network().await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": { + "them": { + "url": "http://localhost:7702", + "searchApiKey": "baz" + }, + "thy": { + "url": "http://localhost:7701", + "searchApiKey": "bar" + } + } + } + "###); + + // deleting everything + let (response, code) = server + .set_network(json!({ + "remotes": Null, + })) + .await; + + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "self": "thy", + "remotes": {} + } + "###); +} diff --git a/crates/meilisearch/tests/search/distinct.rs b/crates/meilisearch/tests/search/distinct.rs index 2023c01a8..094ef7bbf 
100644 --- a/crates/meilisearch/tests/search/distinct.rs +++ b/crates/meilisearch/tests/search/distinct.rs @@ -151,8 +151,8 @@ async fn distinct_search_with_offset_no_ranking() { let documents = DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; - index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; - index.wait_task(1).await; + let (task, _status_code) = index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; + index.wait_task(task.uid()).await.succeeded(); fn get_hits(response: &Value) -> Vec<&str> { let hits_array = response["hits"].as_array().unwrap(); @@ -210,8 +210,8 @@ async fn distinct_search_with_pagination_no_ranking() { let documents = DOCUMENTS.clone(); index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await; - index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; - index.wait_task(1).await; + let (task, _status_code) = index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await; + index.wait_task(task.uid()).await.succeeded(); fn get_hits(response: &Value) -> Vec<&str> { let hits_array = response["hits"].as_array().unwrap(); diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index 6840f8fba..2b63a07b1 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -1,8 +1,10 @@ use meili_snap::*; -use crate::common::{shared_does_not_exists_index, Server}; +use crate::common::{shared_does_not_exists_index, Server, DOCUMENTS, NESTED_DOCUMENTS}; use crate::json; +use super::test_settings_documents_indexing_swapping_and_search; + #[actix_rt::test] async fn search_unexisting_index() { let index = shared_does_not_exists_index().await; @@ -430,7 +432,7 @@ async fn search_non_filterable_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute is `title`.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. Available filterable attributes patterns are: `title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -441,7 +443,7 @@ async fn search_non_filterable_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute is `title`.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. Available filterable attributes patterns are: `title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -461,7 +463,7 @@ async fn search_non_filterable_facets_multiple_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. 
Available filterable attributes patterns are: `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -472,7 +474,7 @@ async fn search_non_filterable_facets_multiple_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. Available filterable attributes patterns are: `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -491,7 +493,7 @@ async fn search_non_filterable_facets_no_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, this index does not have configured filterable attributes.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -502,7 +504,7 @@ async fn search_non_filterable_facets_no_filterable() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, this index does not have configured filterable attributes.", + "message": "Invalid facet distribution: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -522,7 +524,7 @@ async fn search_non_filterable_facets_multiple_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution: Attributes `doggo, neko` are not filterable. Available filterable attributes patterns are: `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -533,7 +535,7 @@ async fn search_non_filterable_facets_multiple_facets() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid facet distribution, attributes `doggo, neko` are not filterable. The available filterable attributes are `genres, title`.", + "message": "Invalid facet distribution: Attributes `doggo, neko` are not filterable. 
Available filterable attributes patterns are: `genres, title`.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -636,14 +638,11 @@ async fn search_bad_matching_strategy() { #[actix_rt::test] async fn filter_invalid_syntax_object() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - index - .search(json!({"filter": "title & Glass"}), |response, code| { + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "title & Glass"}), + |response, code| { snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", @@ -653,20 +652,18 @@ async fn filter_invalid_syntax_object() { } "###); snapshot!(code, @"400 Bad Request"); - }) - .await; + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_syntax_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - index - .search(json!({"filter": ["title & Glass"]}), |response, code| { + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["title & Glass"]}), + |response, code| { snapshot!(response, @r###" { "message": "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `title & Glass`.\n1:14 title & Glass", @@ -676,206 +673,327 @@ async fn filter_invalid_syntax_array() { } "###); snapshot!(code, @"400 Bad Request"); - }) - .await; + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_syntax_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "Found unexpected characters at the end of the filter: `XOR title = Glass`. You probably forgot an `OR` or an `AND` rule.\n15:32 title = Glass XOR title = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "title = Glass XOR title = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "title = Glass XOR title = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Found unexpected characters at the end of the filter: `XOR title = Glass`. 
You probably forgot an `OR` or an `AND` rule.\n15:32 title = Glass XOR title = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["many = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["many = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_invalid_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "many = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "many = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geo = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geo = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geo = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geo = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_attribute_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geoDistance = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geoDistance = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_attribute_string() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geoDistance = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geoDistance = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_point_array() { - let server = Server::new_shared(); - let index = server.unique_index(); - - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; - - let expected_response = json!({ - "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": ["_geoPoint = Glass"]}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": ["_geoPoint = Glass"]}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; } #[actix_rt::test] async fn filter_reserved_geo_point_string() { - let server = Server::new_shared(); - let index = server.unique_index(); + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["title"]}), + &json!({"filter": "_geoPoint = Glass"}), + |response, code| { + snapshot!(response, @r###" + { + "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + snapshot!(code, @"400 Bad Request"); + }, + ) + .await; +} - let (task, _code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - index.wait_task(task.uid()).await; +#[actix_rt::test] +async fn search_with_pattern_filter_settings_errors() { + // Check if the Equality filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r#" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`\n - Hint: enable equality in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `cattos` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); + }, + ) + .await; - let expected_response = json!({ - "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - }); - index - .search(json!({"filter": "_geoPoint = Glass"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); - }) - .await; + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "cattos IN [pĆ©sti, simba]" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r#" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`\n - Hint: enable equality in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `cattos` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); + }, +) +.await; + + // Check if the Comparison filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["cattos","doggos.age"]}]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r#" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`\n - Hint: enable comparison in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `doggos.age` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r#" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`\n - Hint: enable comparison in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `doggos.age` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + 
&NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "doggos.age 2 TO 4" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r#" + { + "message": "Index `test`: Filter operator `TO` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`\n - Hint: enable comparison in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `doggos.age` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); + }, + ) + .await; } #[actix_rt::test] @@ -884,7 +1002,7 @@ async fn sort_geo_reserved_attribute() { let index = server.unique_index(); let (task, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", @@ -911,7 +1029,7 @@ async fn sort_reserved_attribute() { let index = server.unique_index(); let (task, _code) = index.update_settings(json!({"sortableAttributes": ["id"]})).await; - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoDistance` is a reserved keyword and thus can't be used as a sort expression.", @@ -940,7 +1058,7 @@ async fn sort_unsortable_attribute() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "Attribute `title` is not sortable. Available sortable attributes are: `id`.", + "message": format!("Index `{}`: Attribute `title` is not sortable. Available sortable attributes are: `id`.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -998,7 +1116,7 @@ async fn sort_unset_ranking_rule() { index.wait_task(response.uid()).await.succeeded(); let expected_response = json!({ - "message": "You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", + "message": format!("Index `{}`: You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", index.uid), "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1018,86 +1136,80 @@ async fn sort_unset_ranking_rule() { #[actix_rt::test] async fn search_on_unknown_field() { - let server = Server::new_shared(); - let index = server.unique_index(); - let (response, _code) = - index.update_settings_searchable_attributes(json!(["id", "title"])).await; - index.wait_task(response.uid()).await.succeeded(); - - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), - |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. 
Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); - }, - ) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; } #[actix_rt::test] async fn search_on_unknown_field_plus_joker() { - let server = Server::new_shared(); - let index = server.unique_index(); - let (response, _code) = - index.update_settings_searchable_attributes(json!(["id", "title"])).await; - index.wait_task(response.uid()).await.succeeded(); + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}), - |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); - }, - ) - .await; - - index - .search( - json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), - |response, code| { - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", - "code": "invalid_search_attributes_to_search_on", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" - } - "###); - }, - ) - .await; + test_settings_documents_indexing_swapping_and_search( + &DOCUMENTS, + &json!({"searchableAttributes": ["id", "title"]}), + &json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Index `test`: Attribute `unknown` is not searchable. 
Available searchable attributes are: `id, title`.", + "code": "invalid_search_attributes_to_search_on", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on" + } + "###); + }, + ) + .await; } #[actix_rt::test] async fn distinct_at_search_time() { - let server = Server::new_shared(); - let index = server.unique_index(); + let server = Server::new().await; + let index = server.index("test"); let (task, _) = index.create(None).await; index.wait_task(task.uid()).await.succeeded(); + let (response, _code) = + index.add_documents(json!([{"id": 1, "color": "Doggo", "machin": "Action"}]), None).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", "code": "invalid_search_distinct", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" @@ -1105,14 +1217,14 @@ async fn distinct_at_search_time() { "###); let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await; - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes patterns are: `color, machin`.", "code": "invalid_search_distinct", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" @@ -1120,14 +1232,14 @@ async fn distinct_at_search_time() { "###); let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await; - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", + "message": "Index `test`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. 
Available filterable attributes patterns are: `color, <..hidden-attributes>`.", "code": "invalid_search_distinct", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_distinct" diff --git a/crates/meilisearch/tests/search/facet_search.rs b/crates/meilisearch/tests/search/facet_search.rs index 12d2226a9..65e204702 100644 --- a/crates/meilisearch/tests/search/facet_search.rs +++ b/crates/meilisearch/tests/search/facet_search.rs @@ -1,7 +1,9 @@ use meili_snap::snapshot; +use meilisearch::Opt; use once_cell::sync::Lazy; +use tempfile::TempDir; -use crate::common::{Server, Value}; +use crate::common::{default_settings, Server, Value, NESTED_DOCUMENTS}; use crate::json; static DOCUMENTS: Lazy = Lazy::new(|| { @@ -34,6 +36,62 @@ static DOCUMENTS: Lazy = Lazy::new(|| { ]) }); +async fn test_settings_documents_indexing_swapping_and_facet_search( + documents: &Value, + settings: &Value, + query: &Value, + test: impl Fn(Value, actix_http::StatusCode) + std::panic::UnwindSafe + Clone, +) { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (response, code) = index.facet_search(query.clone()).await; + insta::allow_duplicates! { + test(response, code); + } + + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + eprintln!("Settings -> Documents -> test"); + let index = server.index("test"); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (response, code) = index.facet_search(query.clone()).await; + insta::allow_duplicates! 
{ + test(response, code); + } + + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); +} + #[actix_rt::test] async fn simple_facet_search() { let server = Server::new().await; @@ -41,8 +99,8 @@ async fn simple_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -57,6 +115,116 @@ async fn simple_facet_search() { assert_eq!(response["facetHits"].as_array().unwrap().len(), 1); } +#[actix_rt::test] +async fn simple_facet_search_on_movies() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { + "id": 1, + "title": "Carol", + "genres": [ + "Romance", + "Drama" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + }, + { + "id": 2, + "title": "Wonder Woman", + "genres": [ + "Action", + "Adventure" + ], + "color": [ + "green" + ], + "platforms": [ + "MacOS" + ] + }, + { + "id": 3, + "title": "Life of Pi", + "genres": [ + "Adventure", + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 4, + "title": "Mad Max: Fury Road", + "genres": [ + "Adventure", + "Science Fiction" + ], + "color": [ + "red" + ], + "platforms": [ + "MacOS", + "Linux" + ] + }, + { + "id": 5, + "title": "Moana", + "genres": [ + "Fantasy", + "Action" + ], + "color": [ + "red" + ], + "platforms": [ + "Windows" + ] + }, + { + "id": 6, + "title": "Philadelphia", + "genres": [ + "Drama" + ], + "color": [ + "blue" + ], + "platforms": [ + "MacOS", + "Linux", + "Windows" + ] + } + ]); + let (response, code) = + index.update_settings_filterable_attributes(json!(["genres", "color"])).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetQuery": "", "facetName": "genres", "q": "" })).await; + + assert_eq!(code, 200, "{}", response); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":2},{"value":"Adventure","count":3},{"value":"Drama","count":3},{"value":"Fantasy","count":1},{"value":"Romance","count":1},{"value":"Science Fiction","count":1}]"###); +} + #[actix_rt::test] async fn advanced_facet_search() { let server = Server::new().await; @@ -65,8 +233,8 @@ async fn advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "enabled": false })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -89,8 +257,8 @@ async fn more_advanced_facet_search() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; index.update_settings_typo_tolerance(json!({ "disableOnWords": 
["adventre"] })).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "adventre"})).await; @@ -113,8 +281,8 @@ async fn simple_facet_search_with_max_values() { let documents = DOCUMENTS.clone(); index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -135,8 +303,8 @@ async fn simple_facet_search_by_count_with_max_values() { ) .await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -151,8 +319,8 @@ async fn non_filterable_facet_search_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -170,8 +338,8 @@ async fn facet_search_dont_support_words() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "words"})).await; @@ -188,8 +356,8 @@ async fn simple_facet_search_with_sort_by_count() { let documents = DOCUMENTS.clone(); index.update_settings_faceting(json!({ "sortFacetValuesBy": { "*": "count" } })).await; index.update_settings_filterable_attributes(json!(["genres"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; @@ -200,3 +368,583 @@ async fn simple_facet_search_with_sort_by_count() { assert_eq!(hits[0], json!({ "value": "Action", "count": 3 })); assert_eq!(hits[1], json!({ "value": "Adventure", "count": 2 })); } + +#[actix_rt::test] +async fn add_documents_and_deactivate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + 
index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "The facet search is disabled for this index", + "code": "facet_search_disabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" + } + "###); +} + +#[actix_rt::test] +async fn deactivate_facet_search_and_add_documents() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 400, "{}", response); + snapshot!(response, @r###" + { + "message": "The facet search is disabled for this index", + "code": "facet_search_disabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#facet_search_disabled" + } + "###); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_activate_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": true, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn deactivate_facet_search_add_documents_and_reset_facet_search() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "facetSearch": false, + "filterableAttributes": ["genres"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + let documents = DOCUMENTS.clone(); + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "facetSearch": serde_json::Value::Null, + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await; + + assert_eq!(code, 200, "{}", response); + assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn facet_search_with_filterable_attributes_rules() { + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["genres"]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"200 OK"); + 
snapshot!(response["facetHits"], @r###"[{"value":"Action","count":3},{"value":"Adventure","count":2}]"###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": true, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"Action","count":3},{"value":"Adventure","count":2}]"###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": ["doggos.name"]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"bobby","count":1},{"value":"buddy","count":1}]"###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": true, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(response["facetHits"], @r###"[{"value":"bobby","count":1},{"value":"buddy","count":1}]"###); + }, + ).await; +} + +#[actix_rt::test] +async fn facet_search_with_filterable_attributes_rules_errors() { + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": ["genres"]}), + &json!({"facetName": "invalid", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `invalid` is not facet-searchable. Available facet-searchable attributes patterns are: `genres`. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"]}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. Note: this attribute matches rule #0 in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #0 by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching genres with facetSearch: true before rule #0""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": false, "filter": {"equality": true, "comparison": true}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. 
Note: this attribute matches rule #0 in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #0 by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching genres with facetSearch: true before rule #0""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["genres"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "genres", "facetQuery": "a"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `genres` is not facet-searchable. Note: this attribute matches rule #0 in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #0 by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching genres with facetSearch: true before rule #0""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"]}]}), + &json!({"facetName": "invalid.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `invalid.name` is not facet-searchable. This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.""###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": false, "filter": {"equality": true, "comparison": true}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `doggos.name` is not facet-searchable. Note: this attribute matches rule #0 in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #0 by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching doggos.name with facetSearch: true before rule #0""###); + }, + ).await; + + test_settings_documents_indexing_swapping_and_facet_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["doggos.name"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}]}), + &json!({"facetName": "doggos.name", "facetQuery": "b"}), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(response["message"], @r###""Attribute `doggos.name` is not facet-searchable. 
Note: this attribute matches rule #0 in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #0 by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching doggos.name with facetSearch: true before rule #0""###); + }, + ).await; +} + +#[actix_rt::test] +async fn distinct_facet_search_on_movies() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { + "id": 1, + "title": "Carol", + "genres": ["Romance", "Drama", "Blob"], + "color": "crimson" + }, + { + "id": 2, + "title": "Wonder Woman", + "genres": ["Action", "Adventure", "Blob"], + "color": "emerald" + }, + { + "id": 3, + "title": "Life of Pi", + "genres": ["Adventure", "Drama", "Blob"], + "color": "azure" + }, + { + "id": 4, + "title": "Mad Max: Fury Road", + "genres": ["Adventure", "Science Fiction", "Blob"], + "color": "scarlet" + }, + { + "id": 5, + "title": "Moana", + "genres": ["Fantasy", "Action", "Blob"], + "color": "coral" + }, + { + "id": 6, + "title": "Philadelphia", + "genres": ["Drama", "Blob"], + "color": "navy" + }, + { + "id": 7, + "title": "The Matrix", + "genres": ["Science Fiction", "Action", "Blob"], + "color": "onyx" + }, + { + "id": 8, + "title": "Inception", + "genres": ["Science Fiction", "Thriller", "Blob"], + "color": "cerulean" + }, + { + "id": 9, + "title": "The Shawshank Redemption", + "genres": ["Drama", "Blob"], + "color": "slate" + }, + { + "id": 10, + "title": "Pulp Fiction", + "genres": ["Crime", "Drama", "Blob"], + "color": "gold" + }, + { + "id": 11, + "title": "The Dark Knight", + "genres": ["Action", "Crime", "Blob"], + "color": "obsidian" + }, + { + "id": 12, + "title": "Forrest Gump", + "genres": ["Drama", "Romance", "Blob"], + "color": "jade" + }, + { + "id": 13, + "title": "The Godfather", + "genres": ["Crime", "Drama", "Blob"], + "color": "sepia" + }, + { + "id": 14, + "title": "Fight Club", + "genres": ["Drama", "Thriller", "Blob"], + "color": "ruby" + }, + { + "id": 15, + "title": "Goodfellas", + "genres": ["Crime", "Biography", "Blob"], + "color": "charcoal" + }, + { + "id": 16, + "title": "The Silence of the Lambs", + "genres": ["Crime", "Thriller", "Blob"], + "color": "amethyst" + }, + { + "id": 17, + "title": "Schindler's List", + "genres": ["Biography", "Drama", "Blob"], + "color": "ebony" + }, + { + "id": 18, + "title": "The Lord of the Rings", + "genres": ["Adventure", "Fantasy", "Blob"], + "color": "forest" + }, + { + "id": 19, + "title": "Star Wars", + "genres": ["Science Fiction", "Adventure", "Blob"], + "color": "amber" + }, + { + "id": 20, + "title": "Jurassic Park", + "genres": ["Adventure", "Science Fiction", "Blob"], + "color": "lime" + }, + { + "id": 21, + "title": "Titanic", + "genres": ["Drama", "Romance", "Blob"], + "color": "sapphire" + }, + { + "id": 22, + "title": "The Avengers", + "genres": ["Action", "Science Fiction", "Blob"], + "color": "burgundy" + }, + { + "id": 23, + "title": "Avatar", + "genres": ["Science Fiction", "Adventure", "Blob"], + "color": "turquoise" + }, + { + "id": 24, + "title": "The Green Mile", + "genres": ["Crime", "Fantasy", "Blob"], + "color": "emerald" + }, + { + "id": 25, + "title": "Gladiator", + "genres": ["Action", "Drama", "Blob"], + "color": "sepia" + }, + { + "id": 26, + "title": "The Departed", + "genres": ["Crime", "Thriller", "Blob"], + "color": "crimson" + }, + { + "id": 27, + "title": "Saving Private Ryan", + "genres": ["Drama", "War", "Blob"], + "color": "slate" + }, + { + "id": 28, + "title": "Interstellar", + 
"genres": ["Science Fiction", "Adventure", "Blob"], + "color": "azure" + }, + { + "id": 29, + "title": "The Pianist", + "genres": ["Biography", "Drama", "Blob"], + "color": "onyx" + }, + { + "id": 30, + "title": "The Usual Suspects", + "genres": ["Crime", "Mystery", "Blob"], + "color": "charcoal" + }, + { + "id": 31, + "title": "The Sixth Sense", + "genres": ["Mystery", "Thriller", "Blob"], + "color": "amethyst" + }, + { + "id": 32, + "title": "The Princess Bride", + "genres": ["Adventure", "Romance", "Blob"], + "color": "ruby" + }, + { + "id": 33, + "title": "Blade Runner", + "genres": ["Science Fiction", "Noir", "Blob"], + "color": "sapphire" + }, + { + "id": 34, + "title": "The Big Lebowski", + "genres": ["Comedy", "Crime", "Blob"], + "color": "gold" + }, + { + "id": 35, + "title": "Good Will Hunting", + "genres": ["Drama", "Romance", "Blob"], + "color": "turquoise" + }, + { + "id": 36, + "title": "The Terminator", + "genres": ["Action", "Science Fiction", "Blob"], + "color": "obsidian" + }, + { + "id": 37, + "title": "Casablanca", + "genres": ["Drama", "Romance", "Blob"], + "color": "jade" + }, + { + "id": 38, + "title": "The Exorcist", + "genres": ["Horror", "Thriller", "Blob"], + "color": "burgundy" + }, + { + "id": 39, + "title": "Apocalypse Now", + "genres": ["Drama", "War", "Blob"], + "color": "forest" + }, + { + "id": 40, + "title": "Back to the Future", + "genres": ["Adventure", "Comedy", "Blob"], + "color": "amber" + }, + { + "id": 41, + "title": "The Graduate", + "genres": ["Comedy", "Drama", "Blob"], + "color": "azure" + }, + { + "id": 42, + "title": "Alien", + "genres": ["Horror", "Science Fiction", "Blob"], + "color": "obsidian" + }, + { + "id": 43, + "title": "The Breakfast Club", + "genres": ["Drama", "Comedy", "Blob"], + "color": "coral" + }, + { + "id": 44, + "title": "Die Hard", + "genres": ["Action", "Thriller", "Blob"], + "color": "scarlet" + }, + { + "id": 45, + "title": "The Sound of Music", + "genres": ["Drama", "Musical", "Blob"], + "color": "emerald" + }, + { + "id": 46, + "title": "Jaws", + "genres": ["Horror", "Thriller", "Blob"], + "color": "navy" + }, + { + "id": 47, + "title": "Rocky", + "genres": ["Drama", "Sport", "Blob"], + "color": "burgundy" + }, + { + "id": 48, + "title": "E.T. the Extra-Terrestrial", + "genres": ["Adventure", "Science Fiction", "Blob"], + "color": "amber" + }, + { + "id": 49, + "title": "The Godfather Part II", + "genres": ["Crime", "Drama", "Blob"], + "color": "sepia" + }, + { + "id": 50, + "title": "One Flew Over the Cuckoo's Nest", + "genres": ["Drama", "Blob"], + "color": "slate" + } + ]); + let (response, code) = + index.update_settings_filterable_attributes(json!(["genres", "color"])).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + let (response, code) = index.update_settings_distinct_attribute(json!("color")).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + let (response, code) = + index.facet_search(json!({"facetQuery": "blob", "facetName": "genres", "q": "" })).await; + + // non-exhaustive facet count is counting 27 documents with the facet query "blob" but there are only 23 documents with a distinct color. 
+ assert_eq!(code, 200, "{}", response); + snapshot!(response["facetHits"], @r###"[{"value":"Blob","count":27}]"###); + + let (response, code) = + index.facet_search(json!({"facetQuery": "blob", "facetName": "genres", "q": "", "exhaustiveFacetCount": true })).await; + + // exhaustive facet count is counting 23 documents with the facet query "blob" which is the number of distinct colors. + assert_eq!(code, 200, "{}", response); + snapshot!(response["facetHits"], @r###"[{"value":"Blob","count":23}]"###); +} diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs new file mode 100644 index 000000000..4219d2ec1 --- /dev/null +++ b/crates/meilisearch/tests/search/filters.rs @@ -0,0 +1,758 @@ +use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; +use tempfile::TempDir; + +use super::test_settings_documents_indexing_swapping_and_search; +use crate::{ + common::{default_settings, shared_index_with_documents, Server, DOCUMENTS, NESTED_DOCUMENTS}, + json, +}; + +#[actix_rt::test] +async fn search_with_filter_string_notation() { + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + let (task, code) = index.add_documents(documents, None).await; + meili_snap::snapshot!(code, @"202 Accepted"); + let res = index.wait_task(task.uid()).await; + meili_snap::snapshot!(res["status"], @r###""succeeded""###); + + index + .search( + json!({ + "filter": "title = GlƤss" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + }, + ) + .await; + + let index = server.index("nested"); + + let (_, code) = + index.update_settings(json!({"filterableAttributes": ["cattos", "doggos.age"]})).await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = NESTED_DOCUMENTS.clone(); + let (task, code) = index.add_documents(documents, None).await; + meili_snap::snapshot!(code, @"202 Accepted"); + let res = index.wait_task(task.uid()).await; + meili_snap::snapshot!(res["status"], @r###""succeeded""###); + + index + .search( + json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + assert_eq!(response["hits"][0]["id"], json!(852)); + }, + ) + .await; + + index + .search( + json!({ + "filter": "doggos.age > 5" + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 2); + assert_eq!(response["hits"][0]["id"], json!(654)); + assert_eq!(response["hits"][1]["id"], json!(951)); + }, + ) + .await; +} + +#[actix_rt::test] +async fn search_with_filter_array_notation() { + let index = shared_index_with_documents().await; + let (response, code) = index + .search_post(json!({ + "filter": ["title = GlƤss"] + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + + let (response, code) = index + .search_post(json!({ + "filter": [["title = GlƤss", "title = \"Shazam!\"", "title = \"Escape Room\""]] + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 3); +} + +#[actix_rt::test] +async fn search_with_contains_filter() { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { + 
experimental_contains_filter: true, + ..default_settings(temp.path()) + }) + .await + .unwrap(); + let index = server.index("movies"); + + index.update_settings(json!({"filterableAttributes": ["title"]})).await; + + let documents = DOCUMENTS.clone(); + let (request, _code) = index.add_documents(documents, None).await; + index.wait_task(request.uid()).await.succeeded(); + + let (response, code) = index + .search_post(json!({ + "filter": "title CONTAINS cap" + })) + .await; + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 2); +} + +#[actix_rt::test] +async fn search_with_pattern_filter_settings() { + // Check if the Equality filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{"attributePatterns": ["cattos","doggos.age"]}]}), + &json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]}), + &json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter works with patterns + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestifĆ©rĆ©" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn search_with_pattern_filter_settings_scenario_1() { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(NESTED_DOCUMENTS.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + let (task, code) = index + .update_settings(json!({"filterableAttributes": 
[{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter returns an error + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`\n - Hint: enable comparison in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `doggos.age` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Update the settings activate comparison filter + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": true} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter works + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestifĆ©rĆ©" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; + + // Update the settings deactivate equality filter + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": false, "comparison": true} + } + }]})) + .await; + assert_eq!(code, 202, 
"{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter returns an error + index + .search( + json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `=` is not allowed for the attribute `cattos`.\n - Note: allowed operators: OR, AND, NOT, <, >, <=, >=, TO, IS EMPTY, IS NULL, EXISTS.\n - Note: field `cattos` matched rule #0 in `filterableAttributes`\n - Hint: enable equality in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `cattos` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Check if the Comparison filter works + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + }, + { + "id": 654, + "father": "pierre", + "mother": "sabine", + "doggos": [ + { + "name": "gros bill", + "age": 8 + } + ], + "cattos": [ + "simba", + "pestifĆ©rĆ©" + ] + }, + { + "id": 951, + "father": "jean-baptiste", + "mother": "sophie", + "doggos": [ + { + "name": "turbo", + "age": 5 + }, + { + "name": "fast", + "age": 6 + } + ], + "cattos": [ + "moumoute", + "gomez" + ] + } + ] + "###); + }, + ) + .await; + + // rollback the settings + let (task, code) = index + .update_settings(json!({"filterableAttributes": [{ + "attributePatterns": ["cattos","doggos.age"], + "features": { + "facetSearch": false, + "filter": {"equality": true, "comparison": false} + } + }]})) + .await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // Check if the Equality filter works + index + .search( + json!({ + "filter": "cattos = pĆ©sti" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // Check if the Comparison filter returns an error + index + .search( + json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Filter operator `>` is not allowed for the attribute `doggos.age`.\n - Note: allowed operators: OR, AND, NOT, =, !=, IN, IS EMPTY, IS NULL, EXISTS.\n - Note: field `doggos.age` matched rule #0 in `filterableAttributes`\n - Hint: enable comparison in rule #0 by modifying the features.filter object\n - Hint: prepend another rule matching `doggos.age` with appropriate filter features before rule #0", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn test_filterable_attributes_priority() { + // Test that the filterable attributes priority is respected + + // 
check if doggos.name is filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos.a*"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.name = bobby" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // check if doggos.name is filterable 2 + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.name = bobby" + }), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 852, + "father": "jean", + "mother": "michelle", + "doggos": [ + { + "name": "bobby", + "age": 2 + }, + { + "name": "buddy", + "age": 4 + } + ], + "cattos": "pĆ©sti" + } + ] + "###); + }, + ) + .await; + + // check if doggos.age is not filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos.a*"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos.age > 2" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `doggos.age` is not filterable. Available filterable attribute patterns are: `doggos.*`.\n1:11 doggos.age > 2", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // check if doggos is not filterable + test_settings_documents_indexing_swapping_and_search( + &NESTED_DOCUMENTS, + &json!({"filterableAttributes": [ + // deactivated filter + {"attributePatterns": ["doggos"], "features": {"facetSearch": false, "filter": {"equality": false, "comparison": false}}}, + // activated filter + {"attributePatterns": ["doggos.*"]}, + ]}), + &json!({ + "filter": "doggos EXISTS" + }), + |response, code| { + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `doggos` is not filterable. 
Available filterable attribute patterns are: `doggos.*`.\n1:7 doggos EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/search/formatted.rs b/crates/meilisearch/tests/search/formatted.rs index ee33939fd..2b9383034 100644 --- a/crates/meilisearch/tests/search/formatted.rs +++ b/crates/meilisearch/tests/search/formatted.rs @@ -4,6 +4,58 @@ use super::*; use crate::common::Server; use crate::json; +#[actix_rt::test] +async fn search_formatted_from_sdk() { + let server = Server::new_shared(); + let index = server.unique_index(); + + index + .update_settings( + json!({ "filterableAttributes": ["genre"], "searchableAttributes": ["title"] }), + ) + .await; + + let documents = json!([ + { "id": 123, "title": "Pride and Prejudice", "genre": "romance" }, + { "id": 456, "title": "Le Petit Prince", "genre": "adventure" }, + { "id": 1, "title": "Alice In Wonderland", "genre": "adventure" }, + { "id": 2, "title": "Le Rouge et le Noir", "genre": "romance" }, + { "id": 1344, "title": "The Hobbit", "genre": "adventure" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "genre": "fantasy" }, + { "id": 7, "title": "Harry Potter and the Chamber of Secrets", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy" } + ]); + let (response, _) = index.add_documents(documents, None).await; + index.wait_task(response.uid()).await; + + index + .search( + json!({ "q":"prince", + "attributesToCrop": ["title"], + "cropLength": 2, + "filter": "genre = adventure", + "attributesToHighlight": ["title"], + "attributesToRetrieve": ["title"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + allow_duplicates! { + assert_json_snapshot!(response["hits"][0], + { "._rankingScore" => "[score]" }, + @r###" + { + "title": "Le Petit Prince", + "_formatted": { + "title": "…Petit Prince" + } + } + "###); + } + }, + ) + .await; +} + #[actix_rt::test] async fn formatted_contain_wildcard() { let server = Server::new_shared(); @@ -13,7 +65,7 @@ async fn formatted_contain_wildcard() { let documents = NESTED_DOCUMENTS.clone(); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index.search(json!({ "q": "pĆ©sti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "showMatchesPosition": true }), |response, code| @@ -22,7 +74,7 @@ async fn formatted_contain_wildcard() { allow_duplicates! { assert_json_snapshot!(response["hits"][0], { "._rankingScore" => "[score]" }, - @r###" + @r#" { "_formatted": { "id": "852", @@ -32,12 +84,12 @@ async fn formatted_contain_wildcard() { "cattos": [ { "start": 0, - "length": 5 + "length": 6 } ] } } - "###); + "#); } } ) @@ -67,7 +119,7 @@ async fn formatted_contain_wildcard() { allow_duplicates! 
{ assert_json_snapshot!(response["hits"][0], { "._rankingScore" => "[score]" }, - @r###" + @r#" { "id": 852, "cattos": "pĆ©sti", @@ -79,12 +131,12 @@ async fn formatted_contain_wildcard() { "cattos": [ { "start": 0, - "length": 5 + "length": 6 } ] } } - "###) + "#) } }) .await; @@ -346,7 +398,7 @@ async fn displayedattr_2_smol() { let documents = NESTED_DOCUMENTS.clone(); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index .search(json!({ "attributesToRetrieve": ["father", "id"], "attributesToHighlight": ["mother"], "attributesToCrop": ["cattos"] }), @@ -544,7 +596,7 @@ async fn test_cjk_highlight() { { "id": 1, "title": "å¤§å«åˆ°äŗ†ę‰«ē½—é‚£é‡Œ" }, ]); let (response, _) = index.add_documents(documents, None).await; - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index .search(json!({"q": "恧", "attributesToHighlight": ["title"]}), |response, code| { diff --git a/crates/meilisearch/tests/search/geo.rs b/crates/meilisearch/tests/search/geo.rs index e92056191..a314ca241 100644 --- a/crates/meilisearch/tests/search/geo.rs +++ b/crates/meilisearch/tests/search/geo.rs @@ -1,9 +1,12 @@ use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::constants::RESERVED_GEO_FIELD_NAME; use once_cell::sync::Lazy; use crate::common::{Server, Value}; use crate::json; +use super::test_settings_documents_indexing_swapping_and_search; + static DOCUMENTS: Lazy = Lazy::new(|| { json!([ { @@ -46,8 +49,8 @@ async fn geo_sort_with_geo_strings() { let documents = DOCUMENTS.clone(); index.update_settings_filterable_attributes(json!(["_geo"])).await; index.update_settings_sortable_attributes(json!(["_geo"])).await; - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -128,7 +131,7 @@ async fn bug_4640() { index.add_documents(documents, None).await; index.update_settings_filterable_attributes(json!(["_geo"])).await; let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await; - index.wait_task(ret.uid()).await; + index.wait_task(ret.uid()).await.succeeded(); // Sort the document with the second one first index @@ -184,3 +187,184 @@ async fn bug_4640() { ) .await; } + +#[actix_rt::test] +async fn geo_asc_with_words() { + let documents = json!([ + { "id": 0, "doggo": "jean", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "jean"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 0, + "doggo": "jean", + "_geo": { + "lat": 0, + "lng": 0 + } + }, + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + } + }, + { + "id": 3, + "doggo": "jean michel", + "_geo": { + "lat": 0, + "lng": 178 + } + } + ], + 
"query": "jean", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "bob"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + } + }, + { + "id": 4, + "doggo": "bob marley", + "_geo": { + "lat": 0, + "lng": -179 + } + } + ], + "query": "bob", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "geo:asc"]}), + &json!({"q": "intel"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 1, + "doggo": "intel", + "_geo": { + "lat": 88, + "lng": 0 + } + } + ], + "query": "intel", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "###); + }, + ) + .await; +} + +#[actix_rt::test] +async fn geo_sort_with_words() { + let documents = json!([ + { "id": 0, "doggo": "jean", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({"searchableAttributes": ["id", "doggo"], "rankingRules": ["words", "sort"], "sortableAttributes": [RESERVED_GEO_FIELD_NAME]}), + &json!({"q": "jean", "sort": ["_geoPoint(0.0, 0.0):asc"]}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "id": 0, + "doggo": "jean", + "_geo": { + "lat": 0, + "lng": 0 + }, + "_geoDistance": 0 + }, + { + "id": 2, + "doggo": "jean bob", + "_geo": { + "lat": -89, + "lng": 0 + }, + "_geoDistance": 9896348 + }, + { + "id": 3, + "doggo": "jean michel", + "_geo": { + "lat": 0, + "lng": 178 + }, + "_geoDistance": 19792697 + } + ], + "query": "jean", + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index 00a65d9aa..3282a357a 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -11,49 +11,23 @@ async fn index_with_documents_user_provided<'a>( ) -> Index<'a> { let index = server.index("test"); - let (response, code) = server.set_features(json!({"vectorStore": true})).await; - - meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); - let (response, code) = index .update_settings(json!({ "embedders": {"default": 
{ "source": "userProvided", "dimensions": 2}}} )) .await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.add_documents(documents.clone(), None).await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index } async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> Index<'a> { let index = server.index("test"); - let (response, code) = server.set_features(json!({"vectorStore": true})).await; - - meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); - let (response, code) = index .update_settings(json!({ "embedders": {"default": { "source": "huggingFace", @@ -63,11 +37,11 @@ async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> I }}} )) .await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.add_documents(documents.clone(), None).await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); index } @@ -260,7 +234,7 @@ async fn distribution_shift() { snapshot!(code, @"202 Accepted"); let response = server.wait_task(response.uid()).await; - snapshot!(response["details"], @r###"{"embedders":{"default":{"distribution":{"mean":0.998,"sigma":0.01}}}}"###); + snapshot!(response["details"], @r#"{"embedders":{"default":{"distribution":{"mean":0.998,"sigma":0.01}}}}"#); let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); @@ -573,7 +547,7 @@ async fn retrieve_vectors() { .update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} )) .await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index .search_post( @@ -623,7 +597,7 @@ async fn retrieve_vectors() { let (response, code) = index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; assert_eq!(202, code, "{:?}", response); - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index .search_post( diff --git a/crates/meilisearch/tests/search/locales.rs b/crates/meilisearch/tests/search/locales.rs index c01d854e2..282589d6a 100644 --- a/crates/meilisearch/tests/search/locales.rs +++ b/crates/meilisearch/tests/search/locales.rs @@ -98,8 +98,8 @@ async fn simple_search() { json!({"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"]}), ) .await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // english index @@ -220,8 +220,8 @@ async fn force_locales() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // chinese detection index @@ -298,8 +298,8 @@ 
async fn force_locales_with_pattern() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // chinese detection index @@ -374,8 +374,8 @@ async fn force_locales_with_pattern_nested() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // chinese index @@ -449,8 +449,8 @@ async fn force_different_locales_with_pattern() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // force chinese index @@ -527,8 +527,8 @@ async fn auto_infer_locales_at_search_with_attributes_to_search_on() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // auto infer any language index @@ -601,8 +601,8 @@ async fn auto_infer_locales_at_search() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -700,8 +700,8 @@ async fn force_different_locales_with_pattern_nested() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); // chinese index @@ -778,8 +778,8 @@ async fn settings_change() { let index = server.index("test"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index .update_settings(json!({ "searchableAttributes": ["document_en", "document_ja", "document_zh"], @@ -798,7 +798,7 @@ async fn settings_change() { "enqueuedAt": "[date]" } "###); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); // chinese index @@ -861,7 +861,7 @@ async fn settings_change() { "enqueuedAt": "[date]" } "###); - index.wait_task(2).await; + index.wait_task(response.uid()).await.succeeded(); // chinese index @@ -915,8 +915,8 @@ async fn invalid_locales() { json!({"searchableAttributes": ["name_en", "name_ja", "name_zh", "author_en", "author_ja", "author_zh", "description_en", "description_ja", "description_zh"]}), ) .await; - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({"q": "Atta", "locales": ["invalid"]})).await; snapshot!(code, @"400 Bad Request"); @@ -1033,8 +1033,8 @@ async fn simple_facet_search() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, 
_) = index .facet_search(json!({"facetName": "name_zh", "facetQuery": "é€²ę’ƒ", "locales": ["cmn"]})) @@ -1095,8 +1095,8 @@ async fn facet_search_with_localized_attributes() { "enqueuedAt": "[date]" } "###); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index .facet_search(json!({"facetName": "name_zh", "facetQuery": "进击", "locales": ["cmn"]})) @@ -1165,7 +1165,7 @@ async fn swedish_search() { ] })) .await; - index.wait_task(1).await; + index.wait_task(_response.uid()).await.succeeded(); // infer swedish index @@ -1286,7 +1286,7 @@ async fn german_search() { ] })) .await; - index.wait_task(1).await; + index.wait_task(_response.uid()).await.succeeded(); // infer swedish index diff --git a/crates/meilisearch/tests/search/matching_strategy.rs b/crates/meilisearch/tests/search/matching_strategy.rs index a4cb19f62..3b4325c10 100644 --- a/crates/meilisearch/tests/search/matching_strategy.rs +++ b/crates/meilisearch/tests/search/matching_strategy.rs @@ -8,8 +8,8 @@ use crate::json; async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> { let index = server.index("test"); - index.add_documents(documents.clone(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents.clone(), None).await; + index.wait_task(task.uid()).await.succeeded(); index } diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index 8cafe1dd8..d7a09b58e 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -4,6 +4,7 @@ mod distinct; mod errors; mod facet_search; +mod filters; mod formatted; mod geo; mod hybrid; @@ -15,15 +16,64 @@ mod pagination; mod restrict_searchable; mod search_queue; +use meili_snap::{json_string, snapshot}; use meilisearch::Opt; use tempfile::TempDir; use crate::common::{ default_settings, shared_index_with_documents, shared_index_with_nested_documents, Server, - DOCUMENTS, FRUITS_DOCUMENTS, NESTED_DOCUMENTS, SCORE_DOCUMENTS, VECTOR_DOCUMENTS, + Value, DOCUMENTS, FRUITS_DOCUMENTS, NESTED_DOCUMENTS, SCORE_DOCUMENTS, VECTOR_DOCUMENTS, }; use crate::json; +async fn test_settings_documents_indexing_swapping_and_search( + documents: &Value, + settings: &Value, + query: &Value, + test: impl Fn(Value, actix_http::StatusCode) + std::panic::UnwindSafe + Clone, +) { + let temp = TempDir::new().unwrap(); + let server = Server::new_with_options(Opt { ..default_settings(temp.path()) }).await.unwrap(); + + eprintln!("Documents -> Settings -> test"); + let index = server.index("test"); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + index.search(query.clone(), test.clone()).await; + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + eprintln!("Settings -> Documents -> test"); + let index = server.index("test"); + + let (task, code) = 
index.update_settings(settings.clone()).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + let (task, code) = index.add_documents(documents.clone(), None).await; + assert_eq!(code, 202, "{}", task); + let response = index.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); + + index.search(query.clone(), test.clone()).await; + let (task, code) = server.delete_index("test").await; + assert_eq!(code, 202, "{}", task); + let response = server.wait_task(task.uid()).await; + assert!(response.is_success(), "{:?}", response); +} + #[actix_rt::test] async fn simple_placeholder_search() { let index = shared_index_with_documents().await; @@ -62,6 +112,105 @@ async fn simple_search() { .await; } +#[actix_rt::test] +async fn search_with_stop_word() { + // related to https://github.com/meilisearch/meilisearch/issues/4984 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index + .update_settings(json!({"stopWords": ["the", "The", "a", "an", "to", "in", "of"]})) + .await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(1).await; + + // prefix search + index + .search(json!({"q": "to the", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @"[]"); + }) + .await; + + // non-prefix search + index + .search(json!({"q": "to the ", "attributesToHighlight": ["title"], "attributesToRetrieve": ["title"] }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "_formatted": { + "title": "Shazam!" 
+ } + }, + { + "title": "Captain Marvel", + "_formatted": { + "title": "Captain Marvel" + } + }, + { + "title": "Escape Room", + "_formatted": { + "title": "Escape Room" + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "_formatted": { + "title": "How to Train Your Dragon: The Hidden World" + } + }, + { + "title": "GlƤss", + "_formatted": { + "title": "GlƤss" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn search_with_typo_settings() { + // related to https://github.com/meilisearch/meilisearch/issues/5240 + let server = Server::new().await; + let index = server.index("test"); + + let (_, code) = index + .update_settings(json!({"typoTolerance": { "disableOnAttributes": ["title", "id"]}})) + .await; + meili_snap::snapshot!(code, @"202 Accepted"); + + let documents = DOCUMENTS.clone(); + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + + index + .search(json!({"q": "287947" }), |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Shazam!", + "id": "287947", + "color": [ + "green", + "blue" + ] + } + ] + "###); + }) + .await; +} + #[actix_rt::test] async fn phrase_search_with_stop_word() { // related to https://github.com/meilisearch/meilisearch/issues/3521 @@ -72,8 +221,8 @@ async fn phrase_search_with_stop_word() { meili_snap::snapshot!(code, @"202 Accepted"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "how \"to\" train \"the" }), |response, code| { @@ -152,11 +301,12 @@ async fn negative_special_cases_search() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); - index.update_settings(json!({"synonyms": { "escape": ["glƤss"] }})).await; - index.wait_task(1).await; + let (task, _status_code) = + index.update_settings(json!({"synonyms": { "escape": ["glƤss"] }})).await; + index.wait_task(task.uid()).await.succeeded(); // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass index @@ -181,8 +331,8 @@ async fn test_kanji_language_detection() { { "id": 1, "title": "ę±äŗ¬ć®ćŠåÆæåøć€‚" }, { "id": 2, "title": "×”Ö·×©Ö¼××•Ö¼×¢Öø×œ ×”Ö·×žÖøÖ¼×”Ö“×™×Ø (דהַחוּםד) לֹא ×™Öø×›×•Ö¹×œ ל֓קְפֹּׄ 9.94 ×žÖ¶×˜Ö°×ØÖ“×™×, × Öø×›×•Ö¹×Ÿ? ברר, 1.5°C- בַּחוּׄ!" 
} ]); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "ę±äŗ¬"}), |response, code| { @@ -204,11 +354,11 @@ async fn test_thai_language() { { "id": 1, "title": "ąøŖąøšąø¹ą¹ˆąøŖąø”ąøøąø™ą¹„ąøžąø£ąøŠąø²ą¹€ąø‚ąøµąø¢ąø§ 100 กรัด ąøˆąø³ąø™ąø§ąø™ 6 ก้อน" }, { "id": 2, "title": "ąøŖąøšąø¹ą¹ˆąøŖąø”ąøøąø™ą¹„ąøžąø£ąøąø²ąø‡ą¹ąø”ąø‡ąøœąøŖąø”ąø§ą¹ˆąø²ąø™ąø«ąø²ąø‡ąøˆąø£ą¹€ąø‚ą¹‰ 100 กรัด ąøˆąø³ąø™ąø§ąø™ 6 ก้อน" } ]); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); - index.update_settings(json!({"rankingRules": ["exactness"]})).await; - index.wait_task(1).await; + let (task, _status_code) = index.update_settings(json!({"rankingRules": ["exactness"]})).await; + index.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "สบู"}), |response, code| { @@ -254,118 +404,6 @@ async fn search_multiple_params() { .await; } -#[actix_rt::test] -async fn search_with_filter_string_notation() { - let server = Server::new().await; - let index = server.index("test"); - - let (_, code) = index.update_settings(json!({"filterableAttributes": ["title"]})).await; - meili_snap::snapshot!(code, @"202 Accepted"); - - let documents = DOCUMENTS.clone(); - let (_, code) = index.add_documents(documents, None).await; - meili_snap::snapshot!(code, @"202 Accepted"); - let res = index.wait_task(1).await; - meili_snap::snapshot!(res["status"], @r###""succeeded""###); - - index - .search( - json!({ - "filter": "title = GlƤss" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - }, - ) - .await; - - let index = server.index("nested"); - - let (_, code) = - index.update_settings(json!({"filterableAttributes": ["cattos", "doggos.age"]})).await; - meili_snap::snapshot!(code, @"202 Accepted"); - - let documents = NESTED_DOCUMENTS.clone(); - let (_, code) = index.add_documents(documents, None).await; - meili_snap::snapshot!(code, @"202 Accepted"); - let res = index.wait_task(3).await; - meili_snap::snapshot!(res["status"], @r###""succeeded""###); - - index - .search( - json!({ - "filter": "cattos = pĆ©sti" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - assert_eq!(response["hits"][0]["id"], json!(852)); - }, - ) - .await; - - index - .search( - json!({ - "filter": "doggos.age > 5" - }), - |response, code| { - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 2); - assert_eq!(response["hits"][0]["id"], json!(654)); - assert_eq!(response["hits"][1]["id"], json!(951)); - }, - ) - .await; -} - -#[actix_rt::test] -async fn search_with_filter_array_notation() { - let index = shared_index_with_documents().await; - let (response, code) = index - .search_post(json!({ - "filter": ["title = GlƤss"] - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 1); - - let (response, code) = index - .search_post(json!({ - "filter": [["title = GlƤss", "title = \"Shazam!\"", "title = \"Escape Room\""]] - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 3); -} - -#[actix_rt::test] -async fn 
search_with_contains_filter() { - let temp = TempDir::new().unwrap(); - let server = Server::new_with_options(Opt { - experimental_contains_filter: true, - ..default_settings(temp.path()) - }) - .await - .unwrap(); - let index = server.index("movies"); - - index.update_settings(json!({"filterableAttributes": ["title"]})).await; - - let documents = DOCUMENTS.clone(); - let (request, _code) = index.add_documents(documents, None).await; - index.wait_task(request.uid()).await.succeeded(); - - let (response, code) = index - .search_post(json!({ - "filter": "title CONTAINS cap" - })) - .await; - assert_eq!(code, 200, "{}", response); - assert_eq!(response["hits"].as_array().unwrap().len(), 2); -} - #[actix_rt::test] async fn search_with_sort_on_numbers() { let index = shared_index_with_documents().await; @@ -488,7 +526,7 @@ async fn search_facet_distribution() { |response, code| { assert_eq!(code, 200, "{}", response); let dist = response["facetDistribution"].as_object().unwrap(); - assert_eq!(dist.len(), 1); + assert_eq!(dist.len(), 1, "{:?}", dist); assert_eq!( dist["doggos.name"], json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) @@ -505,7 +543,7 @@ async fn search_facet_distribution() { |response, code| { assert_eq!(code, 200, "{}", response); let dist = response["facetDistribution"].as_object().unwrap(); - assert_eq!(dist.len(), 3); + assert_eq!(dist.len(), 3, "{:?}", dist); assert_eq!( dist["doggos.name"], json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) @@ -541,8 +579,8 @@ async fn displayed_attributes() { index.update_settings(json!({ "displayedAttributes": ["title"] })).await; let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.search_post(json!({ "attributesToRetrieve": ["title", "id"] })).await; @@ -556,8 +594,8 @@ async fn placeholder_search_is_hard_limited() { let index = server.index("test"); let documents: Vec<_> = (0..1200).map(|i| json!({ "id": i, "text": "I am unique!" })).collect(); - index.add_documents(documents.into(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents.into(), None).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -584,8 +622,9 @@ async fn placeholder_search_is_hard_limited() { ) .await; - index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; - index.wait_task(1).await; + let (task, _status_code) = + index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -619,8 +658,8 @@ async fn search_is_hard_limited() { let index = server.index("test"); let documents: Vec<_> = (0..1200).map(|i| json!({ "id": i, "text": "I am unique!" 
})).collect(); - index.add_documents(documents.into(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents.into(), None).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -649,8 +688,9 @@ async fn search_is_hard_limited() { ) .await; - index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; - index.wait_task(1).await; + let (task, _status_code) = + index.update_settings(json!({ "pagination": { "maxTotalHits": 10_000 } })).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -688,8 +728,8 @@ async fn faceting_max_values_per_facet() { index.update_settings(json!({ "filterableAttributes": ["number"] })).await; let documents: Vec<_> = (0..10_000).map(|id| json!({ "id": id, "number": id * 10 })).collect(); - index.add_documents(json!(documents), None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(json!(documents), None).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -704,8 +744,9 @@ async fn faceting_max_values_per_facet() { ) .await; - index.update_settings(json!({ "faceting": { "maxValuesPerFacet": 10_000 } })).await; - index.wait_task(2).await; + let (task, _status_code) = + index.update_settings(json!({ "faceting": { "maxValuesPerFacet": 10_000 } })).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -729,7 +770,7 @@ async fn test_score_details() { let documents = DOCUMENTS.clone(); let res = index.add_documents(json!(documents), None).await; - index.wait_task(res.0.uid()).await; + index.wait_task(res.0.uid()).await.succeeded(); index .search( @@ -748,13 +789,6 @@ async fn test_score_details() { "green", "red" ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - }, "_rankingScoreDetails": { "words": { "order": 0, @@ -802,7 +836,7 @@ async fn test_score() { let documents = SCORE_DOCUMENTS.clone(); let res = index.add_documents(json!(documents), None).await; - index.wait_task(res.0.uid()).await; + index.wait_task(res.0.uid()).await.succeeded(); index .search( @@ -855,7 +889,7 @@ async fn test_score_threshold() { let documents = SCORE_DOCUMENTS.clone(); let res = index.add_documents(json!(documents), None).await; - index.wait_task(res.0.uid()).await; + index.wait_task(res.0.uid()).await.succeeded(); index .search( @@ -1011,7 +1045,7 @@ async fn test_degraded_score_details() { index.add_documents(json!(documents), None).await; // We can't really use anything else than 0ms here; otherwise, the test will get flaky. let (res, _code) = index.update_settings(json!({ "searchCutoffMs": 0 })).await; - index.wait_task(res.uid()).await; + index.wait_task(res.uid()).await.succeeded(); index .search( @@ -1089,206 +1123,6 @@ async fn test_degraded_score_details() { .await; } -#[actix_rt::test] -async fn experimental_feature_vector_store() { - let server = Server::new().await; - let index = server.index("test"); - - let documents = DOCUMENTS.clone(); - - index.add_documents(json!(documents), None).await; - index.wait_task(0).await; - - let (response, code) = index - .search_post(json!({ - "vector": [1.0, 2.0, 3.0], - "hybrid": { - "embedder": "manual", - }, - "showRankingScore": true - })) - .await; - - { - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); - } - - index - .search(json!({ - "retrieveVectors": true, - "showRankingScore": true - }), |response, code|{ - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); - }) - .await; - - let (response, code) = server.set_features(json!({"vectorStore": true})).await; - meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(response["vectorStore"], @"true"); - - let (response, code) = index - .update_settings(json!({"embedders": { - "manual": { - "source": "userProvided", - "dimensions": 3, - } - }})) - .await; - - meili_snap::snapshot!(response, @r###" - { - "taskUid": 1, - "indexUid": "test", - "status": "enqueued", - "type": "settingsUpdate", - "enqueuedAt": "[date]" - } - "###); - meili_snap::snapshot!(code, @"202 Accepted"); - let response = index.wait_task(response.uid()).await; - - meili_snap::snapshot!(meili_snap::json_string!(response["status"]), @"\"succeeded\""); - - let (response, code) = index - .search_post(json!({ - "vector": [1.0, 2.0, 3.0], - "hybrid": { - "embedder": "manual", - }, - "showRankingScore": true, - "retrieveVectors": true, - })) - .await; - - meili_snap::snapshot!(code, @"200 OK"); - // vector search returns all documents that don't have vectors in the last bucket, like all sorts - meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###" - [ - { - "title": "Shazam!", - "id": "287947", - "color": [ - "green", - "blue" - ], - "_vectors": { - "manual": { - "embeddings": [ - [ - 1.0, - 2.0, - 3.0 - ] - ], - "regenerate": false - } - }, - "_rankingScore": 1.0 - }, - { - "title": "Captain Marvel", - "id": "299537", - "color": [ - "yellow", - "blue" - ], - "_vectors": { - "manual": { - "embeddings": [ - [ - 1.0, - 2.0, - 54.0 - ] - ], - "regenerate": false - } - }, - "_rankingScore": 0.9129111766815186 - }, - { - "title": "GlƤss", - "id": "450465", - "color": [ - "blue", - "red" - ], - "_vectors": { - "manual": { - "embeddings": [ - [ - -100.0, - 340.0, - 90.0 - ] - ], - "regenerate": false - } - }, - "_rankingScore": 0.8106412887573242 - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "id": "166428", - "color": [ - "green", - "red" - ], - "_vectors": { - "manual": { - "embeddings": [ - [ - -100.0, - 231.0, - 32.0 - ] - ], - "regenerate": false - } - }, - "_rankingScore": 0.7412010431289673 - }, - { - "title": "Escape Room", - "id": "522681", - "color": [ - "yellow", - "red" - ], - "_vectors": { - "manual": { - "embeddings": [ - [ - 10.0, - -23.0, - 32.0 - ] - ], - "regenerate": false - } - }, - "_rankingScore": 0.6972063183784485 - } - ] - "###); -} - #[cfg(feature = "default")] #[actix_rt::test] async fn camelcased_words() { @@ -1303,8 +1137,8 @@ async fn camelcased_words() { { "id": 3, "title": "TestAb" }, { "id": 4, "title": "testab" }, ]); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index 
.search(json!({"q": "deLonghi"}), |response, code| { @@ -1521,13 +1355,14 @@ async fn simple_search_with_strange_synonyms() { let server = Server::new().await; let index = server.index("test"); - index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await; - let r = index.wait_task(0).await; + let (task, _status_code) = + index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await; + let r = index.wait_task(task.uid()).await; meili_snap::snapshot!(r["status"], @r###""succeeded""###); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "How to train"}), |response, code| { @@ -1540,14 +1375,7 @@ async fn simple_search_with_strange_synonyms() { "color": [ "green", "red" - ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - } + ] } ] "###); @@ -1565,14 +1393,7 @@ async fn simple_search_with_strange_synonyms() { "color": [ "green", "red" - ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - } + ] } ] "###); @@ -1590,14 +1411,7 @@ async fn simple_search_with_strange_synonyms() { "color": [ "green", "red" - ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - } + ] } ] "###); @@ -1613,11 +1427,12 @@ async fn change_attributes_settings() { index.update_settings(json!({ "searchableAttributes": ["father", "mother"] })).await; let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(json!(documents), None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(json!(documents), None).await; + index.wait_task(task.uid()).await.succeeded(); - index.update_settings(json!({ "searchableAttributes": ["father", "mother", "doggos"], "filterableAttributes": ["doggos"] })).await; - index.wait_task(2).await; + let (task,_status_code) = + index.update_settings(json!({ "searchableAttributes": ["father", "mother", "doggos"], "filterableAttributes": ["doggos"] })).await; + index.wait_task(task.uid()).await.succeeded(); // search index @@ -1680,3 +1495,344 @@ async fn change_attributes_settings() { ) .await; } + +#[actix_rt::test] +async fn test_nested_fields() { + let documents = json!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ]); + + let settings = json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "filterableAttributes": ["title", "nested.object", "nested.machin"] + }); + + // Test empty search returns all documents + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "document"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + 
] + "###); + }, + ) + .await; + + // Test searching specific documents + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "zeroth"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "first"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + } + ] + "###); + }, + ) + .await; + + // Test searching nested fields + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "field"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "array"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + // nested is not searchable + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"q": "lied"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + // nested is not searchable + snapshot!(json_string!(response["hits"]), @"[]"); + }, + ) + .await; + + // Test filtering on nested fields + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested.object = field"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + } + ] + "###); + }, + ) + .await; + + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested.machin = bidule"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + } + ] + "###); + }, + ) + .await; + + // Test filtering on non-filterable nested field fails + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": "nested = array"}), + |response, code| { + assert_eq!(code, 400, "{}", response); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `nested` is not filterable. 
Available filterable attribute patterns are: `nested.machin`, `nested.object`, `title`.\n1:7 nested = array", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; + + // Test filtering on non-filterable nested field fails + test_settings_documents_indexing_swapping_and_search( + &documents, + &settings, + &json!({"filter": r#"nested = "I lied""#}), + |response, code| { + assert_eq!(code, 400, "{}", response); + snapshot!(json_string!(response), @r###" + { + "message": "Index `test`: Attribute `nested` is not filterable. Available filterable attribute patterns are: `nested.machin`, `nested.object`, `title`.\n1:7 nested = \"I lied\"", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "###); + }, + ) + .await; +} + +/// Modifying facets with different casing should work correctly +#[actix_rt::test] +async fn change_facet_casing() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "filterableAttributes": ["dog"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "Bouvier Bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + let (response, _code) = index + .add_documents( + json!([ + { + "id": 1, + "dog": "bouvier bernois" + } + ]), + None, + ) + .await; + index.wait_task(response.uid()).await; + + index + .search(json!({ "facets": ["dog"] }), |response, code| { + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response["facetDistribution"]), @r###" + { + "dog": { + "bouvier bernois": 1 + } + } + "###); + }) + .await; +} diff --git a/crates/meilisearch/tests/search/multi.rs b/crates/meilisearch/tests/search/multi/mod.rs similarity index 92% rename from crates/meilisearch/tests/search/multi.rs rename to crates/meilisearch/tests/search/multi/mod.rs index 8d7340f0d..8a83fd3c0 100644 --- a/crates/meilisearch/tests/search/multi.rs +++ b/crates/meilisearch/tests/search/multi/mod.rs @@ -5,6 +5,8 @@ use crate::common::Server; use crate::json; use crate::search::{SCORE_DOCUMENTS, VECTOR_DOCUMENTS}; +mod proxy; + #[actix_rt::test] async fn search_empty_list() { let server = Server::new().await; @@ -89,8 +91,8 @@ async fn simple_search_single_index() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -110,14 +112,7 @@ async fn simple_search_single_index() { "color": [ "blue", "red" - ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - } + ] } ], "query": "glass", @@ -135,14 +130,7 @@ async fn simple_search_single_index() { "color": [ "yellow", "blue" - ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - } + ] } ], "query": "captain", @@ -161,8 +149,8 @@ async fn federation_single_search_single_index() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + 
index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -180,13 +168,6 @@ async fn federation_single_search_single_index() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 0, @@ -208,8 +189,8 @@ async fn federation_multiple_search_single_index() { let index = server.index("test"); let documents = SCORE_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -283,8 +264,8 @@ async fn federation_two_search_single_index() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -303,13 +284,6 @@ async fn federation_two_search_single_index() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 0, @@ -323,13 +297,6 @@ async fn federation_two_search_single_index() { "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 1, @@ -351,8 +318,8 @@ async fn simple_search_missing_index_uid() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -376,8 +343,8 @@ async fn federation_simple_search_missing_index_uid() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -401,8 +368,8 @@ async fn simple_search_illegal_index_uid() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -426,8 +393,8 @@ async fn federation_search_illegal_index_uid() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -451,13 +418,13 @@ async fn simple_search_two_indexes() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = 
server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (add_task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(add_task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -477,14 +444,7 @@ async fn simple_search_two_indexes() { "color": [ "blue", "red" - ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - } + ] } ], "query": "glass", @@ -510,14 +470,7 @@ async fn simple_search_two_indexes() { "age": 4 } ], - "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - } + "cattos": "pĆ©sti" }, { "id": 654, @@ -532,14 +485,7 @@ async fn simple_search_two_indexes() { "cattos": [ "simba", "pestifĆ©rĆ©" - ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - } + ] } ], "query": "pĆ©sti", @@ -558,13 +504,13 @@ async fn federation_two_search_two_indexes() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -583,13 +529,6 @@ async fn federation_two_search_two_indexes() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 0, @@ -611,13 +550,6 @@ async fn federation_two_search_two_indexes() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -638,13 +570,6 @@ async fn federation_two_search_two_indexes() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -666,18 +591,18 @@ async fn federation_multiple_search_multiple_indexes() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("score"); let documents = SCORE_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -705,13 +630,6 @@ async fn federation_multiple_search_multiple_indexes() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 0, @@ -733,13 +651,6 @@ async fn federation_multiple_search_multiple_indexes() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": 
"nested", "queriesPosition": 2, @@ -771,13 +682,6 @@ async fn federation_multiple_search_multiple_indexes() { "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 1, @@ -791,13 +695,6 @@ async fn federation_multiple_search_multiple_indexes() { "yellow", "red" ], - "_vectors": { - "manual": [ - 10, - -23, - 32 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 3, @@ -822,13 +719,6 @@ async fn federation_multiple_search_multiple_indexes() { "moumoute", "gomez" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 4, @@ -867,13 +757,6 @@ async fn federation_multiple_search_multiple_indexes() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 2, @@ -896,13 +779,6 @@ async fn federation_multiple_search_multiple_indexes() { "green", "red" ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - }, "_federation": { "indexUid": "test", "queriesPosition": 6, @@ -924,8 +800,8 @@ async fn search_one_index_doesnt_exist() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -950,8 +826,8 @@ async fn federation_one_index_doesnt_exist() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -1021,13 +897,13 @@ async fn search_one_query_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1038,7 +914,7 @@ async fn search_one_query_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Invalid facet distribution, this index does not have configured filterable attributes.", + "message": "Inside `.queries[0]`: Invalid facet distribution: Attribute `title` is not filterable. 
This index does not have configured filterable attributes.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -1053,13 +929,13 @@ async fn federation_one_query_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -1070,7 +946,7 @@ async fn federation_one_query_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1085,13 +961,13 @@ async fn federation_one_query_sort_error() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -1102,7 +978,7 @@ async fn federation_one_query_sort_error() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. 
This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1117,13 +993,13 @@ async fn search_multiple_query_errors() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1134,7 +1010,7 @@ async fn search_multiple_query_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Invalid facet distribution, this index does not have configured filterable attributes.", + "message": "Inside `.queries[0]`: Invalid facet distribution: Attribute `title` is not filterable. This index does not have configured filterable attributes.", "code": "invalid_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_facets" @@ -1149,13 +1025,13 @@ async fn federation_multiple_query_errors() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1166,7 +1042,7 @@ async fn federation_multiple_query_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not filterable. 
This index does not have configured filterable attributes.\n1:6 title = toto", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1181,13 +1057,13 @@ async fn federation_multiple_query_sort_errors() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1198,7 +1074,7 @@ async fn federation_multiple_query_sort_errors() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[0]`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1213,13 +1089,13 @@ async fn federation_multiple_query_errors_interleaved() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1231,7 +1107,7 @@ async fn federation_multiple_query_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not filterable. 
This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1246,13 +1122,13 @@ async fn federation_multiple_query_sort_errors_interleaved() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"queries": [ @@ -1264,7 +1140,7 @@ async fn federation_multiple_query_sort_errors_interleaved() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", + "message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.", "code": "invalid_search_sort", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_sort" @@ -1280,14 +1156,14 @@ async fn federation_filter() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -1348,7 +1224,7 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1363,7 +1239,7 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // two identical placeholder search should have all results from first query let (response, code) = server @@ -1391,13 +1267,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1412,13 +1281,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "cattos": [ "enigma" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1440,13 +1302,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1472,13 +1327,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "moumoute", "gomez" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { 
"indexUid": "nested", "queriesPosition": 0, @@ -1520,13 +1368,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1548,13 +1389,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1580,13 +1414,6 @@ async fn federation_sort_same_indexes_same_criterion_same_direction() { "moumoute", "gomez" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -1611,7 +1438,7 @@ async fn federation_sort_same_indexes_same_criterion_opposite_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1626,7 +1453,7 @@ async fn federation_sort_same_indexes_same_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // two identical placeholder search should have all results from first query let (response, code) = server @@ -1671,7 +1498,7 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1686,7 +1513,7 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // return mothers and fathers ordered accross fields. 
let (response, code) = server @@ -1714,13 +1541,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -1746,13 +1566,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "moumoute", "gomez" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -1767,13 +1580,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "cattos": [ "enigma" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1795,13 +1601,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -1843,13 +1642,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "simba", "pestifĆ©rĆ©" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1872,13 +1664,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { } ], "cattos": "pĆ©sti", - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 0, @@ -1904,13 +1689,6 @@ async fn federation_sort_same_indexes_different_criterion_same_direction() { "moumoute", "gomez" ], - "_vectors": { - "manual": [ - 10, - 23, - 32 - ] - }, "_federation": { "indexUid": "nested", "queriesPosition": 1, @@ -1935,7 +1713,7 @@ async fn federation_sort_same_indexes_different_criterion_opposite_direction() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -1950,7 +1728,7 @@ async fn federation_sort_same_indexes_different_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // two identical placeholder search should have all results from first query let (response, code) = server @@ -1995,7 +1773,7 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2010,13 +1788,13 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2031,7 +1809,7 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // return titles ordered accross indexes let (response, code) = server @@ -2101,13 +1879,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "yellow", 
"blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2122,13 +1893,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "yellow", "red" ], - "_vectors": { - "manual": [ - 10, - -23, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2143,13 +1907,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2164,13 +1921,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "green", "red" ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2185,13 +1935,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "green", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2226,13 +1969,6 @@ async fn federation_sort_different_indexes_same_criterion_same_direction() { "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 1, @@ -2307,7 +2043,7 @@ async fn federation_sort_different_ranking_rules() { let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2322,13 +2058,13 @@ async fn federation_sort_different_ranking_rules() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2343,7 +2079,7 @@ async fn federation_sort_different_ranking_rules() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // return titles ordered accross indexes let (response, code) = server @@ -2413,13 +2149,6 @@ async fn federation_sort_different_ranking_rules() { "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2434,13 +2163,6 @@ async fn federation_sort_different_ranking_rules() { "yellow", "red" ], - "_vectors": { - "manual": [ - 10, - -23, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2455,13 +2177,6 @@ async fn federation_sort_different_ranking_rules() { "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2476,13 +2191,6 @@ async fn federation_sort_different_ranking_rules() { "green", "red" ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2497,13 +2205,6 @@ async fn federation_sort_different_ranking_rules() { "green", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2546,7 +2247,7 @@ async fn federation_sort_different_indexes_same_criterion_opposite_direction() { let documents = DOCUMENTS.clone(); let (value, _) = 
index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2561,13 +2262,13 @@ async fn federation_sort_different_indexes_same_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2582,7 +2283,7 @@ async fn federation_sort_different_indexes_same_criterion_opposite_direction() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // all results from query 0 let (response, code) = server @@ -2628,7 +2329,7 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2643,13 +2344,13 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2664,7 +2365,7 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // return titles ordered accross indexes let (response, code) = server @@ -2714,13 +2415,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2755,13 +2449,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "yellow", "red" ], - "_vectors": { - "manual": [ - 10, - -23, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2776,13 +2463,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "blue", "red" ], - "_vectors": { - "manual": [ - -100, - 340, - 90 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2797,13 +2477,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "green", "red" ], - "_vectors": { - "manual": [ - -100, - 231, - 32 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2818,13 +2491,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "green", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 3 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 0, @@ -2879,13 +2545,6 @@ async fn federation_sort_different_indexes_different_criterion_same_direction() "yellow", "blue" ], - "_vectors": { - "manual": [ - 1, - 2, - 54 - ] - }, "_federation": { "indexUid": "movies", "queriesPosition": 1, @@ -2940,7 +2599,7 @@ async fn federation_sort_different_indexes_different_criterion_opposite_directio let 
documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2955,13 +2614,13 @@ async fn federation_sort_different_indexes_different_criterion_opposite_directio ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -2976,7 +2635,7 @@ async fn federation_sort_different_indexes_different_criterion_opposite_directio ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // all results from query 0 first let (response, code) = server @@ -3020,18 +2679,18 @@ async fn federation_limit_offset() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("score"); let documents = SCORE_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); { let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3338,18 +2997,18 @@ async fn federation_formatting() { let index = server.index("test"); let documents = DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("nested"); let documents = NESTED_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(1).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); let index = server.index("score"); let documents = SCORE_DOCUMENTS.clone(); - index.add_documents(documents, None).await; - index.wait_task(2).await; + let (task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); { let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3685,14 +3344,14 @@ async fn federation_invalid_weight() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3719,14 +3378,14 @@ async fn federation_null_weight() { let documents = FRUITS_DOCUMENTS.clone(); let (value, 
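Throughout these hunks, write operations now hand back the enqueued task, and the suite waits on that exact task and asserts it succeeded before issuing searches, instead of waiting on a hard-coded task uid and ignoring the outcome. A minimal sketch of the pattern, reusing the same `Server`/`Index` test helpers that appear above (return types are inferred from these hunks, not checked against the harness source):

    // add_documents returns the task summary and the HTTP status code of the enqueue call
    let (task, _status_code) = index.add_documents(documents, None).await;
    // wait for that specific task and panic unless it reached the `succeeded` status
    index.wait_task(task.uid()).await.succeeded();
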
_) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings( json!({"searchableAttributes": ["name"], "filterableAttributes": ["BOOST"]}), ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -3787,7 +3446,7 @@ async fn federation_federated_contains_pagination() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // fail when a federated query contains "limit" let (response, code) = server @@ -3867,11 +3526,11 @@ async fn federation_federated_contains_facets() { ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // empty facets are actually OK let (response, code) = server @@ -3951,7 +3610,7 @@ async fn federation_non_faceted_for_an_index() { ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("fruits-no-name"); @@ -3961,17 +3620,17 @@ async fn federation_non_faceted_for_an_index() { ) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("fruits-no-facets"); let (value, _) = index.update_settings(json!({"searchableAttributes": ["name"]})).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // fails let (response, code) = server @@ -3988,7 +3647,7 @@ async fn federation_non_faceted_for_an_index() { snapshot!(code, @"400 Bad Request"); insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r###" { - "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attributes are `BOOST, id`.\n - Note: index `fruits-no-name` used in `.queries[1]`", + "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution: Attribute `name` is not filterable. Available filterable attributes patterns are: `BOOST, id`.\n - Note: index `fruits-no-name` used in `.queries[1]`", "code": "invalid_multi_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_multi_search_facets" @@ -4010,7 +3669,7 @@ async fn federation_non_faceted_for_an_index() { snapshot!(code, @"400 Bad Request"); insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r###" { - "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution, attribute `name` is not filterable. The available filterable attributes are `BOOST, id`.\n - Note: index `fruits-no-name` is not used in queries", + "message": "Inside `.federation.facetsByIndex.fruits-no-name`: Invalid facet distribution: Attribute `name` is not filterable. 
Available filterable attributes patterns are: `BOOST, id`.\n - Note: index `fruits-no-name` is not used in queries", "code": "invalid_multi_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_multi_search_facets" @@ -4031,14 +3690,14 @@ async fn federation_non_faceted_for_an_index() { ]})) .await; snapshot!(code, @"400 Bad Request"); - insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r###" + insta::assert_json_snapshot!(response, { ".processingTimeMs" => "[time]" }, @r#" { - "message": "Inside `.federation.facetsByIndex.fruits-no-facets`: Invalid facet distribution, this index does not have configured filterable attributes.\n - Note: index `fruits-no-facets` is not used in queries", + "message": "Inside `.federation.facetsByIndex.fruits-no-facets`: Invalid facet distribution: Attributes `BOOST, id` are not filterable. This index does not have configured filterable attributes.\n - Note: index `fruits-no-facets` is not used in queries", "code": "invalid_multi_search_facets", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_multi_search_facets" } - "###); + "#); // also fails let (response, code) = server @@ -4071,7 +3730,7 @@ async fn federation_non_federated_contains_federation_option() { let documents = FRUITS_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // fail when a non-federated query contains "federationOptions" let (response, code) = server @@ -4094,13 +3753,6 @@ async fn federation_non_federated_contains_federation_option() { #[actix_rt::test] async fn federation_vector_single_index() { let server = Server::new().await; - let (_, code) = server - .set_features(json!({ - "vectorStore": true - })) - .await; - - snapshot!(code, @"200 OK"); let index = server.index("vectors"); @@ -4116,12 +3768,12 @@ async fn federation_vector_single_index() { } }})) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // same embedder let (response, code) = server @@ -4302,13 +3954,6 @@ async fn federation_vector_single_index() { #[actix_rt::test] async fn federation_vector_two_indexes() { let server = Server::new().await; - let (_, code) = server - .set_features(json!({ - "vectorStore": true - })) - .await; - - snapshot!(code, @"200 OK"); let index = server.index("vectors-animal"); @@ -4320,12 +3965,12 @@ async fn federation_vector_two_indexes() { }, }})) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("vectors-sentiment"); @@ -4337,12 +3982,12 @@ async fn federation_vector_two_indexes() { } }})) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let documents = VECTOR_DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let 
(response, code) = server .multi_search(json!({"federation": {}, "queries": [ @@ -4802,7 +4447,7 @@ async fn federation_facets_different_indexes_same_facet() { let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -4818,13 +4463,13 @@ async fn federation_facets_different_indexes_same_facet() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -4840,13 +4485,13 @@ async fn federation_facets_different_indexes_same_facet() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman-2"); let documents = SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -4862,7 +4507,7 @@ async fn federation_facets_different_indexes_same_facet() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // return titles ordered accross indexes let (response, code) = server @@ -5369,7 +5014,7 @@ async fn federation_facets_same_indexes() { let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -5384,13 +5029,13 @@ async fn federation_facets_same_indexes() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("doggos-2"); let documents = NESTED_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -5405,7 +5050,7 @@ async fn federation_facets_same_indexes() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (response, code) = server .multi_search(json!({"federation": { @@ -5670,7 +5315,7 @@ async fn federation_inconsistent_merge_order() { let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -5686,13 +5331,13 @@ async fn federation_inconsistent_merge_order() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("movies-2"); let documents = DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -5711,13 +5356,13 @@ async fn federation_inconsistent_merge_order() { } })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let index = server.index("batman"); let documents = 
SCORE_DOCUMENTS.clone(); let (value, _) = index.add_documents(documents, None).await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (value, _) = index .update_settings(json!({ @@ -5733,7 +5378,7 @@ async fn federation_inconsistent_merge_order() { ] })) .await; - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // without merging, it works let (response, code) = server diff --git a/crates/meilisearch/tests/search/multi/proxy.rs b/crates/meilisearch/tests/search/multi/proxy.rs new file mode 100644 index 000000000..d267ee153 --- /dev/null +++ b/crates/meilisearch/tests/search/multi/proxy.rs @@ -0,0 +1,2591 @@ +use std::sync::Arc; + +use actix_http::StatusCode; +use meili_snap::{json_string, snapshot}; +use wiremock::matchers::AnyMatcher; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use crate::common::{Server, Value, SCORE_DOCUMENTS}; +use crate::json; + +#[actix_rt::test] +async fn error_feature() { + let server = Server::new().await; + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "remote": "toto" + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Performing a remote federated search requires enabling the `network` experimental feature. See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "queryPosition": 42, + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Using `federationOptions.queryPosition` requires enabling the `network` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/805", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} + +#[actix_rt::test] +async fn error_params() { + let server = Server::new().await; + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "remote": 42 + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.queries[0].federationOptions.remote`: expected a string, but found a positive integer: `42`", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); + + let (response, code) = server + .multi_search(json!({ + "federation": {}, + "queries": [ + { + "indexUid": "test", + "federationOptions": { + "queryPosition": "toto", + } + } + ]})) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.queries[0].federationOptions.queryPosition`: expected a positive integer, but found a string: `\"toto\"`", + "code": "invalid_multi_search_query_position", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_query_position" + } + "###); +} + +#[actix_rt::test] +async fn remote_sharding() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + let ms2 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms2.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + let (response, code) = ms2.set_network(json!({"self": "ms2"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms2", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let index2 = ms2.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index2.add_documents(json!(documents[3..5]), None).await; + index2.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + let ms2 = Arc::new(ms2); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + let rms2 
= LocalMeili::new(ms2.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + "ms2": { + "url": rms2.url() + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms2.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); + let 
(response, _status_code) = ms2.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Badman", + "id": "E", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.5, + "remote": "ms2" + } + }, + { + "title": "Batman", + "id": "D", + "_federation": { + "indexUid": "test", + "queriesPosition": 2, + "weightedRankingScore": 0.23106060606060605, + "remote": "ms2" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 5, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_unregistered_remote() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + 
"federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "message": "Invalid `queries[2].federation_options.remote`: remote `ms2` is not registered", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "message": "Invalid `queries[2].federation_options.remote`: remote `ms2` is not registered", + "code": "invalid_multi_search_remote", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_multi_search_remote" + } + "###); +} + +#[actix_rt::test] +async fn error_no_weighted_score() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { gobble_headers: true, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, 
+ "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote hit does not contain `._federation.weightedScoreValues`\n - hint: check that the remote instance is a Meilisearch instance running the same version", + "code": "remote_bad_response", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_bad_response" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_response() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { + override_response_body: Some("Returning an HTML page".into()), + ..Default::default() + }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } 
+ } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "could not parse response from the remote host as a federated search response:\n - response from remote: Returning an HTML page\n - hint: check that the remote instance is a Meilisearch instance running the same version", + "code": "remote_bad_response", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_bad_response" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "nottest", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - response from remote: {\"message\":\"Inside `.queries[1]`: Index `nottest` not 
found.\",\"code\":\"index_not_found\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#index_not_found\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request_facets_by_index() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test0"); + let index1 = ms1.index("test1"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": { + "facetsByIndex": { + "test0": [] + } + }, + "queries": [ + { + "q": query, + "indexUid": "test0", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test1", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test0", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test0", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "facetsByIndex": { + "test0": { + "distribution": {}, + "stats": {} + } + }, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - 
response from remote: {\"message\":\"Inside `.federation.facetsByIndex.test0`: Index `test0` not found.\\n - Note: index `test0` is not used in queries\",\"code\":\"index_not_found\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#index_not_found\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_bad_request_facets_by_index_facet() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index0.update_settings_filterable_attributes(json!(["id"])).await; + index0.wait_task(task.uid()).await.succeeded(); + + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": { + "facetsByIndex": { + "test": ["id"] + } + }, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 
0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "facetsByIndex": { + "test": { + "distribution": { + "id": { + "A": 1, + "B": 1 + } + }, + "stats": {} + } + }, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 400:\n - response from remote: {\"message\":\"Inside `.federation.facetsByIndex.test`: Invalid facet distribution: Attribute `id` is not filterable. This index does not have configured filterable attributes.\\n - Note: index `test` used in `.queries[1]`\",\"code\":\"invalid_multi_search_facets\",\"type\":\"invalid_request\",\"link\":\"https://docs.meilisearch.com/errors#invalid_multi_search_facets\"}\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); +} + +#[actix_rt::test] +async fn error_remote_does_not_answer() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + "ms2": { + "url": "https://thiswebsitedoesnotexist.example" + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms2" + } + }, + ] + }); + + let (response, _status_code) = 
ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": { + "ms2": { + "message": "error sending request", + "code": "remote_could_not_send_request", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_could_not_send_request" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r#" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": { + "ms2": { + "message": "error sending request", + "code": "remote_could_not_send_request", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_could_not_send_request" + } + } + } + "#); +} + +#[actix_rt::test] +async fn error_remote_404() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap 
servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": format!("{}/this-route-does-not-exists/", rms1.url()) + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 404:\n - response from remote: null\n - hint: check that the remote instance has the correct index configuration for that request\n - hint: check that the `network` experimental feature is enabled on the remote instance", + "code": "remote_bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#remote_bad_request" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_sharding_auth() { + let ms0 = Server::new().await; + let mut ms1 = Server::new_auth().await; + ms1.use_api_key("MASTER_KEY"); + + let (search_api_key_not_enough_indexes, code) = ms1 + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["nottest"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key_not_enough_indexes = search_api_key_not_enough_indexes["key"].clone(); + + let (api_key_not_search, 
code) = ms1 + .add_api_key(json!({ + "actions": ["documents.*"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let api_key_not_search = api_key_not_search["key"].clone(); + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + ms1.clear_api_key(); + + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1-nottest": { + "url": rms1.url(), + "searchApiKey": search_api_key_not_enough_indexes + }, + "ms1-notsearch": { + "url": rms1.url(), + "searchApiKey": api_key_not_search + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-nottest" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-notsearch" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1-notsearch": { + "message": "could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action", + "code": "remote_invalid_api_key", + "type": "auth", + "link": 
"https://docs.meilisearch.com/errors#remote_invalid_api_key" + }, + "ms1-nottest": { + "message": "could not authenticate against the remote host\n - hint: check that the remote instance was registered with a valid API key having the `search` action", + "code": "remote_invalid_api_key", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#remote_invalid_api_key" + } + } + } + "###); +} + +#[actix_rt::test] +async fn remote_sharding_auth() { + let ms0 = Server::new().await; + let mut ms1 = Server::new_auth().await; + ms1.use_api_key("MASTER_KEY"); + + let (search_api_key, code) = ms1 + .add_api_key(json!({ + "actions": ["search"], + "indexes": ["*"], + "expiresAt": serde_json::Value::Null + })) + .await; + meili_snap::snapshot!(code, @"201 Created"); + let search_api_key = search_api_key["key"].clone(); + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + ms1.clear_api_key(); + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::new(ms1.clone()).await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url(), + "searchApiKey": "MASTER_KEY" + }, + "ms1-alias": { + "url": rms1.url(), + "searchApiKey": search_api_key + } + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1-alias" + } + }, + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + 
"queriesPosition": 2, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1-alias" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_500() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { fails: FailurePolicy::Always, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + } + ] + }); + + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 
0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2, + "remoteErrors": { + "ms1": { + "message": "remote host responded with code 500:\n - response from remote: {\"error\":\"provoked error\",\"code\":\"test_error\",\"link\":\"https://docs.meilisearch.com/errors#test_error\"}", + "code": "remote_remote_error", + "type": "system", + "link": "https://docs.meilisearch.com/errors#remote_remote_error" + } + } + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + // the response if full because we queried the instance that works + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_500_once() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap servers + let ms0 = Arc::new(ms0); + let ms1 = Arc::new(ms1); + + let rms0 = LocalMeili::new(ms0.clone()).await; + let rms1 = LocalMeili::with_params( + ms1.clone(), + LocalMeiliParams { fails: FailurePolicy::Once, ..Default::default() }, + ) + .await; + + // set network + let network = json!({"remotes": { + "ms0": { + "url": rms0.url() + }, + "ms1": { + "url": rms1.url() + }, + }}); + + println!("{}", serde_json::to_string_pretty(&network).unwrap()); + + let (_response, status_code) = ms0.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + let (_response, status_code) = ms1.set_network(network.clone()).await; + snapshot!(status_code, @"200 OK"); + + // perform 
multi-search + let query = "badman returns"; + let request = json!({ + "federation": {}, + "queries": [ + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms0" + } + }, + { + "q": query, + "indexUid": "test", + "federationOptions": { + "remote": "ms1" + } + } + ] + }); + + // Meilisearch is tolerant to a single failure + let (response, _status_code) = ms0.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); + let (response, _status_code) = ms1.multi_search(request.clone()).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" + { + "hits": [ + { + "title": "Batman Returns", + "id": "C", + "_federation": { + "indexUid": "test", + "queriesPosition": 1, + "weightedRankingScore": 0.8317901234567902, + "remote": "ms1" + } + }, + { + "title": "Batman the dark knight returns: Part 1", + "id": "A", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + }, + { + "title": "Batman the dark knight returns: Part 2", + "id": "B", + "_federation": { + "indexUid": "test", + "queriesPosition": 0, + "weightedRankingScore": 0.7028218694885362, + "remote": "ms0" + } + } + ], + "processingTimeMs": "[time]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3, + "remoteErrors": {} + } + "###); +} + +#[actix_rt::test] +async fn error_remote_timeout() { + let ms0 = Server::new().await; + let ms1 = Server::new().await; + + // enable feature + + let (response, code) = ms0.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + let (response, code) = ms1.set_features(json!({"network": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["network"]), @"true"); + + // set self + + let (response, code) = ms0.set_network(json!({"self": "ms0"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms0", + "remotes": {} + } + "###); + let (response, code) = ms1.set_network(json!({"self": "ms1"})).await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response), @r###" + { + "self": "ms1", + "remotes": {} + } + "###); + + // add documents + let documents = SCORE_DOCUMENTS.clone(); + let documents = documents.as_array().unwrap(); + let index0 = ms0.index("test"); + let index1 = ms1.index("test"); + let (task, _status_code) = index0.add_documents(json!(documents[0..2]), None).await; + index0.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index1.add_documents(json!(documents[2..3]), None).await; + index1.wait_task(task.uid()).await.succeeded(); + + // wrap 
servers
+    let ms0 = Arc::new(ms0);
+    let ms1 = Arc::new(ms1);
+
+    let rms0 = LocalMeili::new(ms0.clone()).await;
+    let rms1 = LocalMeili::with_params(
+        ms1.clone(),
+        LocalMeiliParams { delay: Some(std::time::Duration::from_secs(6)), ..Default::default() },
+    )
+    .await;
+
+    // set network
+    let network = json!({"remotes": {
+        "ms0": {
+            "url": rms0.url()
+        },
+        "ms1": {
+            "url": rms1.url()
+        },
+    }});
+
+    println!("{}", serde_json::to_string_pretty(&network).unwrap());
+
+    let (_response, status_code) = ms0.set_network(network.clone()).await;
+    snapshot!(status_code, @"200 OK");
+    let (_response, status_code) = ms1.set_network(network.clone()).await;
+    snapshot!(status_code, @"200 OK");
+
+    // perform multi-search
+    let query = "badman returns";
+    let request = json!({
+        "federation": {},
+        "queries": [
+            {
+                "q": query,
+                "indexUid": "test",
+                "federationOptions": {
+                    "remote": "ms0"
+                }
+            },
+            {
+                "q": query,
+                "indexUid": "test",
+                "federationOptions": {
+                    "remote": "ms1"
+                }
+            }
+        ]
+    });
+
+    let (response, status_code) = ms0.multi_search(request.clone()).await;
+    snapshot!(status_code, @"200 OK");
+    snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
+    {
+      "hits": [
+        {
+          "title": "Batman the dark knight returns: Part 1",
+          "id": "A",
+          "_federation": {
+            "indexUid": "test",
+            "queriesPosition": 0,
+            "weightedRankingScore": 0.7028218694885362,
+            "remote": "ms0"
+          }
+        },
+        {
+          "title": "Batman the dark knight returns: Part 2",
+          "id": "B",
+          "_federation": {
+            "indexUid": "test",
+            "queriesPosition": 0,
+            "weightedRankingScore": 0.7028218694885362,
+            "remote": "ms0"
+          }
+        }
+      ],
+      "processingTimeMs": "[time]",
+      "limit": 20,
+      "offset": 0,
+      "estimatedTotalHits": 2,
+      "remoteErrors": {
+        "ms1": {
+          "message": "remote host did not answer before the deadline",
+          "code": "remote_timeout",
+          "type": "system",
+          "link": "https://docs.meilisearch.com/errors#remote_timeout"
+        }
+      }
+    }
+    "###);
+    let (response, status_code) = ms1.multi_search(request.clone()).await;
+    snapshot!(status_code, @"200 OK");
+    snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
+    {
+      "hits": [
+        {
+          "title": "Batman Returns",
+          "id": "C",
+          "_federation": {
+            "indexUid": "test",
+            "queriesPosition": 1,
+            "weightedRankingScore": 0.8317901234567902,
+            "remote": "ms1"
+          }
+        },
+        {
+          "title": "Batman the dark knight returns: Part 1",
+          "id": "A",
+          "_federation": {
+            "indexUid": "test",
+            "queriesPosition": 0,
+            "weightedRankingScore": 0.7028218694885362,
+            "remote": "ms0"
+          }
+        },
+        {
+          "title": "Batman the dark knight returns: Part 2",
+          "id": "B",
+          "_federation": {
+            "indexUid": "test",
+            "queriesPosition": 0,
+            "weightedRankingScore": 0.7028218694885362,
+            "remote": "ms0"
+          }
+        }
+      ],
+      "processingTimeMs": "[time]",
+      "limit": 20,
+      "offset": 0,
+      "estimatedTotalHits": 3,
+      "remoteErrors": {}
+    }
+    "###);
+}
+
+// test: try all the flattened structs in queries
+
+// working facet tests with and without merge
+
+#[derive(Default)]
+pub enum FailurePolicy {
+    #[default]
+    Never,
+    Once,
+    Always,
+}
+
+/// Parameters to change the behavior of the [`LocalMeili`] server.
+#[derive(Default)]
+pub struct LocalMeiliParams {
+    /// delay the response by the specified duration
+    pub delay: Option<std::time::Duration>,
+    pub fails: FailurePolicy,
+    /// replace the response body with the provided String
+    pub override_response_body: Option<String>,
+    pub gobble_headers: bool,
+}
+
+/// A proxy that uses a [`MockServer`] to expose a URL for a local [`Server`], so that tests can register it as a remote in the `network` settings.
+pub struct LocalMeili {
+    mock_server: MockServer,
+}
+
+impl LocalMeili {
+    pub async fn new(server: Arc<Server>) -> Self {
+        Self::with_params(server, Default::default()).await
+    }
+
+    pub async fn with_params(server: Arc<Server>, params: LocalMeiliParams) -> Self {
+        let mock_server = MockServer::start().await;
+
+        // tokio won't let us execute asynchronous code from a sync function inside of an async test,
+        // so instead we spawn another thread that will call the service on a brand new tokio runtime
+        // and communicate via channels...
+        let (request_sender, request_receiver) = crossbeam_channel::bounded::<wiremock::Request>(0);
+        let (response_sender, response_receiver) =
+            crossbeam_channel::bounded::<(Value, StatusCode)>(0);
+        std::thread::spawn(move || {
+            let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
+            while let Ok(req) = request_receiver.recv() {
+                let body = std::str::from_utf8(&req.body).unwrap();
+                let headers: Vec<(&str, &str)> = if params.gobble_headers {
+                    vec![("Content-Type", "application/json")]
+                } else {
+                    req.headers
+                        .iter()
+                        .map(|(name, value)| (name.as_str(), value.to_str().unwrap()))
+                        .collect()
+                };
+                let (value, code) = rt.block_on(async {
+                    match req.method.as_str() {
+                        "POST" => server.service.post_str(&req.url, body, headers.clone()).await,
+                        "PUT" => server.service.put_str(&req.url, body, headers).await,
+                        "PATCH" => server.service.patch(&req.url, req.body_json().unwrap()).await,
+                        "GET" => server.service.get(&req.url).await,
+                        "DELETE" => server.service.delete(&req.url).await,
+                        _ => unimplemented!(),
+                    }
+                });
+                if response_sender.send((value, code)).is_err() {
+                    break;
+                }
+            }
+            println!("exiting mock thread")
+        });
+
+        let failed_already = std::sync::atomic::AtomicBool::new(false);
+
+        Mock::given(AnyMatcher)
+            .respond_with(move |req: &wiremock::Request| {
+                if let Some(delay) = params.delay {
+                    std::thread::sleep(delay);
+                }
+                match params.fails {
+                    FailurePolicy::Never => {}
+                    FailurePolicy::Once => {
+                        // fetch_or returns the previous value, so only the very first request fails
+                        let failed_already =
+                            failed_already.fetch_or(true, std::sync::atomic::Ordering::AcqRel);
+                        if !failed_already {
+                            return fail(params.override_response_body.as_deref());
+                        }
+                    }
+                    FailurePolicy::Always => return fail(params.override_response_body.as_deref()),
+                }
+                request_sender.send(req.clone()).unwrap();
+                let (value, code) = response_receiver.recv().unwrap();
+                let response = ResponseTemplate::new(code.as_u16());
+                if let Some(override_response_body) = params.override_response_body.as_deref() {
+                    response.set_body_string(override_response_body)
+                } else {
+                    response.set_body_json(value)
+                }
+            })
+            .mount(&mock_server)
+            .await;
+        Self { mock_server }
+    }
+
+    pub fn url(&self) -> String {
+        self.mock_server.uri()
+    }
+}
+
+fn fail(override_response_body: Option<&str>) -> ResponseTemplate {
+    let response = ResponseTemplate::new(500);
+    if let Some(override_response_body) = override_response_body {
+        response.set_body_string(override_response_body)
+    } else {
+        response.set_body_json(json!({"error": "provoked error", "code": "test_error", "link": "https://docs.meilisearch.com/errors#test_error"}))
+    }
+}
diff --git a/crates/meilisearch/tests/search/restrict_searchable.rs b/crates/meilisearch/tests/search/restrict_searchable.rs
index ca659c518..ce99c4047 100644
--- a/crates/meilisearch/tests/search/restrict_searchable.rs
+++ b/crates/meilisearch/tests/search/restrict_searchable.rs
@@ -64,8 +64,8 @@ async fn search_no_searchable_attribute_set() {
         )
         .await;

-    index.update_settings_searchable_attributes(json!(["*"])).await;
-    index.wait_task(1).await;
+ 
let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -77,8 +77,8 @@ async fn search_no_searchable_attribute_set() { ) .await; - index.update_settings_searchable_attributes(json!(["*"])).await; - index.wait_task(2).await; + let (task, _status_code) = index.update_settings_searchable_attributes(json!(["*"])).await; + index.wait_task(task.uid()).await.succeeded(); index .search( @@ -108,8 +108,8 @@ async fn search_on_all_attributes() { async fn search_on_all_attributes_restricted_set() { let server = Server::new().await; let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; - index.update_settings_searchable_attributes(json!(["title"])).await; - index.wait_task(1).await; + let (task, _status_code) = index.update_settings_searchable_attributes(json!(["title"])).await; + index.wait_task(task.uid()).await.succeeded(); index .search(json!({"q": "Captain Marvel", "attributesToSearchOn": ["*"]}), |response, code| { @@ -191,8 +191,10 @@ async fn word_ranking_rule_order() { async fn word_ranking_rule_order_exact_words() { let server = Server::new().await; let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await; - index.update_settings_typo_tolerance(json!({"disableOnWords": ["Captain", "Marvel"]})).await; - index.wait_task(1).await; + let (task, _status_code) = index + .update_settings_typo_tolerance(json!({"disableOnWords": ["Captain", "Marvel"]})) + .await; + index.wait_task(task.uid()).await.succeeded(); // simple search should return 2 documents (ids: 2 and 3). index @@ -358,7 +360,7 @@ async fn search_on_exact_field() { let (response, code) = index.update_settings_typo_tolerance(json!({ "disableOnAttributes": ["exact"] })).await; assert_eq!(202, code, "{:?}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); // Searching on an exact attribute should only return the document matching without typo. 
index .search(json!({"q": "Marvel", "attributesToSearchOn": ["exact"]}), |response, code| { @@ -367,3 +369,50 @@ async fn search_on_exact_field() { }) .await; } + +#[actix_rt::test] +async fn phrase_search_on_title() { + let server = Server::new().await; + let documents = json!([ + { "id": 8, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 5, "desc": "Document Review", "title": "Document Review Attorney" }, + { "id": 4, "desc": "Document Review", "title": "Document Review Manager - Cyber Incident Response (Remote)" }, + { "id": 3, "desc": "Document Review", "title": "Document Review Paralegal" }, + { "id": 2, "desc": "Document Review", "title": "Document Controller (Saudi National)" }, + { "id": 1, "desc": "Document Review", "title": "Document Reviewer" }, + { "id": 7, "desc": "Document Review", "title": "Document Review Specialist II" }, + { "id": 6, "desc": "Document Review", "title": "Document Review (Entry Level)" } + ]); + let index = index_with_documents(&server, &documents).await; + + index + .search( + json!({"q": "\"Document Review\"", "attributesToSearchOn": ["title"], "attributesToRetrieve": ["title"]}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review Attorney" + }, + { + "title": "Document Review Manager - Cyber Incident Response (Remote)" + }, + { + "title": "Document Review Paralegal" + }, + { + "title": "Document Review Specialist II" + }, + { + "title": "Document Review (Entry Level)" + } + ] + "###); + }, + ) + .await; +} diff --git a/crates/meilisearch/tests/settings/distinct.rs b/crates/meilisearch/tests/settings/distinct.rs index 42c5d38bd..2c5b7517f 100644 --- a/crates/meilisearch/tests/settings/distinct.rs +++ b/crates/meilisearch/tests/settings/distinct.rs @@ -6,16 +6,16 @@ async fn set_and_reset_distinct_attribute() { let server = Server::new().await; let index = server.index("test"); - let (_response, _code) = index.update_settings(json!({ "distinctAttribute": "test"})).await; - index.wait_task(0).await; + let (task1, _code) = index.update_settings(json!({ "distinctAttribute": "test"})).await; + index.wait_task(task1.uid()).await.succeeded(); let (response, _) = index.settings().await; assert_eq!(response["distinctAttribute"], "test"); - index.update_settings(json!({ "distinctAttribute": null })).await; + let (task2, _status_code) = index.update_settings(json!({ "distinctAttribute": null })).await; - index.wait_task(1).await; + index.wait_task(task2.uid()).await.succeeded(); let (response, _) = index.settings().await; @@ -27,16 +27,16 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let server = Server::new().await; let index = server.index("test"); - let (_response, _code) = index.update_distinct_attribute(json!("test")).await; - index.wait_task(0).await; + let (update_task1, _code) = index.update_distinct_attribute(json!("test")).await; + index.wait_task(update_task1.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; assert_eq!(response, "test"); - index.update_distinct_attribute(json!(null)).await; + let (update_task2, _status_code) = index.update_distinct_attribute(json!(null)).await; - index.wait_task(1).await; + index.wait_task(update_task2.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; diff --git a/crates/meilisearch/tests/settings/get_settings.rs 
b/crates/meilisearch/tests/settings/get_settings.rs
index 6de0db0b3..fbb97f999 100644
--- a/crates/meilisearch/tests/settings/get_settings.rs
+++ b/crates/meilisearch/tests/settings/get_settings.rs
@@ -1,44 +1,187 @@
-use std::collections::HashMap;
+use meili_snap::{json_string, snapshot};

-use once_cell::sync::Lazy;
-
-use crate::common::{Server, Value};
+use crate::common::Server;
 use crate::json;

-static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|| {
-    let mut map = HashMap::new();
-    map.insert("displayed_attributes", json!(["*"]));
-    map.insert("searchable_attributes", json!(["*"]));
-    map.insert("localized_attributes", json!(null));
-    map.insert("filterable_attributes", json!([]));
-    map.insert("distinct_attribute", json!(null));
-    map.insert(
-        "ranking_rules",
-        json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]),
-    );
-    map.insert("stop_words", json!([]));
-    map.insert("non_separator_tokens", json!([]));
-    map.insert("separator_tokens", json!([]));
-    map.insert("dictionary", json!([]));
-    map.insert("synonyms", json!({}));
-    map.insert(
-        "faceting",
-        json!({
-            "maxValuesPerFacet": json!(100),
-            "sortFacetValuesBy": {
-                "*": "alpha"
+macro_rules! test_setting_routes {
+    ($({setting: $setting:ident, update_verb: $update_verb:ident, default_value: $default_value:tt},) *) => {
+        $(
+            mod $setting {
+                use crate::common::Server;
+
+                #[actix_rt::test]
+                async fn get_unexisting_index() {
+                    let server = Server::new().await;
+                    let url = format!("/indexes/test/settings/{}",
+                        stringify!($setting)
+                        .chars()
+                        .map(|c| if c == '_' { '-' } else { c })
+                        .collect::<String>());
+                    let (_response, code) = server.service.get(url).await;
+                    assert_eq!(code, 404);
+                }
+
+                #[actix_rt::test]
+                async fn update_unexisting_index() {
+                    let server = Server::new().await;
+                    let url = format!("/indexes/test/settings/{}",
+                        stringify!($setting)
+                        .chars()
+                        .map(|c| if c == '_' { '-' } else { c })
+                        .collect::<String>());
+                    let (response, code) = server.service.$update_verb(url, serde_json::Value::Null.into()).await;
+                    assert_eq!(code, 202, "{}", response);
+                    server.index("").wait_task(0).await;
+                    let (response, code) = server.index("test").get().await;
+                    assert_eq!(code, 200, "{}", response);
+                }
+
+                #[actix_rt::test]
+                async fn delete_unexisting_index() {
+                    let server = Server::new().await;
+                    let url = format!("/indexes/test/settings/{}",
+                        stringify!($setting)
+                        .chars()
+                        .map(|c| if c == '_' { '-' } else { c })
+                        .collect::<String>());
+                    let (_, code) = server.service.delete(url).await;
+                    assert_eq!(code, 202);
+                    let response = server.index("").wait_task(0).await;
+                    assert_eq!(response["status"], "failed");
+                }
+
+                #[actix_rt::test]
+                async fn get_default() {
+                    let server = Server::new().await;
+                    let index = server.index("test");
+                    let (response, code) = index.create(None).await;
+                    assert_eq!(code, 202, "{}", response);
+                    index.wait_task(0).await;
+                    let url = format!("/indexes/test/settings/{}",
+                        stringify!($setting)
+                        .chars()
+                        .map(|c| if c == '_' { '-' } else { c })
+                        .collect::<String>());
+                    let (response, code) = server.service.get(url).await;
+                    assert_eq!(code, 200, "{}", response);
+                    let expected = crate::json!($default_value);
+                    assert_eq!(expected, response);
+                }
+            }
-        }),
-    );
-    map.insert(
-        "pagination",
-        json!({
-            "maxTotalHits": json!(1000),
-        }),
-    );
-    map.insert("search_cutoff_ms", json!(null));
-    map
-});
+        )*
+
+        #[actix_rt::test]
+        async fn all_setting_tested() {
+            let expected = std::collections::BTreeSet::from_iter(meilisearch::routes::indexes::settings::ALL_SETTINGS_NAMES.iter());
+            let tested 
= std::collections::BTreeSet::from_iter([$(stringify!($setting)),*].iter()); + let diff: Vec<_> = expected.difference(&tested).collect(); + assert!(diff.is_empty(), "Not all settings were tested, please add the following settings to the `test_setting_routes!` macro: {:?}", diff); + } + }; +} + +test_setting_routes!( + { + setting: filterable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: displayed_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: localized_attributes, + update_verb: put, + default_value: null + }, + { + setting: searchable_attributes, + update_verb: put, + default_value: ["*"] + }, + { + setting: distinct_attribute, + update_verb: put, + default_value: null + }, + { + setting: stop_words, + update_verb: put, + default_value: [] + }, + { + setting: separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: non_separator_tokens, + update_verb: put, + default_value: [] + }, + { + setting: dictionary, + update_verb: put, + default_value: [] + }, + { + setting: ranking_rules, + update_verb: put, + default_value: ["words", "typo", "proximity", "attribute", "sort", "exactness"] + }, + { + setting: synonyms, + update_verb: put, + default_value: {} + }, + { + setting: pagination, + update_verb: patch, + default_value: {"maxTotalHits": 1000} + }, + { + setting: faceting, + update_verb: patch, + default_value: {"maxValuesPerFacet": 100, "sortFacetValuesBy": {"*": "alpha"}} + }, + { + setting: search_cutoff_ms, + update_verb: put, + default_value: null + }, + { + setting: embedders, + update_verb: patch, + default_value: {} + }, + { + setting: facet_search, + update_verb: put, + default_value: true + }, + { + setting: prefix_search, + update_verb: put, + default_value: "indexingTime" + }, + { + setting: proximity_precision, + update_verb: put, + default_value: "byWord" + }, + { + setting: sortable_attributes, + update_verb: put, + default_value: [] + }, + { + setting: typo_tolerance, + update_verb: patch, + default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} + }, +); #[actix_rt::test] async fn get_settings_unexisting_index() { @@ -52,11 +195,11 @@ async fn get_settings() { let server = Server::new().await; let index = server.index("test"); let (response, _code) = index.create(None).await; - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); let settings = response.as_object().unwrap(); - assert_eq!(settings.keys().len(), 17); + assert_eq!(settings.keys().len(), 20); assert_eq!(settings["displayedAttributes"], json!(["*"])); assert_eq!(settings["searchableAttributes"], json!(["*"])); assert_eq!(settings["filterableAttributes"], json!([])); @@ -87,27 +230,18 @@ async fn get_settings() { ); assert_eq!(settings["proximityPrecision"], json!("byWord")); assert_eq!(settings["searchCutoffMs"], json!(null)); + assert_eq!(settings["prefixSearch"], json!("indexingTime")); + assert_eq!(settings["facetSearch"], json!(true)); + assert_eq!(settings["embedders"], json!({})); } #[actix_rt::test] async fn secrets_are_hidden_in_settings() { let server = Server::new().await; - let (response, code) = server.set_features(json!({"vectorStore": true})).await; - - meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - 
"editDocumentsByFunction": false, - "containsFilter": false - } - "###); let index = server.index("test"); let (response, _code) = index.create(None).await; - index.wait_task(response.uid()).await; + index.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -142,7 +276,7 @@ async fn secrets_are_hidden_in_settings() { let (response, code) = index.settings().await; meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + meili_snap::snapshot!(meili_snap::json_string!(response), @r#" { "displayedAttributes": [ "*" @@ -199,13 +333,15 @@ async fn secrets_are_hidden_in_settings() { } }, "searchCutoffMs": null, - "localizedAttributes": null + "localizedAttributes": null, + "facetSearch": true, + "prefixSearch": "indexingTime" } - "###); + "#); let (response, code) = server.get_task(settings_update_uid).await; meili_snap::snapshot!(code, @"200 OK"); - meili_snap::snapshot!(meili_snap::json_string!(response["details"]), @r###" + meili_snap::snapshot!(meili_snap::json_string!(response["details"]), @r#" { "embedders": { "default": { @@ -218,7 +354,7 @@ async fn secrets_are_hidden_in_settings() { } } } - "###); + "#); } #[actix_rt::test] @@ -233,15 +369,15 @@ async fn error_update_settings_unknown_field() { async fn test_partial_update() { let server = Server::new().await; let index = server.index("test"); - let (_response, _code) = index.update_settings(json!({"displayedAttributes": ["foo"]})).await; - index.wait_task(0).await; + let (task, _code) = index.update_settings(json!({"displayedAttributes": ["foo"]})).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); assert_eq!(response["displayedAttributes"], json!(["foo"])); assert_eq!(response["searchableAttributes"], json!(["*"])); - let (_response, _) = index.update_settings(json!({"searchableAttributes": ["bar"]})).await; - index.wait_task(1).await; + let (task, _) = index.update_settings(json!({"searchableAttributes": ["bar"]})).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); @@ -253,10 +389,10 @@ async fn test_partial_update() { async fn error_delete_settings_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - let (_response, code) = index.delete_settings().await; + let (task, code) = index.delete_settings().await; assert_eq!(code, 202); - let response = index.wait_task(0).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); } @@ -277,12 +413,12 @@ async fn reset_all_settings() { let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); assert_eq!(response["taskUid"], 0); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); - index + let (update_task,_status_code) = index .update_settings(json!({"displayedAttributes": ["name", "age"], "searchableAttributes": ["name"], "stopWords": ["the"], "filterableAttributes": ["age"], "synonyms": {"puppy": ["dog", "doggo", "potat"] }})) .await; - index.wait_task(1).await; + index.wait_task(update_task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); assert_eq!(response["displayedAttributes"], json!(["name", "age"])); @@ -291,8 +427,8 @@ async fn reset_all_settings() { assert_eq!(response["synonyms"], json!({"puppy": ["dog", "doggo", "potat"] })); 
assert_eq!(response["filterableAttributes"], json!(["age"])); - index.delete_settings().await; - index.wait_task(2).await; + let (delete_task, _status_code) = index.delete_settings().await; + index.wait_task(delete_task.uid()).await.succeeded(); let (response, code) = index.settings().await; assert_eq!(code, 200); @@ -311,14 +447,14 @@ async fn reset_all_settings() { async fn update_setting_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - let (_response, code) = index.update_settings(json!({})).await; + let (task, code) = index.update_settings(json!({})).await; assert_eq!(code, 202); - let response = index.wait_task(0).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); let (_response, code) = index.get().await; assert_eq!(code, 200); - index.delete_settings().await; - let response = index.wait_task(1).await; + let (task, _status_code) = index.delete_settings().await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "succeeded"); } @@ -338,93 +474,6 @@ async fn error_update_setting_unexisting_index_invalid_uid() { "###); } -macro_rules! test_setting_routes { - ($($setting:ident $write_method:ident), *) => { - $( - mod $setting { - use crate::common::Server; - use super::DEFAULT_SETTINGS_VALUES; - - #[actix_rt::test] - async fn get_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_response, code) = server.service.get(url).await; - assert_eq!(code, 404); - } - - #[actix_rt::test] - async fn update_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.$write_method(url, serde_json::Value::Null.into()).await; - assert_eq!(code, 202, "{}", response); - server.index("").wait_task(0).await; - let (response, code) = server.index("test").get().await; - assert_eq!(code, 200, "{}", response); - } - - #[actix_rt::test] - async fn delete_unexisting_index() { - let server = Server::new().await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (_, code) = server.service.delete(url).await; - assert_eq!(code, 202); - let response = server.index("").wait_task(0).await; - assert_eq!(response["status"], "failed"); - } - - #[actix_rt::test] - async fn get_default() { - let server = Server::new().await; - let index = server.index("test"); - let (response, code) = index.create(None).await; - assert_eq!(code, 202, "{}", response); - index.wait_task(0).await; - let url = format!("/indexes/test/settings/{}", - stringify!($setting) - .chars() - .map(|c| if c == '_' { '-' } else { c }) - .collect::()); - let (response, code) = server.service.get(url).await; - assert_eq!(code, 200, "{}", response); - let expected = DEFAULT_SETTINGS_VALUES.get(stringify!($setting)).unwrap(); - assert_eq!(expected, &response); - } - } - )* - }; -} - -test_setting_routes!( - filterable_attributes put, - displayed_attributes put, - localized_attributes put, - searchable_attributes put, - distinct_attribute put, - stop_words put, - separator_tokens put, - non_separator_tokens put, - dictionary put, - ranking_rules put, - synonyms put, - pagination patch, - faceting patch, - 
search_cutoff_ms put -); - #[actix_rt::test] async fn error_set_invalid_ranking_rules() { let server = Server::new().await; @@ -448,18 +497,142 @@ async fn set_and_reset_distinct_attribute_with_dedicated_route() { let server = Server::new().await; let index = server.index("test"); - let (_response, _code) = index.update_distinct_attribute(json!("test")).await; - index.wait_task(0).await; + let (task, _code) = index.update_distinct_attribute(json!("test")).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; assert_eq!(response, "test"); - index.update_distinct_attribute(json!(null)).await; + let (task, _status_code) = index.update_distinct_attribute(json!(null)).await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index.get_distinct_attribute().await; assert_eq!(response, json!(null)); } + +#[actix_rt::test] +async fn granular_filterable_attributes() { + let server = Server::new().await; + let index = server.index("test"); + index.create(None).await; + + let (response, code) = + index.update_settings(json!({ "filterableAttributes": [ + { "attributePatterns": ["name"], "features": { "facetSearch": true, "filter": {"equality": true, "comparison": false} } }, + { "attributePatterns": ["age"], "features": { "facetSearch": false, "filter": {"equality": true, "comparison": true} } }, + { "attributePatterns": ["id"] }, + { "attributePatterns": ["default-filterable-features-null"], "features": { "facetSearch": true } }, + { "attributePatterns": ["default-filterable-features-equality"], "features": { "facetSearch": true, "filter": {"comparison": true} } }, + { "attributePatterns": ["default-filterable-features-comparison"], "features": { "facetSearch": true, "filter": {"equality": true} } }, + { "attributePatterns": ["default-filterable-features-empty"], "features": { "facetSearch": true, "filter": {} } }, + { "attributePatterns": ["default-facet-search"], "features": { "filter": {"equality": true, "comparison": true} } }, + ] })).await; + assert_eq!(code, 202); + index.wait_task(response.uid()).await.succeeded(); + + let (response, code) = index.settings().await; + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["filterableAttributes"]), @r###" + [ + { + "attributePatterns": [ + "name" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "age" + ], + "features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": true + } + } + }, + { + "attributePatterns": [ + "id" + ], + "features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-null" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-equality" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": true + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-comparison" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-filterable-features-empty" + ], + "features": { + "facetSearch": true, + "filter": { + "equality": true, + "comparison": false + } + } + }, + { + "attributePatterns": [ + "default-facet-search" + ], + 
"features": { + "facetSearch": false, + "filter": { + "equality": true, + "comparison": true + } + } + } + ] + "###); +} diff --git a/crates/meilisearch/tests/settings/mod.rs b/crates/meilisearch/tests/settings/mod.rs index ccb4139e6..6b61e6be0 100644 --- a/crates/meilisearch/tests/settings/mod.rs +++ b/crates/meilisearch/tests/settings/mod.rs @@ -1,5 +1,7 @@ mod distinct; mod errors; mod get_settings; +mod prefix_search_settings; mod proximity_settings; mod tokenizer_customization; +mod vectors; diff --git a/crates/meilisearch/tests/settings/prefix_search_settings.rs b/crates/meilisearch/tests/settings/prefix_search_settings.rs new file mode 100644 index 000000000..5da758a7d --- /dev/null +++ b/crates/meilisearch/tests/settings/prefix_search_settings.rs @@ -0,0 +1,458 @@ +use meili_snap::{json_string, snapshot}; +use once_cell::sync::Lazy; + +use crate::common::Server; +use crate::json; + +static DOCUMENTS: Lazy = Lazy::new(|| { + json!([ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + }, + ]) +}); + +#[actix_rt::test] +async fn add_docs_and_disable() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + // only 1 document should match + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_and_add_docs() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + // only 1 document should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, 
@"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_enable() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "indexingTime", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn disable_add_docs_and_reset() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": "disabled", + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "prefixSearch": serde_json::Value::Null, + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(2).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": 
"manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} + +#[actix_rt::test] +async fn default_behavior() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "rankingRules": ["words", "typo", "proximity"], + })) + .await; + assert_eq!("202", code.as_str(), "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, _code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(response.uid()).await; + + // all documents should match + index + .search(json!({"q": "so", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; + + index + .search(json!({"q": "manythe", "attributesToHighlight": ["a", "b"]}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 1, + "a": "Soup of the day", + "b": "manythefishou", + "_formatted": { + "id": "1", + "a": "Soup of the day", + "b": "manythefishou" + } + }, + { + "id": 2, + "a": "Soup of day so", + "b": "manythe manythelazyfish", + "_formatted": { + "id": "2", + "a": "Soup of day so", + "b": "manythe manythelazyfish" + } + }, + { + "id": 3, + "a": "the Soup of day", + "b": "manythelazyfish", + "_formatted": { + "id": "3", + "a": "the Soup of day", + "b": "manythelazyfish" + } + } + ] + "###); + }) + .await; +} diff --git a/crates/meilisearch/tests/settings/proximity_settings.rs b/crates/meilisearch/tests/settings/proximity_settings.rs index 8b206ded4..c5897bc51 100644 --- a/crates/meilisearch/tests/settings/proximity_settings.rs +++ b/crates/meilisearch/tests/settings/proximity_settings.rs @@ -29,8 +29,8 @@ async fn attribute_scale_search() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; + 
index.wait_task(task.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -39,7 +39,7 @@ async fn attribute_scale_search() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); // the expected order is [1, 3, 2] instead of [3, 1, 2] // because the attribute scale doesn't make the difference between 1 and 3. @@ -102,16 +102,16 @@ async fn attribute_scale_phrase_search() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(task.uid()).await.succeeded(); - let (_response, _code) = index + let (task, _code) = index .update_settings(json!({ "proximityPrecision": "byAttribute", "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); // the expected order is [1, 3] instead of [3, 1] // because the attribute scale doesn't make the difference between 1 and 3. @@ -170,25 +170,25 @@ async fn word_scale_set_and_reset() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(task.uid()).await.succeeded(); // Set and reset the setting ensuring the swap between the 2 settings is applied. - let (_response, _code) = index + let (update_task1, _code) = index .update_settings(json!({ "proximityPrecision": "byAttribute", "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(1).await; + index.wait_task(update_task1.uid()).await.succeeded(); - let (_response, _code) = index + let (update_task2, _code) = index .update_settings(json!({ "proximityPrecision": "byWord", "rankingRules": ["words", "typo", "proximity"], })) .await; - index.wait_task(2).await; + index.wait_task(update_task2.uid()).await.succeeded(); // [3, 1, 2] index @@ -285,8 +285,8 @@ async fn attribute_scale_default_ranking_rules() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(DOCUMENTS.clone(), None).await; - index.wait_task(0).await; + let (task, _status_code) = index.add_documents(DOCUMENTS.clone(), None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -294,7 +294,7 @@ async fn attribute_scale_default_ranking_rules() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); // the expected order is [3, 1, 2] index diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_result.snap new 
file mode 100644 index 000000000..3a9b5bfb8 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-apiKey-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `apiKey` unavailable for source `huggingFace`.\n - note: `apiKey` is available for sources: `openAi`, `ollama`, `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-task_result.snap new file mode 100644 index 000000000..8f0a4edfa --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-binaryQuantized-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "binaryQuantized": false + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_result.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_result.snap new file mode 100644 index 000000000..f5dc3b48f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-dimensions-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `dimensions` unavailable for source `huggingFace`.\n - note: `dimensions` is available for sources: `openAi`, `ollama`, `userProvided`, `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-task_result.snap new file mode 100644 index 000000000..757a7b89f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-model-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_result.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-task_result.snap new file mode 100644 index 000000000..12d199767 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-pooling-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "pooling": "forceMean" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-task_result.snap new file mode 100644 index 000000000..78d4c44cc --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/huggingFace-revision-task_result.snap @@ -0,0 +1,29 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e" + } + } + }, + "error": { + "message": "Index `test`: Error while generating embeddings: error: fetching file from HG_HUB failed:\n - request error: https://huggingface.co/BAAI/bge-base-en-v1.5/resolve/e4ce9877abf3edfe10b0d82785e83bdcb973e22e/config.json: status code 
404", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-task_result.snap new file mode 100644 index 000000000..ac3780eb1 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-apiKey-task_result.snap @@ -0,0 +1,26 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "apiKey": "XXX...", + "dimensions": 768 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-task_result.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-task_result.snap new file mode 100644 index 000000000..b9ae269bb --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-binaryQuantized-task_result.snap @@ -0,0 +1,26 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768, + "binaryQuantized": false + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-task_result.snap new file mode 100644 index 000000000..aef2ba2b0 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-dimensions-task_result.snap @@ -0,0 +1,25 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_result.snap new file mode 100644 index 
000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-task_result.snap new file mode 100644 index 000000000..aef2ba2b0 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-model-task_result.snap @@ -0,0 +1,25 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_result.snap new file mode 100644 index 000000000..110555f8b --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-pooling-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `pooling` unavailable for source `ollama`.\n - note: `pooling` is available for sources: `huggingFace`\n - note: available fields for source `ollama`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_result.snap new file mode 100644 index 000000000..a220caa82 --- /dev/null +++ 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/ollama-revision-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `revision` unavailable for source `ollama`.\n - note: `revision` is available for sources: `huggingFace`\n - note: available fields for source `ollama`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-task_result.snap new file mode 100644 index 000000000..0cca31fb7 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-apiKey-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "apiKey": "XXX..." 
+ } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-task_result.snap new file mode 100644 index 000000000..329e88cac --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-binaryQuantized-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "binaryQuantized": false + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-task_result.snap new file mode 
100644 index 000000000..b63a458ca --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-dimensions-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "dimensions": 768 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-task_result.snap new file mode 100644 index 000000000..daa87d395 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-model-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "model": "text-embedding-3-small" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_result.snap new file mode 100644 index 000000000..958b5184a --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-pooling-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + 
"message": "`.embedders.test`: Field `pooling` unavailable for source `openAi`.\n - note: `pooling` is available for sources: `huggingFace`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_result.snap new file mode 100644 index 000000000..acfdeac87 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/openAi-revision-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `revision` unavailable for source `openAi`.\n - note: `revision` is available for sources: `huggingFace`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-task_result.snap new file mode 100644 index 000000000..ed8a6b2ea --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-apiKey-task_result.snap @@ -0,0 +1,32 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": 
"succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "apiKey": "XXX...", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-task_result.snap new file mode 100644 index 000000000..12fd314f5 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-binaryQuantized-task_result.snap @@ -0,0 +1,32 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "binaryQuantized": false, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-task_result.snap new file mode 100644 index 000000000..4f1bbf136 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-dimensions-task_result.snap @@ -0,0 +1,31 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_result.snap new file mode 100644 index 000000000..8ac20a01c --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-model-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `model` unavailable for source `rest`.\n - note: `model` is available for sources: `openAi`, `huggingFace`, `ollama`\n - note: available fields for source `rest`: `source`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `request`, `response`, `headers`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_result.snap new file mode 100644 index 000000000..31a2a7d15 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-pooling-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `pooling` unavailable for source `rest`.\n - note: `pooling` is available for sources: `huggingFace`\n - note: available fields for source `rest`: `source`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `request`, `response`, `headers`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_result.snap new file mode 100644 index 000000000..d732ac50c --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/rest-revision-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `revision` unavailable for source `rest`.\n - note: `revision` is available for sources: `huggingFace`\n - note: available fields for source `rest`: `source`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `request`, `response`, `headers`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_result.snap new file mode 100644 index 000000000..e47bd1e7f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-apiKey-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `apiKey` unavailable for source `userProvided`.\n - note: `apiKey` is available for sources: `openAi`, `ollama`, `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_code.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-task_result.snap new file mode 100644 index 000000000..93102fbe5 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-binaryQuantized-task_result.snap @@ -0,0 +1,25 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "userProvided", + "dimensions": 768, + "binaryQuantized": false + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-task_result.snap new file mode 100644 index 000000000..e095014fd --- /dev/null +++ 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-dimensions-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "userProvided", + "dimensions": 768 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_result.snap new file mode 100644 index 000000000..acb26f215 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-model-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `model` unavailable for source `userProvided`.\n - note: `model` is available for sources: `openAi`, `huggingFace`, `ollama`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_result.snap new file mode 100644 index 000000000..466826779 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-pooling-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `pooling` unavailable for source `userProvided`.\n - note: `pooling` is available for sources: `huggingFace`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_code.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_result.snap new file mode 100644 index 000000000..821d9550d --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters/userProvided-revision-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `revision` unavailable for source `userProvided`.\n - note: `revision` is available for sources: `huggingFace`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-task_result.snap new file mode 100644 index 000000000..0c60b1c6e --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-distribution-task_result.snap @@ -0,0 +1,27 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "distribution": { + "mean": 0.4, + "sigma": 0.1 + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_code.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-task_result.snap new file mode 100644 index 000000000..b7f10fd11 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplate-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "documentTemplate": "toto" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-task_result.snap new file mode 100644 index 000000000..93401b927 --- 
/dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-documentTemplateMaxBytes-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "huggingFace", + "documentTemplateMaxBytes": 200 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_result.snap new file mode 100644 index 000000000..38f95e6cb --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-headers-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `headers` unavailable for source `huggingFace`.\n - note: `headers` is available for sources: `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_result.snap new file mode 100644 index 000000000..83fc14a3f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-request-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `request` unavailable for source `huggingFace`.\n - note: `request` is available for sources: `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_result.snap new file mode 100644 index 000000000..f4e2f4a6f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-response-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `response` unavailable for source `huggingFace`.\n - note: `response` is available for sources: `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_result.snap new file mode 100644 index 000000000..3f18f89bd --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/huggingFace-url-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `url` unavailable for source `huggingFace`.\n - note: `url` is available for sources: `openAi`, `ollama`, `rest`\n - note: available fields for source `huggingFace`: `source`, `model`, `revision`, `pooling`, `documentTemplate`, `documentTemplateMaxBytes`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git 
a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-task_result.snap new file mode 100644 index 000000000..5b0056604 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-distribution-task_result.snap @@ -0,0 +1,29 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768, + "distribution": { + "mean": 0.4, + "sigma": 0.1 + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-task_result.snap new file mode 100644 index 000000000..1b42db77b --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplate-task_result.snap @@ -0,0 +1,26 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768, + "documentTemplate": "toto" + } + } + }, 
+ "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-task_result.snap new file mode 100644 index 000000000..a2e8024a6 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-documentTemplateMaxBytes-task_result.snap @@ -0,0 +1,26 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768, + "documentTemplateMaxBytes": 200 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_result.snap new file mode 100644 index 000000000..600e8271d --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-headers-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `headers` unavailable for source `ollama`.\n - note: `headers` is available for sources: `rest`\n - note: available fields for source `ollama`: `source`, `model`, `apiKey`, `dimensions`, 
`documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_result.snap new file mode 100644 index 000000000..b257b474e --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-request-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `request` unavailable for source `ollama`.\n - note: `request` is available for sources: `rest`\n - note: available fields for source `ollama`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_result.snap new file mode 100644 index 000000000..de06524f1 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-response-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `response` unavailable for source `ollama`.\n - note: `response` is available for sources: `rest`\n - note: available fields for source `ollama`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-task_result.snap new file mode 100644 index 000000000..4eaf0ba2f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/ollama-url-task_result.snap @@ -0,0 +1,31 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "ollama", + "model": "all-minilm", + "dimensions": 768, + "url": "http://rest.example/" + } + } + }, + "error": { + "message": "Index `test`: Error while generating embeddings: user error: unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `http://rest.example/`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-task_result.snap new file mode 100644 index 000000000..eb6eaf59d --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-distribution-task_result.snap @@ -0,0 +1,27 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "distribution": { + "mean": 0.4, + "sigma": 0.1 + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-task_result.snap new file mode 100644 index 000000000..d1ad94953 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplate-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "documentTemplate": "toto" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-sending_result.snap @@ -0,0 +1,10 @@ 
+--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-task_result.snap new file mode 100644 index 000000000..dca04b8c2 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-documentTemplateMaxBytes-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "documentTemplateMaxBytes": 200 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_result.snap new file mode 100644 index 000000000..117268660 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-headers-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `headers` unavailable for source `openAi`.\n - note: `headers` is available for sources: `rest`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_result.snap new file mode 100644 index 000000000..dcf8000eb --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-request-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- 
+{ + "message": "`.embedders.test`: Field `request` unavailable for source `openAi`.\n - note: `request` is available for sources: `rest`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_result.snap new file mode 100644 index 000000000..d834bc900 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-response-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `response` unavailable for source `openAi`.\n - note: `response` is available for sources: `rest`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-task_result.snap new file mode 100644 index 000000000..78d2b853e --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/openAi-url-task_result.snap @@ -0,0 +1,24 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + 
"status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "openAi", + "url": "http://rest.example/" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-task_result.snap new file mode 100644 index 000000000..96841efcc --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-distribution-task_result.snap @@ -0,0 +1,35 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + }, + "distribution": { + "mean": 0.4, + "sigma": 0.1 + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", 
+ "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-task_result.snap new file mode 100644 index 000000000..f9bb045ad --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplate-task_result.snap @@ -0,0 +1,32 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "documentTemplate": "toto", + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-task_result.snap new file mode 100644 index 000000000..5085ab19e --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-documentTemplateMaxBytes-task_result.snap @@ -0,0 +1,32 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "documentTemplateMaxBytes": 200, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_code.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-task_result.snap new file mode 100644 index 000000000..db6434f0e --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-headers-task_result.snap @@ -0,0 +1,34 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + }, + "headers": { + "custom": "value" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-task_result.snap new file mode 100644 index 000000000..4f1bbf136 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-request-task_result.snap @@ -0,0 +1,31 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-task_result.snap new file mode 100644 index 000000000..4f1bbf136 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-response-task_result.snap @@ -0,0 +1,31 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: 
crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-task_result.snap new file mode 100644 index 000000000..4f1bbf136 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/rest-url-task_result.snap @@ -0,0 +1,31 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "rest", + "dimensions": 768, + "url": "http://rest.example/", + "request": { + "text": "{{text}}" + }, + "response": { + "embedding": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_code.snap new file mode 100644 index 000000000..ef52a4a70 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +202 Accepted diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_result.snap new file mode 100644 index 000000000..d868ef060 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-sending_result.snap @@ -0,0 +1,10 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "taskUid": "[taskUid]", + "indexUid": "test", + "status": "enqueued", + "type": "settingsUpdate", + "enqueuedAt": "[enqueuedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-task_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-task_result.snap new file mode 100644 index 000000000..be731d19f --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-distribution-task_result.snap @@ -0,0 +1,28 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "uid": "[uid]", + "batchUid": "[batchUid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "userProvided", + "dimensions": 768, + "distribution": { + "mean": 0.4, + "sigma": 0.1 + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[enqueuedAt]", + "startedAt": "[startedAt]", + "finishedAt": "[finishedAt]" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_code.snap new file 
mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_result.snap new file mode 100644 index 000000000..4922d21cc --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplate-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `documentTemplate` unavailable for source `userProvided`.\n - note: `documentTemplate` is available for sources: `openAi`, `huggingFace`, `ollama`, `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_result.snap new file mode 100644 index 000000000..1899cc0a8 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-documentTemplateMaxBytes-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `documentTemplateMaxBytes` unavailable for source `userProvided`.\n - note: `documentTemplateMaxBytes` is available for sources: `openAi`, `huggingFace`, `ollama`, `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_result.snap 
b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_result.snap new file mode 100644 index 000000000..1cd308942 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-headers-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `headers` unavailable for source `userProvided`.\n - note: `headers` is available for sources: `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_result.snap new file mode 100644 index 000000000..48f8ca1eb --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-request-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `request` unavailable for source `userProvided`.\n - note: `request` is available for sources: `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_result.snap new file mode 100644 index 000000000..76c1c8f68 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-response-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `response` unavailable for source `userProvided`.\n - note: `response` is available for sources: `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + 
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_code.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_code.snap new file mode 100644 index 000000000..ef5454296 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_code.snap @@ -0,0 +1,4 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +400 Bad Request diff --git a/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_result.snap b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_result.snap new file mode 100644 index 000000000..7469b3943 --- /dev/null +++ b/crates/meilisearch/tests/settings/snapshots/vectors.rs/bad_parameters_2/userProvided-url-sending_result.snap @@ -0,0 +1,9 @@ +--- +source: crates/meilisearch/tests/settings/vectors.rs +--- +{ + "message": "`.embedders.test`: Field `url` unavailable for source `userProvided`.\n - note: `url` is available for sources: `openAi`, `ollama`, `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" +} diff --git a/crates/meilisearch/tests/settings/tokenizer_customization.rs b/crates/meilisearch/tests/settings/tokenizer_customization.rs index 4602e31f7..190918b34 100644 --- a/crates/meilisearch/tests/settings/tokenizer_customization.rs +++ b/crates/meilisearch/tests/settings/tokenizer_customization.rs @@ -8,14 +8,14 @@ async fn set_and_reset() { let server = Server::new().await; let index = server.index("test"); - let (_response, _code) = index + let (task, _code) = index .update_settings(json!({ "nonSeparatorTokens": ["#", "&"], "separatorTokens": ["&sep", "
"], "dictionary": ["J.R.R.", "J. R. R."], })) .await; - index.wait_task(0).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index.settings().await; snapshot!(json_string!(response["nonSeparatorTokens"]), @r###" @@ -37,7 +37,7 @@ async fn set_and_reset() { ] "###); - index + let (task, _status_code) = index .update_settings(json!({ "nonSeparatorTokens": null, "separatorTokens": null, @@ -45,7 +45,7 @@ async fn set_and_reset() { })) .await; - index.wait_task(1).await; + index.wait_task(task.uid()).await.succeeded(); let (response, _) = index.settings().await; snapshot!(json_string!(response["nonSeparatorTokens"]), @"[]"); @@ -73,17 +73,17 @@ async fn set_and_search() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (add_task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(add_task.uid()).await.succeeded(); - let (_response, _code) = index + let (update_task, _code) = index .update_settings(json!({ "nonSeparatorTokens": ["#", "&"], "separatorTokens": ["
", "&sep"], "dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"], })) .await; - index.wait_task(1).await; + index.wait_task(update_task.uid()).await.succeeded(); index .search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| { @@ -227,10 +227,10 @@ async fn advanced_synergies() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(documents, None).await; - index.wait_task(0).await; + let (add_task, _status_code) = index.add_documents(documents, None).await; + index.wait_task(add_task.uid()).await.succeeded(); - let (_response, _code) = index + let (update_task, _code) = index .update_settings(json!({ "dictionary": ["J.R.R.", "J. R. R."], "synonyms": { @@ -243,7 +243,7 @@ async fn advanced_synergies() { } })) .await; - index.wait_task(1).await; + index.wait_task(update_task.uid()).await.succeeded(); index .search(json!({"q": "J.R.R.", "attributesToHighlight": ["content"]}), |response, code| { @@ -353,7 +353,7 @@ async fn advanced_synergies() { "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."], })) .await; - index.wait_task(2).await; + index.wait_task(_response.uid()).await.succeeded(); index .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| { diff --git a/crates/meilisearch/tests/settings/vectors.rs b/crates/meilisearch/tests/settings/vectors.rs new file mode 100644 index 000000000..eb13af772 --- /dev/null +++ b/crates/meilisearch/tests/settings/vectors.rs @@ -0,0 +1,323 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{Server, Value}; + +macro_rules! parameter_test { + ($server:ident, $source:tt, $param:tt) => { + let source = stringify!($source); + let param = stringify!($param); + let index = $server.index("test"); + + let (response, _code) = index + .update_settings(crate::json!({ + "embedders": { + "test": null, + } + })) + .await; + $server.wait_task(response.uid()).await.succeeded(); + + // Add a small delay between API calls + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + let mut value = base_for_source(source); + value[param] = valid_parameter(source, param).0; + let (response, code) = index + .update_settings(crate::json!({ + "embedders": { + "test": value + } + })) + .await; + snapshot!(code, name: concat!(stringify!($source), "-", stringify!($param), "-sending_code")); + snapshot!(json_string!(response, {".enqueuedAt" => "[enqueuedAt]", ".taskUid" => "[taskUid]"}), name: concat!(stringify!($source), "-", stringify!($param), "-sending_result")); + + if response.has_uid() { + let response = $server.wait_task(response.uid()).await; + snapshot!(json_string!(response, {".enqueuedAt" => "[enqueuedAt]", + ".uid" => "[uid]", ".batchUid" => "[batchUid]", + ".duration" => "[duration]", + ".startedAt" => "[startedAt]", + ".finishedAt" => "[finishedAt]"}), name: concat!(stringify!($source), "-", stringify!($param), "-task_result")); + } + }; +} + +#[actix_rt::test] +#[ignore = "Test is failing with timeout issues"] +async fn bad_parameters() { + let server = Server::new().await; + + // for each source, check which parameters are allowed/disallowed + // model + // - openai + parameter_test!(server, openAi, model); + // - huggingFace + parameter_test!(server, huggingFace, model); + // - userProvided + parameter_test!(server, userProvided, model); + // - ollama + parameter_test!(server, ollama, model); + // - rest + parameter_test!(server, rest, model); + // == + + // revision + // - openai + parameter_test!(server, openAi, revision); + // - 
huggingFace + parameter_test!(server, huggingFace, revision); + // - userProvided + parameter_test!(server, userProvided, revision); + // - ollama + parameter_test!(server, ollama, revision); + // - rest + parameter_test!(server, rest, revision); + // == + + // pooling + // - openai + parameter_test!(server, openAi, pooling); + // - huggingFace + parameter_test!(server, huggingFace, pooling); + // - userProvided + parameter_test!(server, userProvided, pooling); + // - ollama + parameter_test!(server, ollama, pooling); + // - rest + parameter_test!(server, rest, pooling); + // == + + // apiKey + // - openai + parameter_test!(server, openAi, apiKey); + // - huggingFace + parameter_test!(server, huggingFace, apiKey); + // - userProvided + parameter_test!(server, userProvided, apiKey); + // - ollama + parameter_test!(server, ollama, apiKey); + // - rest + parameter_test!(server, rest, apiKey); + // == + + // dimensions + // - openai + parameter_test!(server, openAi, dimensions); + // - huggingFace + parameter_test!(server, huggingFace, dimensions); + // - userProvided + parameter_test!(server, userProvided, dimensions); + // - ollama + parameter_test!(server, ollama, dimensions); + // - rest + parameter_test!(server, rest, dimensions); + // == + + // binaryQuantized + // - openai + parameter_test!(server, openAi, binaryQuantized); + // - huggingFace + parameter_test!(server, huggingFace, binaryQuantized); + // - userProvided + parameter_test!(server, userProvided, binaryQuantized); + // - ollama + parameter_test!(server, ollama, binaryQuantized); + // - rest + parameter_test!(server, rest, binaryQuantized); + // == + + // for each source, check that removing mandatory parameters is a failure +} + +#[actix_rt::test] +#[ignore = "Test is failing with timeout issues"] +async fn bad_parameters_2() { + let server = Server::new().await; + + // documentTemplate + // - openai + parameter_test!(server, openAi, documentTemplate); + // - huggingFace + parameter_test!(server, huggingFace, documentTemplate); + // - userProvided + parameter_test!(server, userProvided, documentTemplate); + // - ollama + parameter_test!(server, ollama, documentTemplate); + // - rest + parameter_test!(server, rest, documentTemplate); + // == + + // documentTemplateMaxBytes + // - openai + parameter_test!(server, openAi, documentTemplateMaxBytes); + // - huggingFace + parameter_test!(server, huggingFace, documentTemplateMaxBytes); + // - userProvided + parameter_test!(server, userProvided, documentTemplateMaxBytes); + // - ollama + parameter_test!(server, ollama, documentTemplateMaxBytes); + // - rest + parameter_test!(server, rest, documentTemplateMaxBytes); + // == + + // url + // - openai + parameter_test!(server, openAi, url); + // - huggingFace + parameter_test!(server, huggingFace, url); + // - userProvided + parameter_test!(server, userProvided, url); + // - ollama + parameter_test!(server, ollama, url); + // - rest + parameter_test!(server, rest, url); + // == + + // request + // - openai + parameter_test!(server, openAi, request); + // - huggingFace + parameter_test!(server, huggingFace, request); + // - userProvided + parameter_test!(server, userProvided, request); + // - ollama + parameter_test!(server, ollama, request); + // - rest + parameter_test!(server, rest, request); + // == + + // response + // - openai + parameter_test!(server, openAi, response); + // - huggingFace + parameter_test!(server, huggingFace, response); + // - userProvided + parameter_test!(server, userProvided, response); + // - ollama + 
parameter_test!(server, ollama, response); + // - rest + parameter_test!(server, rest, response); + // == + + // headers + // - openai + parameter_test!(server, openAi, headers); + // - huggingFace + parameter_test!(server, huggingFace, headers); + // - userProvided + parameter_test!(server, userProvided, headers); + // - ollama + parameter_test!(server, ollama, headers); + // - rest + parameter_test!(server, rest, headers); + // == + + // distribution + // - openai + parameter_test!(server, openAi, distribution); + // - huggingFace + parameter_test!(server, huggingFace, distribution); + // - userProvided + parameter_test!(server, userProvided, distribution); + // - ollama + parameter_test!(server, ollama, distribution); + // - rest + parameter_test!(server, rest, distribution); + // == +} + +fn base_for_source(source: &'static str) -> Value { + let base_parameters = maplit::btreemap! { + "openAi" => vec![], + "huggingFace" => vec![], + "userProvided" => vec!["dimensions"], + "ollama" => vec!["model", + // add dimensions to avoid actually fetching the model from ollama + "dimensions"], + "rest" => vec!["url", "request", "response", + // add dimensions to avoid actually fetching the model from ollama + "dimensions"], + }; + + let mut value = crate::json!({ + "source": source + }); + + let mandatory_parameters = base_parameters.get(source).unwrap(); + for mandatory_parameter in mandatory_parameters { + value[mandatory_parameter] = valid_parameter(source, mandatory_parameter).0; + } + value +} + +fn valid_parameter(source: &'static str, parameter: &'static str) -> Value { + match (source, parameter) { + ("openAi", "model") => crate::json!("text-embedding-ada-002"), + ("openAi", "revision") => crate::json!("2023-05-15"), + ("openAi", "pooling") => crate::json!("mean"), + ("openAi", "apiKey") => crate::json!("test"), + ("openAi", "dimensions") => crate::json!(1), // Use minimal dimension to avoid model download + ("openAi", "binaryQuantized") => crate::json!(false), + ("openAi", "documentTemplate") => crate::json!("test"), + ("openAi", "documentTemplateMaxBytes") => crate::json!(100), + ("openAi", "url") => crate::json!("http://test"), + ("openAi", "request") => crate::json!({ "test": "test" }), + ("openAi", "response") => crate::json!({ "test": "test" }), + ("openAi", "headers") => crate::json!({ "test": "test" }), + ("openAi", "distribution") => crate::json!("normal"), + ("huggingFace", "model") => crate::json!("test"), + ("huggingFace", "revision") => crate::json!("test"), + ("huggingFace", "pooling") => crate::json!("mean"), + ("huggingFace", "apiKey") => crate::json!("test"), + ("huggingFace", "dimensions") => crate::json!(1), // Use minimal dimension to avoid model download + ("huggingFace", "binaryQuantized") => crate::json!(false), + ("huggingFace", "documentTemplate") => crate::json!("test"), + ("huggingFace", "documentTemplateMaxBytes") => crate::json!(100), + ("huggingFace", "url") => crate::json!("http://test"), + ("huggingFace", "request") => crate::json!({ "test": "test" }), + ("huggingFace", "response") => crate::json!({ "test": "test" }), + ("huggingFace", "headers") => crate::json!({ "test": "test" }), + ("huggingFace", "distribution") => crate::json!("normal"), + ("userProvided", "model") => crate::json!("test"), + ("userProvided", "revision") => crate::json!("test"), + ("userProvided", "pooling") => crate::json!("mean"), + ("userProvided", "apiKey") => crate::json!("test"), + ("userProvided", "dimensions") => crate::json!(1), // Use minimal dimension to avoid model download + 
("userProvided", "binaryQuantized") => crate::json!(false), + ("userProvided", "documentTemplate") => crate::json!("test"), + ("userProvided", "documentTemplateMaxBytes") => crate::json!(100), + ("userProvided", "url") => crate::json!("http://test"), + ("userProvided", "request") => crate::json!({ "test": "test" }), + ("userProvided", "response") => crate::json!({ "test": "test" }), + ("userProvided", "headers") => crate::json!({ "test": "test" }), + ("userProvided", "distribution") => crate::json!("normal"), + ("ollama", "model") => crate::json!("test"), + ("ollama", "revision") => crate::json!("test"), + ("ollama", "pooling") => crate::json!("mean"), + ("ollama", "apiKey") => crate::json!("test"), + ("ollama", "dimensions") => crate::json!(1), // Use minimal dimension to avoid model download + ("ollama", "binaryQuantized") => crate::json!(false), + ("ollama", "documentTemplate") => crate::json!("test"), + ("ollama", "documentTemplateMaxBytes") => crate::json!(100), + ("ollama", "url") => crate::json!("http://test"), + ("ollama", "request") => crate::json!({ "test": "test" }), + ("ollama", "response") => crate::json!({ "test": "test" }), + ("ollama", "headers") => crate::json!({ "test": "test" }), + ("ollama", "distribution") => crate::json!("normal"), + ("rest", "model") => crate::json!("test"), + ("rest", "revision") => crate::json!("test"), + ("rest", "pooling") => crate::json!("mean"), + ("rest", "apiKey") => crate::json!("test"), + ("rest", "dimensions") => crate::json!(1), // Use minimal dimension to avoid model download + ("rest", "binaryQuantized") => crate::json!(false), + ("rest", "documentTemplate") => crate::json!("test"), + ("rest", "documentTemplateMaxBytes") => crate::json!(100), + ("rest", "url") => crate::json!("http://test"), + ("rest", "request") => crate::json!({ "test": "test" }), + ("rest", "response") => crate::json!({ "test": "test" }), + ("rest", "headers") => crate::json!({ "test": "test" }), + ("rest", "distribution") => crate::json!("normal"), + _ => panic!("Invalid parameter {} for source {}", parameter, source), + } +} diff --git a/crates/meilisearch/tests/similar/errors.rs b/crates/meilisearch/tests/similar/errors.rs index 1e933e1c0..30ff5b145 100644 --- a/crates/meilisearch/tests/similar/errors.rs +++ b/crates/meilisearch/tests/similar/errors.rs @@ -8,7 +8,6 @@ use crate::json; async fn similar_unexisting_index() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let expected_response = json!({ "message": "Index `test` not found.", @@ -29,7 +28,6 @@ async fn similar_unexisting_index() { async fn similar_unexisting_parameter() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; index .similar(json!({"id": 287947, "marin": "hello"}), |response, code| { @@ -39,28 +37,10 @@ async fn similar_unexisting_parameter() { .await; } -#[actix_rt::test] -async fn similar_feature_not_enabled() { - let server = Server::new().await; - let index = server.index("test"); - - let (response, code) = index.similar_post(json!({"id": 287947, "embedder": "manual"})).await; - snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" - { - "message": "Using the similar API requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); -} - #[actix_rt::test] async fn similar_bad_id() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -75,11 +55,11 @@ async fn similar_bad_id() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await; - let (response, code) = index.similar_post(json!({"id": ["doggo"]})).await; + let (response, code) = index.similar_post(json!({"id": ["doggo"], "embedder": "manual"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: Document identifier `[\"doggo\"]` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" @@ -91,7 +71,6 @@ async fn similar_bad_id() { async fn similar_bad_ranking_score_threshold() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -122,7 +101,6 @@ async fn similar_bad_ranking_score_threshold() { async fn similar_invalid_ranking_score_threshold() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -153,7 +131,6 @@ async fn similar_invalid_ranking_score_threshold() { async fn similar_invalid_id() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -168,11 +145,12 @@ async fn similar_invalid_id() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await; - let (response, code) = index.similar_post(json!({"id": "http://invalid-docid/"})).await; + let (response, code) = + index.similar_post(json!({"id": "http://invalid-docid/", "embedder": "manual"})).await; snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.", + "message": "Invalid value at `.id`: Document identifier `\"http://invalid-docid/\"` is invalid. 
A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.", "code": "invalid_similar_id", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_similar_id" @@ -184,7 +162,6 @@ async fn similar_invalid_id() { async fn similar_not_found_id() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -216,7 +193,6 @@ async fn similar_not_found_id() { async fn similar_bad_offset() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -259,7 +235,6 @@ async fn similar_bad_offset() { async fn similar_bad_limit() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -304,7 +279,6 @@ async fn similar_bad_filter() { // Thus the error message is not generated by deserr but written by us. let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -324,7 +298,7 @@ async fn similar_bad_filter() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (response, code) = index.similar_post(json!({ "id": 287947, "filter": true, "embedder": "manual" })).await; @@ -344,7 +318,6 @@ async fn similar_bad_filter() { async fn filter_invalid_syntax_object() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -362,7 +335,7 @@ async fn filter_invalid_syntax_object() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar(json!({"id": 287947, "filter": "title & Glass", "embedder": "manual"}), |response, code| { @@ -383,7 +356,6 @@ async fn filter_invalid_syntax_object() { async fn filter_invalid_syntax_array() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -401,7 +373,7 @@ async fn filter_invalid_syntax_array() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar(json!({"id": 287947, "filter": ["title & Glass"], "embedder": "manual"}), |response, code| { @@ -422,7 +394,6 @@ async fn filter_invalid_syntax_array() { async fn filter_invalid_syntax_string() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -440,7 +411,7 @@ async fn filter_invalid_syntax_string() { let documents = DOCUMENTS.clone(); let (value, code) = 
index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "Found unexpected characters at the end of the filter: `XOR title = Glass`. You probably forgot an `OR` or an `AND` rule.\n15:32 title = Glass XOR title = Glass", @@ -463,7 +434,6 @@ async fn filter_invalid_syntax_string() { async fn filter_invalid_attribute_array() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -481,20 +451,21 @@ async fn filter_invalid_attribute_array() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_similar_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" - }); index .similar( json!({"id": 287947, "filter": ["many = Glass"], "embedder": "manual"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `many` is not filterable. Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_similar_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" + } + "###); }, ) .await; @@ -504,7 +475,6 @@ async fn filter_invalid_attribute_array() { async fn filter_invalid_attribute_string() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -522,20 +492,21 @@ async fn filter_invalid_attribute_string() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); - let expected_response = json!({ - "message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", - "code": "invalid_similar_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" - }); index .similar( json!({"id": 287947, "filter": "many = Glass", "embedder": "manual"}), |response, code| { - assert_eq!(response, expected_response); - assert_eq!(code, 400); + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "Attribute `many` is not filterable. 
Available filterable attribute patterns are: `title`.\n1:5 many = Glass", + "code": "invalid_similar_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_filter" + } + "###); }, ) .await; @@ -545,7 +516,6 @@ async fn filter_invalid_attribute_string() { async fn filter_reserved_geo_attribute_array() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -563,7 +533,7 @@ async fn filter_reserved_geo_attribute_array() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", @@ -586,7 +556,6 @@ async fn filter_reserved_geo_attribute_array() { async fn filter_reserved_geo_attribute_string() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -604,7 +573,7 @@ async fn filter_reserved_geo_attribute_string() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:13 _geo = Glass", @@ -627,7 +596,6 @@ async fn filter_reserved_geo_attribute_string() { async fn filter_reserved_attribute_array() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -645,7 +613,7 @@ async fn filter_reserved_attribute_array() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", @@ -668,7 +636,6 @@ async fn filter_reserved_attribute_array() { async fn filter_reserved_attribute_string() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -686,7 +653,7 @@ async fn filter_reserved_attribute_string() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoDistance` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:21 _geoDistance = Glass", @@ -709,7 +676,6 @@ async fn filter_reserved_attribute_string() { async fn filter_reserved_geo_point_array() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -727,7 +693,7 @@ async fn filter_reserved_geo_point_array() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", @@ -750,7 +716,6 @@ async fn filter_reserved_geo_point_array() { async fn filter_reserved_geo_point_string() { let server = Server::new().await; let index = server.index("test"); - server.set_features(json!({"vectorStore": true})).await; let (response, code) = index .update_settings(json!({ @@ -768,7 +733,7 @@ async fn filter_reserved_geo_point_string() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let expected_response = json!({ "message": "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` coordinates.\n1:18 _geoPoint = Glass", @@ -790,7 +755,6 @@ async fn filter_reserved_geo_point_string() { #[actix_rt::test] async fn similar_bad_retrieve_vectors() { let server = Server::new().await; - server.set_features(json!({"vectorStore": true})).await; let index = server.index("test"); let (response, code) = @@ -839,3 +803,86 @@ async fn similar_bad_retrieve_vectors() { } "###); } + +#[actix_rt::test] +async fn similar_bad_embedder() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + "filterableAttributes": ["title"]})) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = DOCUMENTS.clone(); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let expected_response = json!({ + "message": "Cannot find embedder with name `auto`.", + "code": "invalid_similar_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_embedder" + }); + + index + .similar(json!({"id": 287947, "embedder": "auto"}), |response, code| { + assert_eq!(response, expected_response); + assert_eq!(code, 400); + }) + .await; + + let expected_response = json!({ + "message": "Invalid value type at `.embedder`: expected a string, but found a positive integer: `42`", + "code": "invalid_similar_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_embedder" + }); + + let (response, code) = index.similar_post(json!({"id": 287947, "embedder": 42})).await; + + assert_eq!(response, expected_response); + assert_eq!(code, 400); + + let expected_response = json!({ + "message": "Invalid value type at `.embedder`: expected a string, but found null", + "code": "invalid_similar_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_embedder" + }); + + let (response, code) = index.similar_post(json!({"id": 287947, "embedder": null})).await; + + assert_eq!(response, expected_response); + assert_eq!(code, 400); + + let expected_response = json!({ + "message": "Missing field `embedder`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + }); + + let (response, code) = index.similar_post(json!({"id": 287947})).await; + assert_eq!(response, expected_response); + assert_eq!(code, 400); + + let expected_response = json!({ + "message": "Missing parameter `embedder`", + "code": "bad_request", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#bad_request" + }); + let (response, code) = index.similar_get("?id=287947").await; + assert_eq!(response, expected_response); + assert_eq!(code, 400); +} diff --git a/crates/meilisearch/tests/similar/mod.rs b/crates/meilisearch/tests/similar/mod.rs index fa0797a41..defb777e0 100644 --- a/crates/meilisearch/tests/similar/mod.rs +++ b/crates/meilisearch/tests/similar/mod.rs @@ -49,17 +49,6 @@ static DOCUMENTS: Lazy = Lazy::new(|| { async fn basic() { let server = Server::new().await; let index = server.index("test"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 
OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -77,7 +66,7 @@ async fn basic() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar( @@ -246,17 +235,6 @@ async fn basic() { async fn ranking_score_threshold() { let server = Server::new().await; let index = server.index("test"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -274,7 +252,7 @@ async fn ranking_score_threshold() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar( @@ -527,17 +505,6 @@ async fn ranking_score_threshold() { async fn filter() { let server = Server::new().await; let index = server.index("test"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -555,7 +522,7 @@ async fn filter() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar( @@ -656,17 +623,6 @@ async fn filter() { async fn limit_and_offset() { let server = Server::new().await; let index = server.index("test"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -684,7 +640,7 @@ async fn limit_and_offset() { let documents = DOCUMENTS.clone(); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index .similar( diff --git a/crates/meilisearch/tests/snapshot/mod.rs b/crates/meilisearch/tests/snapshot/mod.rs index 976551190..a8f93f467 100644 --- a/crates/meilisearch/tests/snapshot/mod.rs +++ b/crates/meilisearch/tests/snapshot/mod.rs @@ -56,7 +56,7 @@ async fn perform_snapshot() { let (task, code) = server.index("test1").create(Some("prim")).await; meili_snap::snapshot!(code, @"202 Accepted"); - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); // wait for the _next task_ to process, aka the snapshot that should be enqueued at some point @@ -111,6 +111,7 @@ async fn perform_snapshot() { } #[actix_rt::test] +#[cfg_attr(target_os = "windows", ignore)] async fn perform_on_demand_snapshot() { let temp = tempfile::tempdir().unwrap(); 
let snapshot_dir = tempfile::tempdir().unwrap(); @@ -129,11 +130,11 @@ async fn perform_on_demand_snapshot() { index.load_test_set().await; - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _status_code) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.succeeded(); - server.index("doggo").create(Some("bone")).await; - index.wait_task(2).await; + let (task, _status_code) = server.index("doggo").create(Some("bone")).await; + index.wait_task(task.uid()).await.failed(); let (task, code) = server.create_snapshot().await; snapshot!(code, @"202 Accepted"); diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index a02a48a87..20a8eaef6 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -1,3 +1,4 @@ +use meili_snap::{json_string, snapshot}; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; @@ -28,10 +29,10 @@ async fn test_healthyness() { async fn stats() { let server = Server::new().await; let index = server.index("test"); - let (_, code) = index.create(Some("id")).await; + let (task, code) = index.create(Some("id")).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = server.stats().await; @@ -57,7 +58,7 @@ async fn stats() { assert_eq!(code, 202, "{}", response); assert_eq!(response["taskUid"], 1); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); let timestamp = OffsetDateTime::now_utc(); let (response, code) = server.stats().await; @@ -74,3 +75,269 @@ async fn stats() { assert_eq!(response["indexes"]["test"]["fieldDistribution"]["name"], 1); assert_eq!(response["indexes"]["test"]["fieldDistribution"]["age"], 1); } + +#[actix_rt::test] +async fn add_remove_embeddings() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 5 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 5, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 3 embeddings in total + let documents = json!([ + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + 
"avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": null, "handcrafted": [0, 0, 0] }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 1 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null, "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} + +#[actix_rt::test] +async fn add_remove_embedded_documents() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 5 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 27, + "avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 5, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // delete one embedded document, remaining 1 embedded documents for 3 embeddings in total + let (response, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 1, + "rawDocumentDbSize": 13, + "avgDocumentSize": 13, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 1, + "name": 1 + } + } + "###); +} + +#[actix_rt::test] +async fn update_embedder_settings() { + let server = Server::new().await; + let index = server.index("doggo"); + + // 2 embedded documents for 3 embeddings in total + // but no 
embedders are added in the settings yet so we expect 0 embedded documents for 0 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // add embedders to the settings + // 2 embedded documents for 3 embeddings in total + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "rawDocumentDbSize": 108, + "avgDocumentSize": 54, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} diff --git a/crates/meilisearch/tests/swap_indexes/mod.rs b/crates/meilisearch/tests/swap_indexes/mod.rs index 646f773d8..bf84d5823 100644 --- a/crates/meilisearch/tests/swap_indexes/mod.rs +++ b/crates/meilisearch/tests/swap_indexes/mod.rs @@ -15,7 +15,7 @@ async fn swap_indexes() { let (res, code) = b.add_documents(json!({ "id": 1, "index": "b"}), None).await; snapshot!(code, @"202 Accepted"); snapshot!(res["taskUid"], @"1"); - server.wait_task(1).await; + server.wait_task(res.uid()).await; let (tasks, code) = server.tasks().await; snapshot!(code, @"200 OK"); @@ -67,7 +67,7 @@ async fn swap_indexes() { let (res, code) = server.index_swap(json!([{ "indexes": ["a", "b"] }])).await; snapshot!(code, @"202 Accepted"); snapshot!(res["taskUid"], @"2"); - server.wait_task(2).await; + server.wait_task(res.uid()).await; let (tasks, code) = server.tasks().await; snapshot!(code, @"200 OK"); @@ -159,7 +159,7 @@ async fn swap_indexes() { let (res, code) = d.add_documents(json!({ "id": 1, "index": "d"}), None).await; snapshot!(code, @"202 Accepted"); snapshot!(res["taskUid"], @"4"); - server.wait_task(4).await; + server.wait_task(res.uid()).await; // ensure the index creation worked properly let (tasks, code) = server.tasks_filter("limit=2").await; @@ -215,7 +215,7 @@ async fn swap_indexes() { server.index_swap(json!([{ "indexes": ["a", "b"] }, { "indexes": ["c", "d"] } ])).await; snapshot!(res["taskUid"], @"5"); snapshot!(code, @"202 Accepted"); - server.wait_task(5).await; + server.wait_task(res.uid()).await; // ensure the index creation worked properly let (tasks, code) = server.tasks().await; diff --git a/crates/meilisearch/tests/tasks/errors.rs b/crates/meilisearch/tests/tasks/errors.rs index 932dd19d4..759531d42 100644 --- a/crates/meilisearch/tests/tasks/errors.rs +++ b/crates/meilisearch/tests/tasks/errors.rs @@ -95,36 +95,36 @@ async fn task_bad_types() { let (response, code) = server.tasks_filter("types=doggo").await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" 
+ snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" } - "###); + "#); let (response, code) = server.cancel_tasks("types=doggo").await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" } - "###); + "#); let (response, code) = server.delete_tasks("types=doggo").await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" } - "###); + "#); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/tasks/mod.rs b/crates/meilisearch/tests/tasks/mod.rs index fc05ee4ca..f432ef7db 100644 --- a/crates/meilisearch/tests/tasks/mod.rs +++ b/crates/meilisearch/tests/tasks/mod.rs @@ -13,8 +13,8 @@ use crate::json; async fn error_get_unexisting_task_status() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); let (response, code) = index.get_task(1).await; let expected_response = json!({ @@ -32,8 +32,8 @@ async fn error_get_unexisting_task_status() { async fn get_task_status() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index + let (create_task, _status_code) = index.create(None).await; + let (add_task, _status_code) = index .add_documents( json!([{ "id": 1, @@ -42,8 +42,8 @@ async fn get_task_status() { None, ) .await; - index.wait_task(0).await; - let (_response, code) = index.get_task(1).await; + index.wait_task(create_task.uid()).await.succeeded(); + let (_response, code) = index.get_task(add_task.uid()).await; assert_eq!(code, 200); // TODO check response format, as per #48 } @@ -52,8 +52,8 @@ async fn get_task_status() { async fn list_tasks() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); index .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) .await; @@ -105,7 +105,7 @@ async fn list_tasks_with_star_filters() { let server = Server::new().await; let index = server.index("test"); let (task, _code) = index.create(None).await; - index.wait_task(task.uid()).await; + index.wait_task(task.uid()).await.succeeded(); index .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) .await; @@ -149,25 +149,21 @@ async fn list_tasks_with_star_filters() { async fn list_tasks_status_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - index - .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) - .await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.failed(); let (response, code) = index.filtered_tasks(&[], &["succeeded"], &[]).await; assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 1); - // We can't be sure that the update isn't already processed so we can't test this - // let (response, code) = index.filtered_tasks(&[], &["processing"]).await; - // assert_eq!(code, 200, "{}", response); - // assert_eq!(response["results"].as_array().unwrap().len(), 1); - - index.wait_task(1).await; - let (response, code) = index.filtered_tasks(&[], &["succeeded"], 
&[]).await; assert_eq!(code, 200, "{}", response); + assert_eq!(response["results"].as_array().unwrap().len(), 1); + + let (response, code) = index.filtered_tasks(&[], &["succeeded", "failed"], &[]).await; + assert_eq!(code, 200, "{}", response); assert_eq!(response["results"].as_array().unwrap().len(), 2); } @@ -175,8 +171,8 @@ async fn list_tasks_status_filtered() { async fn list_tasks_type_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); index .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) .await; @@ -195,8 +191,8 @@ async fn list_tasks_type_filtered() { async fn list_tasks_invalid_canceled_by_filter() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); index .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) .await; @@ -210,8 +206,8 @@ async fn list_tasks_invalid_canceled_by_filter() { async fn list_tasks_status_and_type_filtered() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); index .add_documents(serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(), None) .await; @@ -278,8 +274,9 @@ async fn test_summarized_task_view() { async fn test_summarized_document_addition_or_update() { let server = Server::new().await; let index = server.index("test"); - index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), None).await; - index.wait_task(0).await; + let (task, _status_code) = + index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), None).await; + index.wait_task(task.uid()).await.succeeded(); let (task, _) = index.get_task(0).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @@ -303,8 +300,9 @@ async fn test_summarized_document_addition_or_update() { } "###); - index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), Some("id")).await; - index.wait_task(1).await; + let (task, _status_code) = + index.add_documents(json!({ "id": 42, "content": "doggos & fluff" }), Some("id")).await; + index.wait_task(task.uid()).await.succeeded(); let (task, _) = index.get_task(1).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @@ -333,8 +331,8 @@ async fn test_summarized_document_addition_or_update() { async fn test_summarized_delete_documents_by_batch() { let server = Server::new().await; let index = server.index("test"); - index.delete_batch(vec![1, 2, 3]).await; - index.wait_task(0).await; + let (task, _status_code) = index.delete_batch(vec![1, 2, 3]).await; + index.wait_task(task.uid()).await.failed(); let (task, _) = index.get_task(0).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @@ -365,9 +363,9 @@ async fn test_summarized_delete_documents_by_batch() { "###); index.create(None).await; - 
index.delete_batch(vec![42]).await; - index.wait_task(2).await; - let (task, _) = index.get_task(2).await; + let (del_task, _status_code) = index.delete_batch(vec![42]).await; + index.wait_task(del_task.uid()).await.succeeded(); + let (task, _) = index.get_task(del_task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -397,9 +395,10 @@ async fn test_summarized_delete_documents_by_filter() { let server = Server::new().await; let index = server.index("test"); - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(0).await; - let (task, _) = index.get_task(0).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -429,9 +428,10 @@ async fn test_summarized_delete_documents_by_filter() { "###); index.create(None).await; - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(2).await; - let (task, _) = index.get_task(2).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -448,7 +448,7 @@ async fn test_summarized_delete_documents_by_filter() { "originalFilter": "\"doggo = bernese\"" }, "error": { - "message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese", + "message": "Index `test`: Attribute `doggo` is not filterable. 
This index does not have configured filterable attributes.\n1:6 doggo = bernese", "code": "invalid_document_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_document_filter" @@ -461,9 +461,10 @@ async fn test_summarized_delete_documents_by_filter() { "###); index.update_settings(json!({ "filterableAttributes": ["doggo"] })).await; - index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; - index.wait_task(4).await; - let (task, _) = index.get_task(4).await; + let (task, _status_code) = + index.delete_document_by_filter(json!({ "filter": "doggo = bernese" })).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -492,9 +493,9 @@ async fn test_summarized_delete_documents_by_filter() { async fn test_summarized_delete_document_by_id() { let server = Server::new().await; let index = server.index("test"); - index.delete_document(1).await; - index.wait_task(0).await; - let (task, _) = index.get_task(0).await; + let (task, _status_code) = index.delete_document(1).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -524,9 +525,9 @@ async fn test_summarized_delete_document_by_id() { "###); index.create(None).await; - index.delete_document(42).await; - index.wait_task(2).await; - let (task, _) = index.get_task(2).await; + let (task, _status_code) = index.delete_document(42).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -567,9 +568,9 @@ async fn test_summarized_settings_update() { } "###); - index.update_settings(json!({ "displayedAttributes": ["doggos", "name"], "filterableAttributes": ["age", "nb_paw_pads"], "sortableAttributes": ["iq"] })).await; - index.wait_task(0).await; - let (task, _) = index.get_task(0).await; + let (task,_status_code) = index.update_settings(json!({ "displayedAttributes": ["doggos", "name"], "filterableAttributes": ["age", "nb_paw_pads"], "sortableAttributes": ["iq"] })).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -606,9 +607,9 @@ async fn test_summarized_settings_update() { async fn test_summarized_index_creation() { let server = Server::new().await; let index = server.index("test"); - index.create(None).await; - index.wait_task(0).await; - let (task, _) = index.get_task(0).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -630,9 +631,9 @@ async fn test_summarized_index_creation() { } "###); - index.create(Some("doggos")).await; - index.wait_task(1).await; - let (task, _) = index.get_task(1).await; + let (task, _status_code) = 
index.create(Some("doggos")).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -774,9 +775,9 @@ async fn test_summarized_index_update() { let server = Server::new().await; let index = server.index("test"); // If the index doesn't exist yet, we should get errors with or without the primary key. - index.update(None).await; - index.wait_task(0).await; - let (task, _) = index.get_task(0).await; + let (task, _status_code) = index.update(None).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -803,9 +804,9 @@ async fn test_summarized_index_update() { } "###); - index.update(Some("bones")).await; - index.wait_task(1).await; - let (task, _) = index.get_task(1).await; + let (task, _status_code) = index.update(Some("bones")).await; + index.wait_task(task.uid()).await.failed(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -835,9 +836,9 @@ async fn test_summarized_index_update() { // And run the same two tests once the index do exists. index.create(None).await; - index.update(None).await; - index.wait_task(3).await; - let (task, _) = index.get_task(3).await; + let (task, _status_code) = index.update(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -859,9 +860,9 @@ async fn test_summarized_index_update() { } "###); - index.update(Some("bones")).await; - index.wait_task(4).await; - let (task, _) = index.get_task(4).await; + let (task, _status_code) = index.update(Some("bones")).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -887,13 +888,13 @@ async fn test_summarized_index_update() { #[actix_web::test] async fn test_summarized_index_swap() { let server = Server::new().await; - server + let (task, _status_code) = server .index_swap(json!([ { "indexes": ["doggos", "cattos"] } ])) .await; - server.wait_task(0).await; - let (task, _) = server.get_task(0).await; + server.wait_task(task.uid()).await.failed(); + let (task, _) = server.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -927,15 +928,17 @@ async fn test_summarized_index_swap() { } "###); - server.index("doggos").create(None).await; - server.index("cattos").create(None).await; - server + let (task, _code) = server.index("doggos").create(None).await; + server.wait_task(task.uid()).await.succeeded(); + let (task, _code) = server.index("cattos").create(None).await; + server.wait_task(task.uid()).await.succeeded(); + let (task, _code) = server .index_swap(json!([ { "indexes": ["doggos", "cattos"] } ])) .await; - server.wait_task(3).await; - let 
(task, _) = server.get_task(3).await; + server.wait_task(task.uid()).await.succeeded(); + let (task, _) = server.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -970,11 +973,11 @@ async fn test_summarized_task_cancelation() { let server = Server::new().await; let index = server.index("doggos"); // to avoid being flaky we're only going to cancel an already finished task :( - index.create(None).await; - index.wait_task(0).await; - server.cancel_tasks("uids=0").await; - index.wait_task(1).await; - let (task, _) = index.get_task(1).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = server.cancel_tasks("uids=0").await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -1004,11 +1007,11 @@ async fn test_summarized_task_deletion() { let server = Server::new().await; let index = server.index("doggos"); // to avoid being flaky we're only going to delete an already finished task :( - index.create(None).await; - index.wait_task(0).await; - server.delete_tasks("uids=0").await; - index.wait_task(1).await; - let (task, _) = index.get_task(1).await; + let (task, _status_code) = index.create(None).await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _status_code) = server.delete_tasks("uids=0").await; + index.wait_task(task.uid()).await.succeeded(); + let (task, _) = index.get_task(task.uid()).await; assert_json_snapshot!(task, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" @@ -1036,9 +1039,9 @@ #[actix_web::test] async fn test_summarized_dump_creation() { let server = Server::new().await; - server.create_dump().await; - server.wait_task(0).await; - let (task, _) = server.get_task(0).await; + let (task, _status_code) = server.create_dump().await; + server.wait_task(task.uid()).await; + let (task, _) = server.get_task(task.uid()).await; assert_json_snapshot!(task, { ".details.dumpUid" => "[dumpUid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }, @r###" diff --git a/crates/meilisearch/tests/upgrade/mod.rs b/crates/meilisearch/tests/upgrade/mod.rs new file mode 100644 index 000000000..ae6bcff40 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/mod.rs @@ -0,0 +1,84 @@ +mod v1_12; + +use std::path::Path; +use std::{fs, io}; + +use meili_snap::snapshot; +use meilisearch::Opt; + +use crate::common::{default_settings, Server}; + +fn copy_dir_all(src: impl AsRef<Path>, dst: impl AsRef<Path>) -> io::Result<()> { + fs::create_dir_all(&dst)?; + for entry in fs::read_dir(src)?
{ + let entry = entry?; + let ty = entry.file_type()?; + if ty.is_dir() { + copy_dir_all(entry.path(), dst.as_ref().join(entry.file_name()))?; + } else { + fs::copy(entry.path(), dst.as_ref().join(entry.file_name()))?; + } + } + Ok(()) +} + +#[actix_rt::test] +async fn malformed_version_file() { + let temp = tempfile::tempdir().unwrap(); + let default_settings = default_settings(temp.path()); + let db_path = default_settings.db_path.clone(); + std::fs::create_dir_all(&db_path).unwrap(); + std::fs::write(db_path.join("VERSION"), "kefir").unwrap(); + let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; + let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); + snapshot!(err, @"Version file is corrupted and thus Meilisearch is unable to determine the version of the database. The version contains 1 parts instead of 3 (major, minor and patch)"); +} + +#[actix_rt::test] +async fn version_too_old() { + let temp = tempfile::tempdir().unwrap(); + let default_settings = default_settings(temp.path()); + let db_path = default_settings.db_path.clone(); + std::fs::create_dir_all(&db_path).unwrap(); + std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap(); + let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; + let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); + snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.14.0"); +} + +#[actix_rt::test] +async fn version_requires_downgrade() { + let temp = tempfile::tempdir().unwrap(); + let default_settings = default_settings(temp.path()); + let db_path = default_settings.db_path.clone(); + std::fs::create_dir_all(&db_path).unwrap(); + let major = meilisearch_types::versioning::VERSION_MAJOR; + let minor = meilisearch_types::versioning::VERSION_MINOR; + let patch = meilisearch_types::versioning::VERSION_PATCH.parse::<u32>().unwrap() + 1; + std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap(); + let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; + let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); + snapshot!(err, @"Database version 1.14.1 is higher than the Meilisearch version 1.14.0.
Downgrade is not supported"); +} + +#[actix_rt::test] +async fn upgrade_to_the_current_version() { + let temp = tempfile::tempdir().unwrap(); + let server = Server::new_with_options(Opt { + experimental_dumpless_upgrade: true, + ..default_settings(temp.path()) + }) + .await + .unwrap(); + // The upgrade tasks should NOT be spawned => task queue is empty + let (tasks, _status) = server.tasks().await; + snapshot!(tasks, @r#" + { + "results": [], + "total": 0, + "limit": 20, + "from": null, + "next": null + } + "#); +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/mod.rs b/crates/meilisearch/tests/upgrade/v1_12/mod.rs new file mode 100644 index 000000000..e84a0aa43 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/mod.rs @@ -0,0 +1 @@ +mod v1_12_0; diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap new file mode 100644 index 000000000..e836fa4b3 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap @@ -0,0 +1,78 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [ + "age", + "surname" + ], + "sortableAttributes": [ + "age" + ], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [ + "le", + "un" + ], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": { + "boubou": [ + "kefir" + ] + }, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 4, + "twoTypos": 9 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + }, + "faceting": { + "maxValuesPerFacet": 99, + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + }, + "pagination": { + "maxTotalHits": 15 + }, + "embedders": {}, + "searchCutoffMs": 8000, + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fra" + ] + } + ], + "facetSearch": true, + "prefixSearch": "indexingTime" +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_sort_and_filter.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_sort_and_filter.snap new file mode 100644 index 000000000..11ebae310 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_sort_and_filter.snap @@ -0,0 +1,25 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "hits": [ + { + "id": 1, + "name": "kefir", + "surname": [ + "kef", + "kefkef", + "kefirounet", + "boubou" + ], + "age": 1.4, + "description": "kefir est un petit chien blanc très mignon" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batch_by_batchUids_after_deletion.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batch_by_batchUids_after_deletion.snap new file mode 100644 index 000000000..3fbfb7c60 --- /dev/null +++
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batch_by_batchUids_after_deletion.snap @@ -0,0 +1,11 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [], + "total": 0, + "limit": 20, + "from": null, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..99caeaf96 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,506 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 24, + "progress": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "upgradeDatabase": 1 + }, + "indexUids": {}, + "progressTrace": "[progressTrace]" + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 23, + "progress": null, + "details": { + "deletedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.004146631S", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 22, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.102738497S", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 21, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.005108474S", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 20, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.027954894S", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 19, + "progress": null, + "details": { + "deletedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.006903297S", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + 
"documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.000481257S", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000407005S", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000403716S", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000417016S", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT12.086284842S", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "progress": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.011506614S", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "progress": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007640163S", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "progress": null, + "details": { + "searchCutoffMs": 8000 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007307840S", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007391353S", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": 
"2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007445825S", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.012020083S", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007440092S", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "progress": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007565161S", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + }, + { + "uid": 5, + "progress": null, + "details": { + "stopWords": [ + "le", + "un" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.016307263S", + "startedAt": "2025-01-16T16:53:19.913351957Z", + "finishedAt": "2025-01-16T16:53:19.92965922Z" + } + ], + "total": 23, + "limit": 20, + "from": 24, + "next": 4 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..99caeaf96 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,506 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 24, + "progress": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "upgradeDatabase": 1 + }, + "indexUids": {}, + "progressTrace": "[progressTrace]" + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 23, + "progress": null, + "details": { + "deletedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.004146631S", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + 
}, + { + "uid": 22, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.102738497S", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 21, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.005108474S", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 20, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.027954894S", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 19, + "progress": null, + "details": { + "deletedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.006903297S", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + "documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.000481257S", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000407005S", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000403716S", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000417016S", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + 
"types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT12.086284842S", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "progress": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.011506614S", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "progress": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007640163S", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "progress": null, + "details": { + "searchCutoffMs": 8000 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007307840S", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007391353S", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007445825S", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.012020083S", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007440092S", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "progress": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007565161S", + "startedAt": 
"2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + }, + { + "uid": 5, + "progress": null, + "details": { + "stopWords": [ + "le", + "un" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.016307263S", + "startedAt": "2025-01-16T16:53:19.913351957Z", + "finishedAt": "2025-01-16T16:53:19.92965922Z" + } + ], + "total": 23, + "limit": 20, + "from": 24, + "next": 4 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..99caeaf96 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,506 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 24, + "progress": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "upgradeDatabase": 1 + }, + "indexUids": {}, + "progressTrace": "[progressTrace]" + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 23, + "progress": null, + "details": { + "deletedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.004146631S", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 22, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.102738497S", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 21, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.005108474S", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 20, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.027954894S", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 19, + "progress": null, + "details": { + "deletedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.006903297S", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + 
"matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + "documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.000481257S", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000407005S", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000403716S", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000417016S", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT12.086284842S", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "progress": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.011506614S", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "progress": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007640163S", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "progress": null, + "details": { + "searchCutoffMs": 8000 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007307840S", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 
+ }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007391353S", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007445825S", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.012020083S", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007440092S", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "progress": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007565161S", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + }, + { + "uid": 5, + "progress": null, + "details": { + "stopWords": [ + "le", + "un" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.016307263S", + "startedAt": "2025-01-16T16:53:19.913351957Z", + "finishedAt": "2025-01-16T16:53:19.92965922Z" + } + ], + "total": 23, + "limit": 20, + "from": 24, + "next": 4 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_batchUids_equal_10.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_batchUids_equal_10.snap new file mode 100644 index 000000000..737b3ed06 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_batchUids_equal_10.snap @@ -0,0 +1,39 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 10, + "next": null +} diff --git 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..b6634ceaf --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,57 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.111055654S", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..b6634ceaf --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,57 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.111055654S", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..b6634ceaf --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,57 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "progress": null, + 
"details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.111055654S", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_canceledBy_equal_19.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_canceledBy_equal_19.snap new file mode 100644 index 000000000..5fbeb8a59 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_canceledBy_equal_19.snap @@ -0,0 +1,40 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + "documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 18, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_statuses_equal_canceled.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_statuses_equal_canceled.snap new file mode 100644 index 000000000..5fbeb8a59 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_statuses_equal_canceled.snap @@ -0,0 +1,40 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + "documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 18, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_uids_equal_10.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_uids_equal_10.snap new file mode 100644 index 000000000..737b3ed06 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_uids_equal_10.snap @@ -0,0 +1,39 @@ +--- +source: 
crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 10, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/task_by_batchUids_after_deletion.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/task_by_batchUids_after_deletion.snap new file mode 100644 index 000000000..3fbfb7c60 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/task_by_batchUids_after_deletion.snap @@ -0,0 +1,11 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [], + "total": 0, + "limit": 20, + "from": null, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..037aeccd6 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,397 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 25, + "batchUid": 24, + "indexUid": null, + "status": "succeeded", + "type": "upgradeDatabase", + "canceledBy": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 24, + "batchUid": 23, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 0 + }, + "error": null, + "duration": "PT0.004146631S", + "enqueuedAt": "2025-01-23T11:38:57.000009177Z", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 23, + "batchUid": 22, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.102738497S", + "enqueuedAt": "2025-01-23T11:36:22.53917994Z", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 22, + "batchUid": 21, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document doesn't have a `id` attribute: `{\"age\":1.4}`.", + "code": "missing_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_document_id" + }, + "duration": "PT0.005108474S", + "enqueuedAt": "2025-01-23T11:36:04.115475071Z", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + 
"uid": 21, + "batchUid": 20, + "indexUid": "mieli", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "The primary key inference failed as the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.", + "code": "index_primary_key_no_candidate_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_primary_key_no_candidate_found" + }, + "duration": "PT0.027954894S", + "enqueuedAt": "2025-01-23T11:35:53.625718309Z", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 20, + "batchUid": 19, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 19546 + }, + "error": null, + "duration": "PT0.006903297S", + "enqueuedAt": "2025-01-20T11:50:52.862223877Z", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 19, + "batchUid": 18, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.618121963Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0 + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.596815611Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "batchUid": 17, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000407005S", + "enqueuedAt": "2025-01-20T11:47:53.498618093Z", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "batchUid": 16, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000403716S", + "enqueuedAt": "2025-01-20T11:47:48.426597451Z", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "batchUid": 15, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "error": null, + "duration": "PT0.000417016S", + "enqueuedAt": "2025-01-20T11:47:42.414346511Z", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "batchUid": 14, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 19547, + 
"indexedDocuments": 19546 + }, + "error": null, + "duration": "PT12.086284842S", + "enqueuedAt": "2025-01-20T11:47:03.079292487Z", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "batchUid": 13, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "error": null, + "duration": "PT0.011506614S", + "enqueuedAt": "2025-01-16T17:18:43.280901282Z", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "batchUid": 12, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "error": null, + "duration": "PT0.007640163S", + "enqueuedAt": "2025-01-16T17:02:52.527382964Z", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "batchUid": 11, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchCutoffMs": 8000 + }, + "error": null, + "duration": "PT0.007307840S", + "enqueuedAt": "2025-01-16T17:01:14.100316617Z", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "PT0.007391353S", + "enqueuedAt": "2025-01-16T17:00:29.188815062Z", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "batchUid": 9, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "error": null, + "duration": "PT0.007445825S", + "enqueuedAt": "2025-01-16T17:00:15.759501709Z", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "batchUid": 8, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "error": null, + "duration": "PT0.012020083S", + "enqueuedAt": "2025-01-16T16:59:42.727292501Z", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "batchUid": 7, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "error": null, + "duration": "PT0.007440092S", + "enqueuedAt": "2025-01-16T16:58:41.203145044Z", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "batchUid": 6, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "error": null, + 
"duration": "PT0.007565161S", + "enqueuedAt": "2025-01-16T16:54:51.927866243Z", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + } + ], + "total": 24, + "limit": 20, + "from": 25, + "next": 5 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..037aeccd6 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,397 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 25, + "batchUid": 24, + "indexUid": null, + "status": "succeeded", + "type": "upgradeDatabase", + "canceledBy": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 24, + "batchUid": 23, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 0 + }, + "error": null, + "duration": "PT0.004146631S", + "enqueuedAt": "2025-01-23T11:38:57.000009177Z", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 23, + "batchUid": 22, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.102738497S", + "enqueuedAt": "2025-01-23T11:36:22.53917994Z", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 22, + "batchUid": 21, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document doesn't have a `id` attribute: `{\"age\":1.4}`.", + "code": "missing_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_document_id" + }, + "duration": "PT0.005108474S", + "enqueuedAt": "2025-01-23T11:36:04.115475071Z", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 21, + "batchUid": 20, + "indexUid": "mieli", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "The primary key inference failed as the engine did not find any field ending with `id` in its name. 
Please specify the primary key manually using the `primaryKey` query parameter.", + "code": "index_primary_key_no_candidate_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_primary_key_no_candidate_found" + }, + "duration": "PT0.027954894S", + "enqueuedAt": "2025-01-23T11:35:53.625718309Z", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 20, + "batchUid": 19, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 19546 + }, + "error": null, + "duration": "PT0.006903297S", + "enqueuedAt": "2025-01-20T11:50:52.862223877Z", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 19, + "batchUid": 18, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.618121963Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0 + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.596815611Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "batchUid": 17, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000407005S", + "enqueuedAt": "2025-01-20T11:47:53.498618093Z", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "batchUid": 16, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000403716S", + "enqueuedAt": "2025-01-20T11:47:48.426597451Z", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "batchUid": 15, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "error": null, + "duration": "PT0.000417016S", + "enqueuedAt": "2025-01-20T11:47:42.414346511Z", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "batchUid": 14, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "error": null, + "duration": "PT12.086284842S", + "enqueuedAt": "2025-01-20T11:47:03.079292487Z", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "batchUid": 13, + "indexUid": "kefir", + "status": "succeeded", + "type": 
"settingsUpdate", + "canceledBy": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "error": null, + "duration": "PT0.011506614S", + "enqueuedAt": "2025-01-16T17:18:43.280901282Z", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "batchUid": 12, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "error": null, + "duration": "PT0.007640163S", + "enqueuedAt": "2025-01-16T17:02:52.527382964Z", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "batchUid": 11, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchCutoffMs": 8000 + }, + "error": null, + "duration": "PT0.007307840S", + "enqueuedAt": "2025-01-16T17:01:14.100316617Z", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "PT0.007391353S", + "enqueuedAt": "2025-01-16T17:00:29.188815062Z", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "batchUid": 9, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "error": null, + "duration": "PT0.007445825S", + "enqueuedAt": "2025-01-16T17:00:15.759501709Z", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "batchUid": 8, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "error": null, + "duration": "PT0.012020083S", + "enqueuedAt": "2025-01-16T16:59:42.727292501Z", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "batchUid": 7, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "error": null, + "duration": "PT0.007440092S", + "enqueuedAt": "2025-01-16T16:58:41.203145044Z", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "batchUid": 6, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "error": null, + "duration": "PT0.007565161S", + "enqueuedAt": "2025-01-16T16:54:51.927866243Z", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + } + ], + "total": 24, + "limit": 20, + "from": 25, + "next": 5 +} diff --git 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..037aeccd6 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,397 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 25, + "batchUid": 24, + "indexUid": null, + "status": "succeeded", + "type": "upgradeDatabase", + "canceledBy": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 24, + "batchUid": 23, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 0 + }, + "error": null, + "duration": "PT0.004146631S", + "enqueuedAt": "2025-01-23T11:38:57.000009177Z", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 23, + "batchUid": 22, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.102738497S", + "enqueuedAt": "2025-01-23T11:36:22.53917994Z", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 22, + "batchUid": 21, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document doesn't have a `id` attribute: `{\"age\":1.4}`.", + "code": "missing_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_document_id" + }, + "duration": "PT0.005108474S", + "enqueuedAt": "2025-01-23T11:36:04.115475071Z", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 21, + "batchUid": 20, + "indexUid": "mieli", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "The primary key inference failed as the engine did not find any field ending with `id` in its name. 
Please specify the primary key manually using the `primaryKey` query parameter.", + "code": "index_primary_key_no_candidate_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_primary_key_no_candidate_found" + }, + "duration": "PT0.027954894S", + "enqueuedAt": "2025-01-23T11:35:53.625718309Z", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 20, + "batchUid": 19, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 19546 + }, + "error": null, + "duration": "PT0.006903297S", + "enqueuedAt": "2025-01-20T11:50:52.862223877Z", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 19, + "batchUid": 18, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.618121963Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0 + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.596815611Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "batchUid": 17, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000407005S", + "enqueuedAt": "2025-01-20T11:47:53.498618093Z", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "batchUid": 16, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000403716S", + "enqueuedAt": "2025-01-20T11:47:48.426597451Z", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "batchUid": 15, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "error": null, + "duration": "PT0.000417016S", + "enqueuedAt": "2025-01-20T11:47:42.414346511Z", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "batchUid": 14, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "error": null, + "duration": "PT12.086284842S", + "enqueuedAt": "2025-01-20T11:47:03.079292487Z", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "batchUid": 13, + "indexUid": "kefir", + "status": "succeeded", + "type": 
"settingsUpdate", + "canceledBy": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "error": null, + "duration": "PT0.011506614S", + "enqueuedAt": "2025-01-16T17:18:43.280901282Z", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "batchUid": 12, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "error": null, + "duration": "PT0.007640163S", + "enqueuedAt": "2025-01-16T17:02:52.527382964Z", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "batchUid": 11, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchCutoffMs": 8000 + }, + "error": null, + "duration": "PT0.007307840S", + "enqueuedAt": "2025-01-16T17:01:14.100316617Z", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "PT0.007391353S", + "enqueuedAt": "2025-01-16T17:00:29.188815062Z", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "batchUid": 9, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "error": null, + "duration": "PT0.007445825S", + "enqueuedAt": "2025-01-16T17:00:15.759501709Z", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "batchUid": 8, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "error": null, + "duration": "PT0.012020083S", + "enqueuedAt": "2025-01-16T16:59:42.727292501Z", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "batchUid": 7, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "error": null, + "duration": "PT0.007440092S", + "enqueuedAt": "2025-01-16T16:58:41.203145044Z", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "batchUid": 6, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "error": null, + "duration": "PT0.007565161S", + "enqueuedAt": "2025-01-16T16:54:51.927866243Z", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + } + ], + "total": 24, + "limit": 20, + "from": 25, + "next": 5 +} diff --git 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_batchUids_equal_10.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_batchUids_equal_10.snap new file mode 100644 index 000000000..9ca68d4f4 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_batchUids_equal_10.snap @@ -0,0 +1,33 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 10, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..d644e59f3 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,45 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "batchUid": 1, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "batchUid": 0, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.111055654S", + "enqueuedAt": "2025-01-16T16:45:16.003570092Z", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..d644e59f3 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeFinishedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,45 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "batchUid": 1, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "batchUid": 0, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + 
"details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.111055654S", + "enqueuedAt": "2025-01-16T16:45:16.003570092Z", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap new file mode 100644 index 000000000..d644e59f3 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_beforeStartedAt_equal_2025-01-16T16_47_41.snap @@ -0,0 +1,45 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 1, + "batchUid": 1, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 0, + "batchUid": 0, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.111055654S", + "enqueuedAt": "2025-01-16T16:45:16.003570092Z", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 2, + "limit": 20, + "from": 1, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_canceledBy_equal_19.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_canceledBy_equal_19.snap new file mode 100644 index 000000000..8e0249837 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_canceledBy_equal_19.snap @@ -0,0 +1,29 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 18, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_statuses_equal_canceled.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_statuses_equal_canceled.snap new file mode 100644 index 000000000..8e0249837 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_statuses_equal_canceled.snap @@ -0,0 +1,29 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + 
"indexedDocuments": 0 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 18, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_uids_equal_10.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_uids_equal_10.snap new file mode 100644 index 000000000..9ca68d4f4 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_uids_equal_10.snap @@ -0,0 +1,33 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + ], + "total": 1, + "limit": 20, + "from": 10, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap new file mode 100644 index 000000000..623c1f778 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -0,0 +1,624 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 24, + "progress": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "upgradeDatabase": 1 + }, + "indexUids": {}, + "progressTrace": "[progressTrace]" + }, + "duration": "[duration]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 23, + "progress": null, + "details": { + "deletedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.004146631S", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 22, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.102738497S", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 21, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.005108474S", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 20, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + 
"types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.027954894S", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 19, + "progress": null, + "details": { + "deletedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.006903297S", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 18, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0, + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 2, + "status": { + "succeeded": 1, + "canceled": 1 + }, + "types": { + "documentAdditionOrUpdate": 1, + "taskCancelation": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.000481257S", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000407005S", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000403716S", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "progress": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "taskCancelation": 1 + }, + "indexUids": {} + }, + "duration": "PT0.000417016S", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "progress": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT12.086284842S", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "progress": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.011506614S", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "progress": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": 
"PT0.007640163S", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "progress": null, + "details": { + "searchCutoffMs": 8000 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007307840S", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007391353S", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "progress": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007445825S", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.012020083S", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "progress": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007440092S", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "progress": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007565161S", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + }, + { + "uid": 5, + "progress": null, + "details": { + "stopWords": [ + "le", + "un" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.016307263S", + "startedAt": "2025-01-16T16:53:19.913351957Z", + "finishedAt": "2025-01-16T16:53:19.92965922Z" + }, + { + "uid": 4, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.087655941S", + "startedAt": "2025-01-16T16:52:32.631145531Z", + "finishedAt": "2025-01-16T16:52:32.718801472Z" + }, + { + "uid": 3, + "progress": null, + "details": { + "sortableAttributes": [ + "age" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": 
{ + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.007593573S", + "startedAt": "2025-01-16T16:47:53.677901409Z", + "finishedAt": "2025-01-16T16:47:53.685494982Z" + }, + { + "uid": 2, + "progress": null, + "details": { + "filterableAttributes": [ + "age", + "surname" + ] + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.017769760S", + "startedAt": "2025-01-16T16:47:41.211587682Z", + "finishedAt": "2025-01-16T16:47:41.229357442Z" + }, + { + "uid": 1, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.066095506S", + "startedAt": "2025-01-16T16:47:10.217299609Z", + "finishedAt": "2025-01-16T16:47:10.283395115Z" + }, + { + "uid": 0, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.111055654S", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 25, + "limit": 1000, + "from": 24, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap new file mode 100644 index 000000000..8efd2a55e --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap @@ -0,0 +1,504 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +{ + "results": [ + { + "uid": 25, + "batchUid": 24, + "indexUid": null, + "status": "succeeded", + "type": "upgradeDatabase", + "canceledBy": null, + "details": { + "upgradeFrom": "v1.12.0", + "upgradeTo": "v1.14.0" + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + }, + { + "uid": 24, + "batchUid": 23, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 0 + }, + "error": null, + "duration": "PT0.004146631S", + "enqueuedAt": "2025-01-23T11:38:57.000009177Z", + "startedAt": "2025-01-23T11:38:57.012591321Z", + "finishedAt": "2025-01-23T11:38:57.016737952Z" + }, + { + "uid": 23, + "batchUid": 22, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.102738497S", + "enqueuedAt": "2025-01-23T11:36:22.53917994Z", + "startedAt": "2025-01-23T11:36:22.551906856Z", + "finishedAt": "2025-01-23T11:36:22.654645353Z" + }, + { + "uid": 22, + "batchUid": 21, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Document doesn't have a `id` attribute: `{\"age\":1.4}`.", + 
"code": "missing_document_id", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#missing_document_id" + }, + "duration": "PT0.005108474S", + "enqueuedAt": "2025-01-23T11:36:04.115475071Z", + "startedAt": "2025-01-23T11:36:04.132670526Z", + "finishedAt": "2025-01-23T11:36:04.137779Z" + }, + { + "uid": 21, + "batchUid": 20, + "indexUid": "mieli", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "The primary key inference failed as the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.", + "code": "index_primary_key_no_candidate_found", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#index_primary_key_no_candidate_found" + }, + "duration": "PT0.027954894S", + "enqueuedAt": "2025-01-23T11:35:53.625718309Z", + "startedAt": "2025-01-23T11:35:53.631082795Z", + "finishedAt": "2025-01-23T11:35:53.659037689Z" + }, + { + "uid": 20, + "batchUid": 19, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 19546 + }, + "error": null, + "duration": "PT0.006903297S", + "enqueuedAt": "2025-01-20T11:50:52.862223877Z", + "startedAt": "2025-01-20T11:50:52.874106134Z", + "finishedAt": "2025-01-20T11:50:52.881009431Z" + }, + { + "uid": 19, + "batchUid": 18, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 1, + "canceledTasks": 1, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.618121963Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 18, + "batchUid": 18, + "indexUid": "mieli", + "status": "canceled", + "type": "documentAdditionOrUpdate", + "canceledBy": 19, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 0 + }, + "error": null, + "duration": "PT0.000481257S", + "enqueuedAt": "2025-01-20T11:48:04.596815611Z", + "startedAt": "2025-01-20T11:48:04.92820416Z", + "finishedAt": "2025-01-20T11:48:04.928685417Z" + }, + { + "uid": 17, + "batchUid": 17, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000407005S", + "enqueuedAt": "2025-01-20T11:47:53.498618093Z", + "startedAt": "2025-01-20T11:47:53.509403957Z", + "finishedAt": "2025-01-20T11:47:53.509810962Z" + }, + { + "uid": 16, + "batchUid": 16, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing%2Cenqueued" + }, + "error": null, + "duration": "PT0.000403716S", + "enqueuedAt": "2025-01-20T11:47:48.426597451Z", + "startedAt": "2025-01-20T11:47:48.430653005Z", + "finishedAt": "2025-01-20T11:47:48.431056721Z" + }, + { + "uid": 15, + "batchUid": 15, + "indexUid": null, + "status": "succeeded", + "type": "taskCancelation", + "canceledBy": null, + "details": { + "matchedTasks": 0, + "canceledTasks": 0, + "originalFilter": "?statuses=processing" + }, + "error": null, + "duration": "PT0.000417016S", + "enqueuedAt": 
"2025-01-20T11:47:42.414346511Z", + "startedAt": "2025-01-20T11:47:42.429678617Z", + "finishedAt": "2025-01-20T11:47:42.430095633Z" + }, + { + "uid": 14, + "batchUid": 14, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 19547, + "indexedDocuments": 19546 + }, + "error": null, + "duration": "PT12.086284842S", + "enqueuedAt": "2025-01-20T11:47:03.079292487Z", + "startedAt": "2025-01-20T11:47:03.092181576Z", + "finishedAt": "2025-01-20T11:47:15.178466418Z" + }, + { + "uid": 13, + "batchUid": 13, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "localizedAttributes": [ + { + "attributePatterns": [ + "description" + ], + "locales": [ + "fr" + ] + } + ] + }, + "error": null, + "duration": "PT0.011506614S", + "enqueuedAt": "2025-01-16T17:18:43.280901282Z", + "startedAt": "2025-01-16T17:18:43.29334923Z", + "finishedAt": "2025-01-16T17:18:43.304855844Z" + }, + { + "uid": 12, + "batchUid": 12, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "sortFacetValuesBy": { + "*": "alpha", + "age": "count" + } + } + }, + "error": null, + "duration": "PT0.007640163S", + "enqueuedAt": "2025-01-16T17:02:52.527382964Z", + "startedAt": "2025-01-16T17:02:52.539749853Z", + "finishedAt": "2025-01-16T17:02:52.547390016Z" + }, + { + "uid": 11, + "batchUid": 11, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchCutoffMs": 8000 + }, + "error": null, + "duration": "PT0.007307840S", + "enqueuedAt": "2025-01-16T17:01:14.100316617Z", + "startedAt": "2025-01-16T17:01:14.112756687Z", + "finishedAt": "2025-01-16T17:01:14.120064527Z" + }, + { + "uid": 10, + "batchUid": 10, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 99 + }, + "pagination": { + "maxTotalHits": 15 + } + }, + "error": null, + "duration": "PT0.007391353S", + "enqueuedAt": "2025-01-16T17:00:29.188815062Z", + "startedAt": "2025-01-16T17:00:29.201180268Z", + "finishedAt": "2025-01-16T17:00:29.208571621Z" + }, + { + "uid": 9, + "batchUid": 9, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "faceting": { + "maxValuesPerFacet": 100 + }, + "pagination": { + "maxTotalHits": 1000 + } + }, + "error": null, + "duration": "PT0.007445825S", + "enqueuedAt": "2025-01-16T17:00:15.759501709Z", + "startedAt": "2025-01-16T17:00:15.77629445Z", + "finishedAt": "2025-01-16T17:00:15.783740275Z" + }, + { + "uid": 8, + "batchUid": 8, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + }, + "disableOnWords": [ + "kefir" + ], + "disableOnAttributes": [ + "surname" + ] + } + }, + "error": null, + "duration": "PT0.012020083S", + "enqueuedAt": "2025-01-16T16:59:42.727292501Z", + "startedAt": "2025-01-16T16:59:42.744086671Z", + "finishedAt": "2025-01-16T16:59:42.756106754Z" + }, + { + "uid": 7, + "batchUid": 7, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "typoTolerance": { + "minWordSizeForTypos": { + "oneTypo": 4 + } + } + }, + "error": null, + "duration": "PT0.007440092S", + "enqueuedAt": 
"2025-01-16T16:58:41.203145044Z", + "startedAt": "2025-01-16T16:58:41.2155771Z", + "finishedAt": "2025-01-16T16:58:41.223017192Z" + }, + { + "uid": 6, + "batchUid": 6, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "synonyms": { + "boubou": [ + "kefir" + ] + } + }, + "error": null, + "duration": "PT0.007565161S", + "enqueuedAt": "2025-01-16T16:54:51.927866243Z", + "startedAt": "2025-01-16T16:54:51.940332781Z", + "finishedAt": "2025-01-16T16:54:51.947897942Z" + }, + { + "uid": 5, + "batchUid": 5, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "stopWords": [ + "le", + "un" + ] + }, + "error": null, + "duration": "PT0.016307263S", + "enqueuedAt": "2025-01-16T16:53:19.900781991Z", + "startedAt": "2025-01-16T16:53:19.913351957Z", + "finishedAt": "2025-01-16T16:53:19.92965922Z" + }, + { + "uid": 4, + "batchUid": 4, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.087655941S", + "enqueuedAt": "2025-01-16T16:52:32.618659861Z", + "startedAt": "2025-01-16T16:52:32.631145531Z", + "finishedAt": "2025-01-16T16:52:32.718801472Z" + }, + { + "uid": 3, + "batchUid": 3, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "sortableAttributes": [ + "age" + ] + }, + "error": null, + "duration": "PT0.007593573S", + "enqueuedAt": "2025-01-16T16:47:53.665616298Z", + "startedAt": "2025-01-16T16:47:53.677901409Z", + "finishedAt": "2025-01-16T16:47:53.685494982Z" + }, + { + "uid": 2, + "batchUid": 2, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "filterableAttributes": [ + "age", + "surname" + ] + }, + "error": null, + "duration": "PT0.017769760S", + "enqueuedAt": "2025-01-16T16:47:41.194872913Z", + "startedAt": "2025-01-16T16:47:41.211587682Z", + "finishedAt": "2025-01-16T16:47:41.229357442Z" + }, + { + "uid": 1, + "batchUid": 1, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.066095506S", + "enqueuedAt": "2025-01-16T16:47:10.200537203Z", + "startedAt": "2025-01-16T16:47:10.217299609Z", + "finishedAt": "2025-01-16T16:47:10.283395115Z" + }, + { + "uid": 0, + "batchUid": 0, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.111055654S", + "enqueuedAt": "2025-01-16T16:45:16.003570092Z", + "startedAt": "2025-01-16T16:45:16.020248085Z", + "finishedAt": "2025-01-16T16:45:16.131303739Z" + } + ], + "total": 26, + "limit": 1000, + "from": 25, + "next": null +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys.snap new file mode 100644 index 000000000..de7e5ffed --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys.snap @@ -0,0 +1,42 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "name": "Kefir", + 
"description": "My little kefirino key", + "key": "760c6345918b5ab1d251c1a3e8f9666547628a710d91f6b1d558ba944ef15746", + "uid": "9a77a636-e4e2-4f1a-93ac-978c368fd596", + "actions": [ + "stats.get", + "documents.*" + ], + "indexes": [ + "kefir" + ], + "expiresAt": null, + "createdAt": "2025-01-16T14:43:20.863318893Z", + "updatedAt": "[date]" + }, + { + "name": "Default Search API Key", + "description": "Use it to search from the frontend", + "key": "4d9376547ed779a05dde416148e7e98bd47530e28c500be674c9e60b2accb814", + "uid": "dc699ff0-a053-4956-a46a-912e51b3316b", + "actions": [ + "search" + ], + "indexes": [ + "*" + ], + "expiresAt": null, + "createdAt": "2025-01-16T14:24:46.264041777Z", + "updatedAt": "[date]" + } + ], + "offset": 0, + "limit": 20, + "total": 2 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys_after_removing_kefir.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys_after_removing_kefir.snap new file mode 100644 index 000000000..bdc0d9b0a --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_keys/list_all_keys_after_removing_kefir.snap @@ -0,0 +1,26 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +snapshot_kind: text +--- +{ + "results": [ + { + "name": "kefir", + "description": "the patou", + "key": "4d9376547ed779a05dde416148e7e98bd47530e28c500be674c9e60b2accb814", + "uid": "dc699ff0-a053-4956-a46a-912e51b3316b", + "actions": [ + "search" + ], + "indexes": [ + "*" + ], + "expiresAt": null, + "createdAt": "2025-01-16T14:24:46.264041777Z", + "updatedAt": "[date]" + } + ], + "offset": 0, + "limit": 20, + "total": 1 +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/VERSION b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/VERSION new file mode 100644 index 000000000..32bd932f3 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/VERSION @@ -0,0 +1 @@ +1.12.0 \ No newline at end of file diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/data.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/data.mdb new file mode 100644 index 000000000..421b40a8c Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/data.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb new file mode 100644 index 000000000..4c80ffe2c Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb new file mode 100644 index 000000000..c31db3415 Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb new file mode 100644 index 000000000..c99608b77 Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/instance-uid 
b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/instance-uid new file mode 100644 index 000000000..1584db263 --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/instance-uid @@ -0,0 +1 @@ +34ebe3d9-306d-4ead-885d-65d33b4bd60a \ No newline at end of file diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb new file mode 100644 index 000000000..226be2332 Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb new file mode 100644 index 000000000..6d38eab08 Binary files /dev/null and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs new file mode 100644 index 000000000..11ba2882a --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -0,0 +1,291 @@ +// This test is the first test of the dumpless upgrade. +// It must test pretty much all the features of Meilisearch because the other tests will only test +// the new features they introduced. + +use insta::assert_json_snapshot; +use manifest_dir_macros::exist_relative_path; +use meili_snap::{json_string, snapshot}; +use meilisearch::Opt; + +use crate::common::{default_settings, Server, Value}; +use crate::json; +use crate::upgrade::copy_dir_all; + +#[actix_rt::test] +async fn import_v1_12_0() { + let temp = tempfile::tempdir().unwrap(); + let original_db_path = exist_relative_path!("tests/upgrade/v1_12/v1_12_0.ms"); + let options = Opt { + experimental_dumpless_upgrade: true, + master_key: Some("kefir".to_string()), + ..default_settings(temp.path()) + }; + copy_dir_all(original_db_path, &options.db_path).unwrap(); + let mut server = Server::new_with_options(options).await.unwrap(); + server.use_api_key("kefir"); + + check_the_keys(&server).await; + check_the_index_scheduler(&server).await; + check_the_index_features(&server).await; +} + +/// We must ensure that the keys database is still working: +/// 1. Check its content +/// 2. Ensure we can still query the keys +/// 3.
Ensure we can still update the keys +async fn check_the_keys(server: &Server) { + // All the API keys are still present + let (keys, _) = server.list_api_keys("").await; + snapshot!(json_string!(keys, { ".results[].updatedAt" => "[date]" }), name: "list_all_keys"); + + // We can still query the keys + let (by_uid, _) = server.get_api_key("9a77a636-e4e2-4f1a-93ac-978c368fd596").await; + let (by_key, _) = server + .get_api_key("760c6345918b5ab1d251c1a3e8f9666547628a710d91f6b1d558ba944ef15746") + .await; + + assert_eq!(by_uid, by_key); + snapshot!(json_string!(by_uid, { ".updatedAt" => "[date]" }), @r#" + { + "name": "Kefir", + "description": "My little kefirino key", + "key": "760c6345918b5ab1d251c1a3e8f9666547628a710d91f6b1d558ba944ef15746", + "uid": "9a77a636-e4e2-4f1a-93ac-978c368fd596", + "actions": [ + "stats.get", + "documents.*" + ], + "indexes": [ + "kefir" + ], + "expiresAt": null, + "createdAt": "2025-01-16T14:43:20.863318893Z", + "updatedAt": "[date]" + } + "#); + + // Remove a key + let (_value, status) = server.delete_api_key("9a77a636-e4e2-4f1a-93ac-978c368fd596").await; + snapshot!(status, @"204 No Content"); + + // Update a key + let (value, _) = server + .patch_api_key( + "dc699ff0-a053-4956-a46a-912e51b3316b", + json!({ "name": "kefir", "description": "the patou" }), + ) + .await; + snapshot!(json_string!(value, { ".updatedAt" => "[date]" }), @r#" + { + "name": "kefir", + "description": "the patou", + "key": "4d9376547ed779a05dde416148e7e98bd47530e28c500be674c9e60b2accb814", + "uid": "dc699ff0-a053-4956-a46a-912e51b3316b", + "actions": [ + "search" + ], + "indexes": [ + "*" + ], + "expiresAt": null, + "createdAt": "2025-01-16T14:24:46.264041777Z", + "updatedAt": "[date]" + } + "#); + + // Everything worked + let (keys, _) = server.list_api_keys("").await; + snapshot!(json_string!(keys, { ".results[].updatedAt" => "[date]" }), name: "list_all_keys_after_removing_kefir"); +} + +/// We must ensure the index-scheduler database is still working: +/// 1. We can query the indexes and their metadata +/// 2. The upgrade task has been spawned and has been processed (wait for it to finish or it'll be flaky) +/// 3. Snapshot the whole queue; the tasks and batches should always be the same after the upgrade +/// 4. Query the batches and tasks on all filters => the databases should still work +/// 5. Ensure we can still update the queue +/// 5.1. Delete tasks until a batch is removed +/// 5.2. Enqueue a new task +/// 5.3.
Create an index +async fn check_the_index_scheduler(server: &Server) { + // Wait until the upgrade has been applied to all indexes to avoid flakyness + let (tasks, _) = server.tasks_filter("types=upgradeDatabase&limit=1").await; + server.wait_task(Value(tasks["results"][0].clone()).uid()).await.succeeded(); + + // All the indexes are still present + let (indexes, _) = server.list_indexes(None, None).await; + snapshot!(indexes, @r#" + { + "results": [ + { + "uid": "kefir", + "createdAt": "2025-01-16T16:45:16.020663157Z", + "updatedAt": "2025-01-23T11:36:22.634859166Z", + "primaryKey": "id" + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "#); + // And their metadata are still right + let (stats, _) = server.stats().await; + assert_json_snapshot!(stats, { + ".databaseSize" => "[bytes]", + ".usedDatabaseSize" => "[bytes]" + }, + @r###" + { + "databaseSize": "[bytes]", + "usedDatabaseSize": "[bytes]", + "lastUpdate": "2025-01-23T11:36:22.634859166Z", + "indexes": { + "kefir": { + "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "age": 1, + "description": 1, + "id": 1, + "name": 1, + "surname": 1 + } + } + } + } + "###); + + // Tasks and batches should still work + // We rewrite the first task for all calls because it may be the upgrade database with unknown dates and duration. + // The other tasks should NOT change + let (tasks, _) = server.tasks_filter("limit=1000").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "the_whole_task_queue_once_everything_has_been_processed"); + let (batches, _) = server.batches_filter("limit=1000").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); + + // Tests all the tasks query parameters + let (tasks, _) = server.tasks_filter("uids=10").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_uids_equal_10"); + let (tasks, _) = server.tasks_filter("batchUids=10").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_batchUids_equal_10"); + let (tasks, _) = server.tasks_filter("statuses=canceled").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_statuses_equal_canceled"); + // types has already been tested above to retrieve the upgrade database + let (tasks, _) = server.tasks_filter("canceledBy=19").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_canceledBy_equal_19"); + let 
(tasks, _) = server.tasks_filter("beforeEnqueuedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); + let (tasks, _) = server.tasks_filter("afterEnqueuedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); + let (tasks, _) = server.tasks_filter("beforeStartedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); + let (tasks, _) = server.tasks_filter("afterStartedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41"); + let (tasks, _) = server.tasks_filter("beforeFinishedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); + let (tasks, _) = server.tasks_filter("afterFinishedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); + + // Tests all the batches query parameters + let (batches, _) = server.batches_filter("uids=10").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_uids_equal_10"); + let (batches, _) = server.batches_filter("batchUids=10").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_batchUids_equal_10"); + let (batches, _) = server.batches_filter("statuses=canceled").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_statuses_equal_canceled"); + // types has already been tested above to retrieve the upgrade database + let (batches, _) = server.batches_filter("canceledBy=19").await; + 
snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_canceledBy_equal_19"); + let (batches, _) = server.batches_filter("beforeEnqueuedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); + let (batches, _) = server.batches_filter("afterEnqueuedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); + let (batches, _) = server.batches_filter("beforeStartedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); + let (batches, _) = server.batches_filter("afterStartedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterStartedAt_equal_2025-01-16T16_47_41"); + let (batches, _) = server.batches_filter("beforeFinishedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); + let (batches, _) = server.batches_filter("afterFinishedAt=2025-01-16T16:47:41Z").await; + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); + + let (stats, _) = server.stats().await; + assert_json_snapshot!(stats, { + ".databaseSize" => "[bytes]", + ".usedDatabaseSize" => "[bytes]" + }, + @r###" + { + "databaseSize": "[bytes]", + "usedDatabaseSize": "[bytes]", + "lastUpdate": "2025-01-23T11:36:22.634859166Z", + "indexes": { + "kefir": { + 
"numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "age": 1, + "description": 1, + "id": 1, + "name": 1, + "surname": 1 + } + } + } + } + "###); + let index = server.index("kefir"); + let (stats, _) = index.stats().await; + snapshot!(stats, @r###" + { + "numberOfDocuments": 1, + "rawDocumentDbSize": 109, + "avgDocumentSize": 109, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "age": 1, + "description": 1, + "id": 1, + "name": 1, + "surname": 1 + } + } + "###); + + // Delete all the tasks of a specific batch + let (task, _) = server.delete_tasks("batchUids=10").await; + server.wait_task(task.uid()).await.succeeded(); + + let (tasks, _) = server.tasks_filter("batchUids=10").await; + snapshot!(tasks, name: "task_by_batchUids_after_deletion"); + let (tasks, _) = server.batches_filter("batchUids=10").await; + snapshot!(tasks, name: "batch_by_batchUids_after_deletion"); + + let index = server.index("kefirausaurus"); + let (task, _) = index.create(Some("kefid")).await; + server.wait_task(task.uid()).await.succeeded(); +} + +/// Ensuring the index roughly works with filter and sort. +/// More specific test will be made for the next versions everytime they updates a feature +async fn check_the_index_features(server: &Server) { + let kefir = server.index("kefir"); + + let (settings, _) = kefir.settings().await; + snapshot!(settings, name: "kefir_settings"); + + let (results, _status) = + kefir.search_post(json!({ "sort": ["age:asc"], "filter": "surname = kefirounet" })).await; + snapshot!(results, name: "search_with_sort_and_filter"); +} diff --git a/crates/meilisearch/tests/vector/binary_quantized.rs b/crates/meilisearch/tests/vector/binary_quantized.rs index 560c4e2f2..96e32c1a3 100644 --- a/crates/meilisearch/tests/vector/binary_quantized.rs +++ b/crates/meilisearch/tests/vector/binary_quantized.rs @@ -8,17 +8,6 @@ use crate::vector::generate_default_user_provided_documents; async fn retrieve_binary_quantize_status_in_the_settings() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -35,7 +24,7 @@ async fn retrieve_binary_quantize_status_in_the_settings() { let (settings, code) = index.settings().await; snapshot!(code, @"200 OK"); - snapshot!(settings["embedders"]["manual"], @r###"{"source":"userProvided","dimensions":3}"###); + snapshot!(settings["embedders"]["manual"], @r#"{"source":"userProvided","dimensions":3}"#); let (response, code) = index .update_settings(json!({ @@ -53,7 +42,7 @@ async fn retrieve_binary_quantize_status_in_the_settings() { let (settings, code) = index.settings().await; snapshot!(code, @"200 OK"); - snapshot!(settings["embedders"]["manual"], @r###"{"source":"userProvided","dimensions":3,"binaryQuantized":false}"###); + snapshot!(settings["embedders"]["manual"], @r#"{"source":"userProvided","dimensions":3,"binaryQuantized":false}"#); let (response, code) = index .update_settings(json!({ @@ -71,24 +60,13 @@ async fn retrieve_binary_quantize_status_in_the_settings() { let (settings, code) = index.settings().await; 
snapshot!(code, @"200 OK"); - snapshot!(settings["embedders"]["manual"], @r###"{"source":"userProvided","dimensions":3,"binaryQuantized":true}"###); + snapshot!(settings["embedders"]["manual"], @r#"{"source":"userProvided","dimensions":3,"binaryQuantized":true}"#); } #[actix_rt::test] async fn binary_quantize_before_sending_documents() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -163,17 +141,6 @@ async fn binary_quantize_before_sending_documents() { async fn binary_quantize_after_sending_documents() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -261,17 +228,6 @@ async fn binary_quantize_after_sending_documents() { async fn try_to_disable_binary_quantization() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -300,7 +256,7 @@ async fn try_to_disable_binary_quantization() { .await; snapshot!(code, @"202 Accepted"); let ret = server.wait_task(response.uid()).await; - snapshot!(ret, @r###" + snapshot!(ret, @r#" { "uid": "[uid]", "batchUid": "[batch_uid]", @@ -318,7 +274,7 @@ async fn try_to_disable_binary_quantization() { } }, "error": { - "message": "`.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", + "message": "Index `doggo`: `.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" @@ -328,7 +284,7 @@ async fn try_to_disable_binary_quantization() { "startedAt": "[date]", "finishedAt": "[date]" } - "###); + "#); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index bb20d7b2a..2a7038fcb 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -1,4 +1,6 @@ mod binary_quantized; +#[cfg(feature = "test-ollama")] +mod ollama; mod openai; mod rest; mod settings; @@ -13,36 +15,13 @@ use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; async fn get_server_vector() -> Server { - let server = Server::new().await; - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, 
@"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); - server + Server::new().await } #[actix_rt::test] async fn add_remove_user_provided() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -55,7 +34,7 @@ async fn add_remove_user_provided() { })) .await; snapshot!(code, @"202 Accepted"); - server.wait_task(response.uid()).await; + server.wait_task(response.uid()).await.succeeded(); let documents = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, @@ -63,7 +42,7 @@ async fn add_remove_user_provided() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) @@ -116,12 +95,12 @@ async fn add_remove_user_provided() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -155,16 +134,16 @@ async fn add_remove_user_provided() { "limit": 20, "total": 2 } - "###); + "#); let (value, code) = index.delete_document(0).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -182,22 +161,102 @@ async fn add_remove_user_provided() { "limit": 20, "total": 1 } + "#); +} + +#[actix_rt::test] +async fn user_provide_mismatched_embedding_dimension() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass + let new_document = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, + ]); + let (response, code) = index.add_documents(new_document, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ], + [ + 1.0, + 2.0, + 2.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } "###); } async fn generate_default_user_provided_documents(server: &Server) -> Index { let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -221,7 +280,7 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); index } @@ -250,7 +309,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -280,7 +339,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -311,7 +370,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. 
Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -340,7 +399,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -369,7 +428,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -398,7 +457,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -440,7 +499,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -469,7 +528,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. 
Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -498,7 +557,7 @@ async fn user_provided_embeddings_error() { "indexedDocuments": 0 }, "error": { - "message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", + "message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`", "code": "invalid_vectors_type", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" @@ -539,7 +598,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -569,7 +628,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -599,7 +658,7 @@ async fn user_provided_vectors_error() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", + "message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -618,7 +677,7 @@ async fn clear_documents() { let index = generate_default_user_provided_documents(&server).await; let (value, _code) = index.clear_all_documents().await; - 
index.wait_task(value.uid()).await; + index.wait_task(value.uid()).await.succeeded(); // Make sure the documents DB has been cleared let (documents, _code) = index @@ -654,17 +713,6 @@ async fn add_remove_one_vector_4588() { // https://github.com/meilisearch/meilisearch/issues/4588 let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -721,7 +769,7 @@ async fn add_remove_one_vector_4588() { let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -739,5 +787,5 @@ async fn add_remove_one_vector_4588() { "limit": 20, "total": 1 } - "###); + "#); } diff --git a/crates/meilisearch/tests/vector/ollama.rs b/crates/meilisearch/tests/vector/ollama.rs new file mode 100644 index 000000000..eb80758df --- /dev/null +++ b/crates/meilisearch/tests/vector/ollama.rs @@ -0,0 +1,733 @@ +//! Tests Ollama embedders against the server at the location given by the `MEILI_TEST_OLLAMA_SERVER` environment variable. + +use std::env::VarError; + +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Value}; +use crate::json; +use crate::vector::get_server_vector; + +pub enum Endpoint { + /// Deprecated, undocumented endpoint + Embeddings, + /// Current endpoint + Embed, +} + +impl Endpoint { + fn suffix(&self) -> &'static str { + match self { + Endpoint::Embeddings => "/api/embeddings", + Endpoint::Embed => "/api/embed", + } + } +} + +pub enum Model { + Nomic, + AllMinilm, +} + +impl Model { + fn name(&self) -> &'static str { + match self { + Model::Nomic => "nomic-embed-text", + Model::AllMinilm => "all-minilm", + } + } +} + +const DOGGO_TEMPLATE: &str = r#"{%- if doc.gender == "F" -%}Une chienne nommée {{doc.name}}, née en {{doc.birthyear}} + {%- else -%} + Un chien nommé {{doc.name}}, né en {{doc.birthyear}} + {%- endif %}, de race {{doc.breed}}."#; + +fn create_ollama_config_with_template( + document_template: &str, + model: Model, + endpoint: Endpoint, +) -> Option<Value> { + let ollama_base_url = match std::env::var("MEILI_TEST_OLLAMA_SERVER") { + Ok(ollama_base_url) => ollama_base_url, + Err(VarError::NotPresent) => return None, + Err(VarError::NotUnicode(s)) => panic!( + "`MEILI_TEST_OLLAMA_SERVER` was not properly utf-8, `{:?}`", + s.as_encoded_bytes() + ), + }; + + Some(json!({ + "source": "ollama", + "url": format!("{ollama_base_url}{}", endpoint.suffix()), + "documentTemplate": document_template, + "documentTemplateMaxBytes": 8000000, + "model": model.name() + })) +} + +#[actix_rt::test] +async fn test_both_apis() { + let Some(embed_settings) = + create_ollama_config_with_template(DOGGO_TEMPLATE, Model::AllMinilm, Endpoint::Embed) + else { + panic!("Missing `MEILI_TEST_OLLAMA_SERVER` environment variable, skipping `test_both_apis` test."); + }; + + let Some(embeddings_settings) = + create_ollama_config_with_template(DOGGO_TEMPLATE, Model::AllMinilm, Endpoint::Embeddings) + else { + return; + }; + + let Some(nomic_embed_settings) = + create_ollama_config_with_template(DOGGO_TEMPLATE, Model::Nomic, Endpoint::Embed) + else { + return; + }; + + let
Some(nomic_embeddings_settings) = + create_ollama_config_with_template(DOGGO_TEMPLATE, Model::Nomic, Endpoint::Embeddings) + else { + return; + }; + + let server = get_server_vector().await; + + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "embed": embed_settings, + "embeddings": embeddings_settings, + "nomic_embed": nomic_embed_settings, + "nomic_embeddings": nomic_embeddings_settings, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task["status"], @r###""succeeded""###); + let documents = json!([ + {"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"}, + {"id": 1, "name": "Intel", "gender": "M", "birthyear": 2011, "breed": "Beagle"}, + {"id": 2, "name": "VĆ©nus", "gender": "F", "birthyear": 2003, "breed": "Jack Russel Terrier"}, + {"id": 3, "name": "Max", "gender": "M", "birthyear": 1995, "breed": "Labrador Retriever"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 4, + "indexedDocuments": 4 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents, {".results.*._vectors.*.embeddings" => "[vector]"}), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou", + "_vectors": { + "embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "embeddings": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embeddings": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle", + "_vectors": { + "embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "embeddings": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embeddings": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier", + "_vectors": { + "embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "embeddings": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embeddings": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever", + "_vectors": { + "embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "embeddings": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embed": { + "embeddings": "[vector]", + "regenerate": true + }, + "nomic_embeddings": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); + + let (response, code) = index + 
.search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0, "embedder": "embed"}, + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0, "embedder": "embeddings"}, + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0, "embedder": "embed"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0, "embedder": "embeddings"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0, "embedder": "embed"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0, "embedder": "embeddings"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), 
@r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embed"}, + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "chien de chasse", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embeddings"}, + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embed"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "petit chien", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embeddings"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embed"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": 
"Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + } + ] + "###); + + let (response, code) = index + .search_post(json!({ + "q": "grand chien de berger des montagnes", + "hybrid": {"semanticRatio": 1.0, "embedder": "nomic_embeddings"} + })) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "name": "kefir", + "gender": "M", + "birthyear": 2023, + "breed": "Patou" + }, + { + "id": 3, + "name": "Max", + "gender": "M", + "birthyear": 1995, + "breed": "Labrador Retriever" + }, + { + "id": 2, + "name": "VĆ©nus", + "gender": "F", + "birthyear": 2003, + "breed": "Jack Russel Terrier" + }, + { + "id": 1, + "name": "Intel", + "gender": "M", + "birthyear": 2011, + "breed": "Beagle" + } + ] + "###); +} diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 99aa1f710..4ae8cb041 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -713,7 +713,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -757,7 +757,7 @@ async fn bad_api_key() { } }, "error": { - "message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", + "message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. 
Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1995,7 +1995,7 @@ async fn timeout() { let (response, code) = index .search_post(json!({ - "q": "grand chien de berger des montagnes", + "q": "grand chien de berger des montagnes foil the cache", "hybrid": {"semanticRatio": 0.99, "embedder": "default"} })) .await; diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index cadc54f24..82fc71b26 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -916,7 +916,7 @@ async fn bad_settings() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "`.embedders.rest`: Missing field `request` (note: this field is mandatory for source rest)", + "message": "`.embedders.rest`: Missing field `request` (note: this field is mandatory for source `rest`)", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" @@ -933,7 +933,7 @@ async fn bad_settings() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "`.embedders.rest`: Missing field `response` (note: this field is mandatory for source rest)", + "message": "`.embedders.rest`: Missing field `response` (note: this field is mandatory for source `rest`)", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" @@ -985,7 +985,7 @@ async fn bad_settings() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1025,7 +1025,7 @@ async fn bad_settings() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1178,7 
+1178,7 @@ async fn server_returns_bad_request() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1247,7 +1247,7 @@ async fn server_returns_bad_request() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1306,7 +1306,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1362,7 +1362,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", + "message": "Index `doggo`: Error 
while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1414,7 +1414,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1478,7 +1478,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1542,7 +1542,7 @@ async fn server_returns_bad_response() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?", "code": 
"vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1908,7 +1908,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -1951,7 +1951,7 @@ async fn server_custom_header() { } }, "error": { - "message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", + "message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -2099,7 +2099,7 @@ async fn searchable_reindex() { ] }, "error": { - "message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", + "message": "Index `doggo`: While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index 027c55219..50253f930 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -8,27 +8,16 @@ use crate::vector::generate_default_user_provided_documents; async fn field_unavailable_for_source() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ - "embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}}, + "embedders": { "manual": {"source": "userProvided", "dimensions": 128, "documentTemplate": "{{doc.documentTemplate}}"}}, })) .await; snapshot!(code, 
@"400 Bad Request"); snapshot!(response, @r###" { - "message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided`.\n - note: `documentTemplate` is available for sources: `openAi`, `huggingFace`, `ollama`, `rest`\n - note: available fields for source `userProvided`: `source`, `dimensions`, `distribution`, `binaryQuantized`", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" @@ -43,7 +32,7 @@ async fn field_unavailable_for_source() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`", + "message": "`.embedders.default`: Field `revision` unavailable for source `openAi`.\n - note: `revision` is available for sources: `huggingFace`\n - note: available fields for source `openAi`: `source`, `model`, `apiKey`, `dimensions`, `documentTemplate`, `documentTemplateMaxBytes`, `url`, `distribution`, `binaryQuantized`", "code": "invalid_settings_embedders", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" @@ -55,17 +44,6 @@ async fn field_unavailable_for_source() { async fn update_embedder() { let server = Server::new().await; let index = server.index("doggo"); - let (value, code) = server.set_features(json!({"vectorStore": true})).await; - snapshot!(code, @"200 OK"); - snapshot!(value, @r###" - { - "vectorStore": true, - "metrics": false, - "logsRoute": false, - "editDocumentsByFunction": false, - "containsFilter": false - } - "###); let (response, code) = index .update_settings(json!({ @@ -73,7 +51,7 @@ async fn update_embedder() { })) .await; snapshot!(code, @"202 Accepted"); - server.wait_task(response.uid()).await; + server.wait_task(response.uid()).await.succeeded(); let (response, code) = index .update_settings(json!({ @@ -271,9 +249,820 @@ async fn reset_embedder_documents() { snapshot!(json_string!(documents), @r###" { "message": "Cannot find embedder with name `default`.", - "code": "invalid_embedder", + "code": "invalid_search_embedder", "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_embedder" + "link": "https://docs.meilisearch.com/errors#invalid_search_embedder" + } + "###); +} + +#[actix_rt::test] +async fn ollama_url_checks() { + let server = super::get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "ollama": {"source": "ollama", "model": "toto", "dimensions": 1, "url": "http://localhost:11434/api/embeddings"}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "ollama": { + "source": "ollama", + "model": "toto", + "dimensions": 1, + "url": "[url]" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": 
"[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "ollama": {"source": "ollama", "model": "toto", "dimensions": 1, "url": "http://localhost:11434/api/embed"}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "ollama": { + "source": "ollama", + "model": "toto", + "dimensions": 1, + "url": "[url]" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "ollama": {"source": "ollama", "model": "toto", "dimensions": 1, "url": "http://localhost:11434/api/embedd"}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "ollama": { + "source": "ollama", + "model": "toto", + "dimensions": 1, + "url": "[url]" + } + } + }, + "error": { + "message": "Index `doggo`: Error while generating embeddings: user error: unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `http://localhost:11434/api/embedd`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "ollama": {"source": "ollama", "model": "toto", "dimensions": 1, "url": "http://localhost:11434/v1/embeddings"}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "ollama": { + "source": "ollama", + "model": "toto", + "dimensions": 1, + "url": "[url]" + } + } + }, + "error": { + "message": "Index `doggo`: Error while generating embeddings: user error: unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `http://localhost:11434/v1/embeddings`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn composite_checks() { + let server = Server::new().await; + let index = server.index("test"); + // feature not enabled, using source + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": 
"sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "using `\"composite\"` as source requires enabling the `composite embedders` experimental feature. See https://github.com/orgs/meilisearch/discussions/816", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // feature not enabled, using search embedder + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "userProvided", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + } + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "setting `searchEmbedder` requires enabling the `composite embedders` experimental feature. See https://github.com/orgs/meilisearch/discussions/816", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // feature not enabled, using indexing embedder + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "userProvided", + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + } + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "setting `indexingEmbedder` requires enabling the `composite embedders` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/816", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // enable feature + let (_, code) = server.set_features(json!({"compositeEmbedders": true})).await; + snapshot!(code, @"200 OK"); + + // inner distribution + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": { + "mean": 0.5, + "sigma": 0.2, + } + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder`: Field `distribution` unavailable for source `huggingFace` for the search embedder.\n - note: available fields for source `huggingFace` for the search embedder: `source`, `model`, `revision`, `pooling`\n - note: `distribution` is available when source `huggingFace` is not for the search embedder", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // manual source + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "userProvided", + "dimensions": 42, + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder.source`: Source `userProvided` is not available in a nested embedder", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // composite source + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + } + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder.source`: Source `composite` is not available in a nested 
embedder", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // no source in indexing + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + "indexingEmbedder": {}, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.indexingEmbedder`: Missing field `source`.\n - note: this field is mandatory for nested embedders", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // no source in search + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": {}, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder`: Missing field `source`.\n - note: this field is mandatory for nested embedders", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // no indexing + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test`: Missing field `indexingEmbedder` (note: this field is mandatory for source `composite`)", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // no search + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test`: Missing field `searchEmbedder` (note: this field is mandatory for source `composite`)", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // 
inner quantized + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "binaryQuantized": true, + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "binaryQuantized": false, + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder`: Field `binaryQuantized` unavailable for source `huggingFace` for the search embedder.\n - note: available fields for source `huggingFace` for the search embedder: `source`, `model`, `revision`, `pooling`\n - note: `binaryQuantized` is available when source `huggingFace` is not for the search embedder", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // prompt in search + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "toto", + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.test.searchEmbedder`: Field `documentTemplate` unavailable for source `huggingFace` for the search embedder.\n - note: available fields for source `huggingFace` for the search embedder: `source`, `model`, `revision`, `pooling`\n - note: `documentTemplate` is available when source `huggingFace` is not for the search embedder", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + // dimensions don't match + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "ollama", + "dimensions": 0x42, + "model": "does-not-exist", + }, + "indexingEmbedder": { + "source": "ollama", + "dimensions": 42, + "model": "does-not-exist", + }, + } + } + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "test", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "ollama", + "model": "does-not-exist", + "dimensions": 66 + }, + "indexingEmbedder": { + "source": "ollama", + "model": 
"does-not-exist", + "dimensions": 42 + } + } + } + }, + "error": { + "message": "Index `test`: Error while generating embeddings: user error: error while generating test embeddings.\n - the dimensions of embeddings produced at search time and at indexing time don't match.\n - Search time dimensions: 66\n - Indexing time dimensions: 42\n - Note: Dimensions of embeddings produced by both embedders are required to match.", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + // pooling don't match + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "forceMean" + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "forceCls" + }, + } + } + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + snapshot!(response, @r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "test", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "forceMean" + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": "forceCls" + } + } + } + }, + "error": { + "message": "Index `test`: Error while generating embeddings: user error: error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance 0.25\n - Meilisearch requires a maximum distance of 0.01.\n - Note: check that both embedders produce similar embeddings.\n - Make sure the `model`, `revision` and `pooling` of both embedders match.", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // ok + let (response, _code) = index + .update_settings(json!({ + "embedders": { + "test": null + } + })) + .await; + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + }, + } + } + })) + .await; + snapshot!(code, @"202 Accepted"); + let response = server.wait_task(response.uid()).await; + snapshot!(response, 
@r###" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "test", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "test": { + "source": "composite", + "searchEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e" + }, + "indexingEmbedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" } "###); } diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 048da6232..485177838 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -9,13 +9,15 @@ edition.workspace = true license.workspace = true [dependencies] -anyhow = "1.0.86" -clap = { version = "4.5.9", features = ["derive"] } +anyhow = "1.0.95" +clap = { version = "4.5.24", features = ["derive"] } dump = { path = "../dump" } file-store = { path = "../file-store" } +indexmap = { version = "2.7.0", features = ["serde"] } meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } -serde = { version = "1.0.209", features = ["derive"] } -time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } -uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order"] } +tempfile = "3.15.0" +time = { version = "0.3.37", features = ["formatting", "parsing", "alloc"] } +uuid = { version = "1.11.0", features = ["v4"], default-features = false } diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index f84cea98d..dd1213782 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -1,19 +1,26 @@ use std::fs::{read_dir, read_to_string, remove_file, File}; -use std::io::BufWriter; +use std::io::{BufWriter, Write as _}; use std::path::PathBuf; +use std::time::Instant; -use anyhow::Context; -use clap::{Parser, Subcommand}; +use anyhow::{bail, Context}; +use clap::{Parser, Subcommand, ValueEnum}; use dump::{DumpWriter, IndexMetadata}; use file_store::FileStore; -use meilisearch_auth::AuthController; -use meilisearch_types::heed::types::{SerdeJson, Str}; -use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; +use meilisearch_auth::{open_auth_store_env, AuthController}; +use meilisearch_types::batches::Batch; +use meilisearch_types::heed::types::{Bytes, SerdeJson, Str}; +use meilisearch_types::heed::{ + CompactionOption, Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, WithoutTls, +}; +use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; use meilisearch_types::versioning::{get_version, parse_version}; use meilisearch_types::Index; +use serde_json::Value::Object; use time::macros::format_description; use time::OffsetDateTime; use 
upgrade::OfflineUpgrade; @@ -65,6 +72,24 @@ enum Command { skip_enqueued_tasks: bool, }, + /// Exports the documents of a Meilisearch index to stdout in NDJSON format. + /// + /// This command can be executed on a running Meilisearch database. However, please note that + /// it will maintain a read-only transaction for the duration of the extraction process. + ExportDocuments { + /// The index name to export the documents from. + #[arg(long)] + index_name: String, + + /// Do not export vectors with the documents. + #[arg(long)] + ignore_vectors: bool, + + /// The number of documents to skip. + #[arg(long)] + offset: Option<usize>, + }, + /// Attempts to upgrade from one major version to the next without a dump. /// /// Make sure to run this command when Meilisearch is not running! @@ -73,11 +98,51 @@ enum Command { /// /// Supported upgrade paths: /// - /// - v1.9.x -> v1.10.x -> v1.11.x + /// - v1.9.x -> v1.10.x -> v1.11.x -> v1.12.x OfflineUpgrade { #[arg(long)] target_version: String, }, + + /// Compacts the index using LMDB. + /// + /// You must run this command while Meilisearch is off. The reason is that Meilisearch keeps the + /// indexes open and this compaction operation writes into another file. Meilisearch will not + /// switch to the new file. + /// + /// **Another possibility** is to keep Meilisearch running to serve search requests, run the + /// compaction and, once done, close and immediately reopen Meilisearch. This way Meilisearch + /// will reopen the data.mdb file when rebooting and see the newly compacted file, ignoring + /// the previous non-compacted data. + /// + /// Note that the compaction will open the index, copy and compact the index into another file + /// **on the same disk as the index** and replace the previous index with the newly compacted + /// one. This means that the disk must have enough room for at most two times the index size. + /// + /// To make sure not to lose any data, this tool takes a mutable transaction on the index + /// before running the copy and compaction. This way the current indexation must finish before + /// the compaction operation can start. Once the compaction is done, the big index is replaced + /// by the compacted one and the mutable transaction is released. + CompactIndex { index_name: String }, + + /// Uses the hair dryer to make dedicated pages hot in cache. + /// + /// To make the index faster we must make sure it is hot in the DB cache: that's the curse of + /// memory-mapping but also its strength. This command is designed to make a specific part of + /// the index hot in cache. + HairDryer { + #[arg(long, value_delimiter = ',')] + index_name: Vec<String>, + + #[arg(long, value_delimiter = ',')] + index_part: Vec<IndexPart>, + }, +} + +#[derive(Clone, ValueEnum)] +enum IndexPart { + /// Will make the arroy index hot. + Arroy, }
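For reference, here is a sketch of how these new subcommands might be invoked from a repository checkout. It assumes the binary built from the `meilitool` crate exposes a `--db-path` option pointing at the Meilisearch data directory, and that subcommand and flag names follow clap's default kebab-case derive conventions; the index names and paths below are placeholders.

```sh
# Export the documents of the `movies` index as NDJSON, skipping the first 10000 documents.
cargo run --release -p meilitool -- --db-path ./data.ms export-documents --index-name movies --offset 10000 > movies.ndjson

# Compact the `movies` index (run while Meilisearch is stopped, as explained above).
cargo run --release -p meilitool -- --db-path ./data.ms compact-index movies

# Make the arroy (vector) pages of two indexes hot in the page cache.
cargo run --release -p meilitool -- --db-path ./data.ms hair-dryer --index-name movies,products --index-part arroy
```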
fn main() -> anyhow::Result<()> { @@ -88,19 +153,26 @@ fn main() -> anyhow::Result<()> { match command { Command::ClearTaskQueue => clear_task_queue(db_path), Command::ExportADump { dump_dir, skip_enqueued_tasks } => { - export_a_dump(db_path, dump_dir, skip_enqueued_tasks) + export_a_dump(db_path, dump_dir, skip_enqueued_tasks, detected_version) + } + Command::ExportDocuments { index_name, ignore_vectors, offset } => { + export_documents(db_path, index_name, ignore_vectors, offset) + } Command::OfflineUpgrade { target_version } => { let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?; OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade() } + Command::CompactIndex { index_name } => compact_index(db_path, &index_name), + Command::HairDryer { index_name, index_part } => { + hair_dryer(db_path, &index_name, &index_part) + } } } /// Clears the task queue located at `db_path`. fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) } + let env = unsafe { EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&path) } .with_context(|| format!("While trying to open {:?}", path.display()))?; eprintln!("Deleting tasks from the database..."); @@ -153,7 +225,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { } fn try_opening_database<KC: 'static, DC: 'static>( - env: &Env, + env: &Env<WithoutTls>, rtxn: &RoTxn, db_name: &str, ) -> anyhow::Result<Database<KC, DC>> { @@ -163,7 +235,7 @@ fn try_opening_database( } fn try_opening_poly_database( - env: &Env, + env: &Env<WithoutTls>, rtxn: &RoTxn, db_name: &str, ) -> anyhow::Result<Database<Unspecified, Unspecified>> { @@ -187,6 +259,7 @@ fn export_a_dump( db_path: PathBuf, dump_dir: PathBuf, skip_enqueued_tasks: bool, + detected_version: (u32, u32, u32), ) -> Result<(), anyhow::Error> { let started_at = OffsetDateTime::now_utc(); @@ -211,13 +284,18 @@ fn export_a_dump( FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; eprintln!("Dumping the keys..."); // 2. dump the keys - let auth_store = AuthController::new(&db_path, &None) + let auth_path = db_path.join("auth"); + std::fs::create_dir_all(&auth_path).context("While creating the auth directory")?; + let auth_env = open_auth_store_env(&auth_path).context("While opening the auth store")?; + let auth_store = AuthController::new(auth_env, &None) .with_context(|| format!("While opening the auth store at {}", db_path.display()))?; let mut dump_keys = dump.create_keys()?; let mut count = 0; @@ -229,36 +307,41 @@ fn export_a_dump( eprintln!("Successfully dumped {count} keys!"); + eprintln!("Dumping the queue"); let rtxn = env.read_txn()?; let all_tasks: Database<BEU32, SerdeJson<Task>> = try_opening_database(&env, &rtxn, "all-tasks")?; + let all_batches: Database<BEU32, SerdeJson<Batch>> = + try_opening_database(&env, &rtxn, "all-batches")?; let index_mapping: Database = try_opening_database(&env, &rtxn, "index-mapping")?; - if skip_enqueued_tasks { - eprintln!("Skip dumping the enqueued tasks..."); - } else { - eprintln!("Dumping the enqueued tasks..."); + eprintln!("Dumping the tasks"); + let mut dump_tasks = dump.create_tasks_queue()?; + let mut count_tasks = 0; + let mut count_enqueued_tasks = 0; + for ret in all_tasks.iter(&rtxn)?
{ - let (_, t) = ret?; - let status = t.status; - let content_file = t.content_uuid(); - let mut dump_content_file = dump_tasks.push_task(&t.into())?; + if status == Status::Enqueued && skip_enqueued_tasks { + continue; + } - // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. - if let Some(content_file_uuid) = content_file { - if status == Status::Enqueued { - let content_file = file_store.get_update(content_file_uuid)?; + let mut dump_content_file = dump_tasks.push_task(&t.into())?; + // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. + if let Some(content_file_uuid) = content_file { + if status == Status::Enqueued { + let content_file = file_store.get_update(content_file_uuid)?; + + if (detected_version.0, detected_version.1, detected_version.2) < (1, 12, 0) { + eprintln!("Dumping the enqueued tasks reading them in obkv format..."); let reader = DocumentsBatchReader::from_reader(content_file).with_context(|| { format!("While reading content file {:?}", content_file_uuid) })?; - let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index(); while let Some(doc) = cursor.next_document().with_context(|| { format!("While iterating on content file {:?}", content_file_uuid) @@ -266,26 +349,52 @@ fn export_a_dump( dump_content_file .push_document(&obkv_to_object(doc, &documents_batch_index)?)?; } - dump_content_file.flush()?; - count += 1; + } else { + eprintln!("Dumping the enqueued tasks reading them in JSON stream format..."); + for document in + serde_json::de::Deserializer::from_reader(content_file).into_iter() + { + let document = document.with_context(|| { + format!("While reading content file {:?}", content_file_uuid) + })?; + dump_content_file.push_document(&document)?; + } } + + dump_content_file.flush()?; + count_enqueued_tasks += 1; } } - dump_tasks.flush()?; - - eprintln!("Successfully dumped {count} enqueued tasks!"); + count_tasks += 1; } + dump_tasks.flush()?; + eprintln!( + "Successfully dumped {count_tasks} tasks including {count_enqueued_tasks} enqueued tasks!" + ); + // 4. dump the batches + eprintln!("Dumping the batches"); + let mut dump_batches = dump.create_batches_queue()?; + let mut count = 0; + + for ret in all_batches.iter(&rtxn)? { + let (_, b) = ret?; + dump_batches.push_batch(&b)?; + count += 1; + } + dump_batches.flush()?; + eprintln!("Successfully dumped {count} batches!"); + + // 5. Dump the indexes eprintln!("Dumping the indexes..."); - - // 4. Dump the indexes let mut count = 0; for result in index_mapping.iter(&rtxn)? { let (uid, uuid) = result?; let index_path = db_path.join("indexes").join(uuid.to_string()); - let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| { - format!("While trying to open the index at path {:?}", index_path.display()) - })?; + let index = Index::new(EnvOpenOptions::new().read_txn_without_tls(), &index_path, false) + .with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; let rtxn = index.read_txn()?; let metadata = IndexMetadata { @@ -299,14 +408,14 @@ fn export_a_dump( let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - // 4.1. Dump the documents + // 5.1. Dump the documents for ret in index.all_documents(&rtxn)? { let (_id, doc) = ret?; let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?; index_dumper.push_document(&document)?; } - // 4.2. 
Dump the settings + // 5.2. Dump the settings let settings = meilisearch_types::settings::settings( &index, &rtxn, @@ -332,3 +441,250 @@ fn export_a_dump( Ok(()) } + +fn compact_index(db_path: PathBuf, index_name: &str) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? { + let (uid, uuid) = result?; + + if uid != index_name { + eprintln!("Found index {uid} and skipping it"); + continue; + } else { + eprintln!("Found index {uid} šŸŽ‰"); + } + + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = Index::new(EnvOpenOptions::new().read_txn_without_tls(), &index_path, false) + .with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + eprintln!("Awaiting for a mutable transaction..."); + let _wtxn = index.write_txn().context("While awaiting for a write transaction")?; + + // We create and immediately drop the file because the + let non_compacted_index_file_path = index_path.join("data.mdb"); + let compacted_index_file_path = index_path.join("data.mdb.cpy"); + + eprintln!("Compacting the index..."); + let before_compaction = Instant::now(); + let new_file = index + .copy_to_path(&compacted_index_file_path, CompactionOption::Enabled) + .with_context(|| format!("While compacting {}", compacted_index_file_path.display()))?; + + let after_size = new_file.metadata()?.len(); + let before_size = std::fs::metadata(&non_compacted_index_file_path) + .with_context(|| { + format!( + "While retrieving the metadata of {}", + non_compacted_index_file_path.display(), + ) + })? + .len(); + + let reduction = before_size as f64 / after_size as f64; + println!("Compaction successful. Took around {:.2?}", before_compaction.elapsed()); + eprintln!("The index went from {before_size} bytes to {after_size} bytes ({reduction:.2}x reduction)"); + + eprintln!("Replacing the non-compacted index by the compacted one..."); + std::fs::rename(&compacted_index_file_path, &non_compacted_index_file_path).with_context( + || { + format!( + "While renaming {} into {}", + compacted_index_file_path.display(), + non_compacted_index_file_path.display(), + ) + }, + )?; + + drop(new_file); + + println!("Everything's done šŸŽ‰"); + return Ok(()); + } + + bail!("Target index {index_name} not found!") +} + +fn export_documents( + db_path: PathBuf, + index_name: String, + ignore_vectors: bool, + offset: Option, +) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? 
{ + let (uid, uuid) = result?; + if uid == index_name { + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = + Index::new(EnvOpenOptions::new().read_txn_without_tls(), &index_path, false) + .with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + let rtxn = index.read_txn()?; + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; + + if let Some(offset) = offset { + eprintln!("Skipping {offset} documents"); + } + + let mut stdout = BufWriter::new(std::io::stdout()); + let all_documents = index.documents_ids(&rtxn)?.into_iter().skip(offset.unwrap_or(0)); + for (i, ret) in index.iter_documents(&rtxn, all_documents)?.enumerate() { + let (id, doc) = ret?; + let mut document = obkv_to_json(&all_fields, &fields_ids_map, doc)?; + + if i % 10_000 == 0 { + eprintln!("Starting the {}th document", i + offset.unwrap_or(0)); + } + + if !ignore_vectors { + 'inject_vectors: { + let embeddings = index.embeddings(&rtxn, id)?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(Object(Default::default())); + + let Object(vectors) = vectors else { + return Err(meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&rtxn, std::iter::once(id)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={id}") + } + }, + value: vectors.clone(), + }, + ) + .into()); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors + .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + } + + serde_json::to_writer(&mut stdout, &document)?; + } + + stdout.flush()?; + } else { + eprintln!("Found index {uid} but it's not the right index..."); + } + } + + Ok(()) +} + +fn hair_dryer( + db_path: PathBuf, + index_names: &[String], + index_parts: &[IndexPart], +) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + eprintln!("Trying to get a read transaction on the index scheduler..."); + + let rtxn = env.read_txn()?; + let index_mapping: Database = + try_opening_database(&env, &rtxn, "index-mapping")?; + + for result in index_mapping.iter(&rtxn)? 
{ + let (uid, uuid) = result?; + if index_names.iter().any(|i| i == uid) { + let index_path = db_path.join("indexes").join(uuid.to_string()); + let index = + Index::new(EnvOpenOptions::new().read_txn_without_tls(), &index_path, false) + .with_context(|| { + format!("While trying to open the index at path {:?}", index_path.display()) + })?; + + eprintln!("Trying to get a read transaction on the {uid} index..."); + + let rtxn = index.read_txn()?; + for part in index_parts { + match part { + IndexPart::Arroy => { + let mut count = 0; + let total = index.vector_arroy.len(&rtxn)?; + eprintln!("Hair drying arroy for {uid}..."); + for (i, result) in index + .vector_arroy + .remap_types::() + .iter(&rtxn)? + .enumerate() + { + let (key, value) = result?; + + // All of this just to avoid compiler optimizations šŸ¤ž + // We must read all the bytes to make the pages hot in cache. + // + count += std::hint::black_box(key.iter().fold(0, |acc, _| acc + 1)); + count += std::hint::black_box(value.iter().fold(0, |acc, _| acc + 1)); + + if i % 10_000 == 0 { + let perc = (i as f64) / (total as f64) * 100.0; + eprintln!("Visited {i}/{total} ({perc:.2}%) keys") + } + } + eprintln!("Done hair drying a total of at least {count} bytes."); + } + } + } + } else { + eprintln!("Found index {uid} but it's not the right index..."); + } + } + + Ok(()) +} diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 36630c3b3..82a57317c 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -1,62 +1,115 @@ mod v1_10; mod v1_11; +mod v1_12; mod v1_9; use std::path::{Path, PathBuf}; use anyhow::{bail, Context}; use meilisearch_types::versioning::create_version_file; - use v1_10::v1_9_to_v1_10; +use v1_12::{v1_11_to_v1_12, v1_12_to_v1_12_3}; use crate::upgrade::v1_11::v1_10_to_v1_11; pub struct OfflineUpgrade { pub db_path: PathBuf, - pub current_version: (String, String, String), - pub target_version: (String, String, String), + pub current_version: (u32, u32, u32), + pub target_version: (u32, u32, u32), } impl OfflineUpgrade { pub fn upgrade(self) -> anyhow::Result<()> { + // Adding a version? + // + // 1. Update the LAST_SUPPORTED_UPGRADE_FROM_VERSION and LAST_SUPPORTED_UPGRADE_TO_VERSION. + // 2. Add new version to the upgrade list if necessary + // 3. Use `no_upgrade` as index for versions that are compatible. + + if self.current_version == self.target_version { + println!("Database is already at the target version. Exiting."); + return Ok(()); + } + + if self.current_version > self.target_version { + bail!( + "Cannot downgrade from {}.{}.{} to {}.{}.{}. 
Downgrade not supported", + self.current_version.0, + self.current_version.1, + self.current_version.2, + self.target_version.0, + self.target_version.1, + self.target_version.2 + ); + } + + const FIRST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.9.0"; + const LAST_SUPPORTED_UPGRADE_FROM_VERSION: &str = "1.12.7"; + const FIRST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.10.0"; + const LAST_SUPPORTED_UPGRADE_TO_VERSION: &str = "1.12.7"; + let upgrade_list = [ - (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), + ( + v1_9_to_v1_10 as fn(&Path, u32, u32, u32) -> Result<(), anyhow::Error>, + "1", + "10", + "0", + ), (v1_10_to_v1_11, "1", "11", "0"), + (v1_11_to_v1_12, "1", "12", "0"), + (v1_12_to_v1_12_3, "1", "12", "3"), ]; - let (current_major, current_minor, current_patch) = &self.current_version; + let no_upgrade: usize = upgrade_list.len(); - let start_at = match ( - current_major.as_str(), - current_minor.as_str(), - current_patch.as_str(), - ) { - ("1", "9", _) => 0, - ("1", "10", _) => 1, + let (current_major, current_minor, current_patch) = self.current_version; + + let start_at = match (current_major, current_minor, current_patch) { + (1, 9, _) => 0, + (1, 10, _) => 1, + (1, 11, _) => 2, + (1, 12, 0..=2) => 3, + (1, 12, 3..=7) => no_upgrade, _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10") + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from versions in range [{}-{}]", + FIRST_SUPPORTED_UPGRADE_FROM_VERSION, + LAST_SUPPORTED_UPGRADE_FROM_VERSION); } }; - let (target_major, target_minor, target_patch) = &self.target_version; + let (target_major, target_minor, target_patch) = self.target_version; - let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { - ("1", "10", _) => 0, - ("1", "11", _) => 1, - (major, _, _) if major.starts_with('v') => { - bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") - } + let ends_at = match (target_major, target_minor, target_patch) { + (1, 10, _) => 0, + (1, 11, _) => 1, + (1, 12, 0..=2) => 2, + (1, 12, 3..=7) => 3, _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11") + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to versions in range [{}-{}]", + FIRST_SUPPORTED_UPGRADE_TO_VERSION, + LAST_SUPPORTED_UPGRADE_TO_VERSION); } }; println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + if start_at == no_upgrade { + println!("No upgrade operation to perform, writing VERSION file"); + create_version_file( + &self.db_path, + &target_major.to_string(), + &target_minor.to_string(), + &target_patch.to_string(), + ) + .context("while writing VERSION file after the upgrade")?; + println!("Success"); + return Ok(()); + } + #[allow(clippy::needless_range_loop)] for index in start_at..=ends_at { let (func, major, minor, patch) = upgrade_list[index]; - (func)(&self.db_path)?; + (func)(&self.db_path, current_major, current_minor, current_patch)?; println!("Done"); // We're writing the version file just in case an issue arise _while_ upgrading. // We don't want the DB to fail in an unknown state. 
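The rewritten `OfflineUpgrade::upgrade` above dispatches on parsed `(u32, u32, u32)` version tuples: it selects a contiguous slice of the `upgrade_list`, applies each step in order, and writes the VERSION file as it goes (or immediately, via the `no_upgrade` sentinel, when no step is needed). A minimal, self-contained sketch of that chained-upgrade pattern follows; it is not part of the patch, it simplifies the dispatch to tuple comparisons instead of the explicit version matches used here, and every name in it (`Version`, `UpgradeFn`, `step_a`, `step_b`, `run_offline_upgrade`) is hypothetical.

```rust
use std::path::Path;

type Version = (u32, u32, u32);
// Each step receives the database path and the origin version, like the
// real upgrade functions do after this patch.
type UpgradeFn = fn(&Path, Version) -> Result<(), String>;

fn step_a(_db: &Path, origin: Version) -> Result<(), String> {
    println!("migrating {origin:?} data up to (1, 10, 0)");
    Ok(())
}

fn step_b(_db: &Path, origin: Version) -> Result<(), String> {
    println!("migrating {origin:?} data up to (1, 11, 0)");
    Ok(())
}

fn run_offline_upgrade(db: &Path, current: Version, target: Version) -> Result<(), String> {
    // Each entry upgrades the database *to* the version it is paired with.
    let steps: &[(UpgradeFn, Version)] = &[(step_a, (1, 10, 0)), (step_b, (1, 11, 0))];

    if current == target {
        return Ok(()); // nothing to do, mirrors the early return above
    }
    if current > target {
        return Err(format!("cannot downgrade from {current:?} to {target:?}"));
    }

    // Apply every step whose target version lies in (current, target].
    for &(func, reached) in steps.iter().filter(|(_, v)| *v > current && *v <= target) {
        func(db, current)?;
        // A real tool persists a VERSION marker here so a crash mid-upgrade
        // leaves the database in a known, resumable state.
        println!("now at {reached:?}");
    }
    Ok(())
}

fn main() -> Result<(), String> {
    run_offline_upgrade(Path::new("./data.ms"), (1, 9, 0), (1, 11, 0))
}
```

The point the real code makes explicit with its `FIRST_SUPPORTED_*`/`LAST_SUPPORTED_*` constants is the same one this sketch relies on: each step only needs to know the version it upgrades to, so supporting a new release means appending one entry to the list and widening the accepted version ranges.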
diff --git a/crates/meilitool/src/upgrade/v1_10.rs b/crates/meilitool/src/upgrade/v1_10.rs index 2efc1773c..ac30055c5 100644 --- a/crates/meilitool/src/upgrade/v1_10.rs +++ b/crates/meilitool/src/upgrade/v1_10.rs @@ -1,18 +1,15 @@ -use anyhow::bail; use std::path::Path; -use anyhow::Context; -use meilisearch_types::{ - heed::{ - types::{SerdeJson, Str}, - Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, - }, - milli::index::{db_name, main_key}, +use anyhow::{bail, Context}; +use meilisearch_types::heed::types::{SerdeJson, Str}; +use meilisearch_types::heed::{ + Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, WithoutTls, }; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use meilisearch_types::milli::index::{db_name, main_key}; use super::v1_9; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; pub type FieldDistribution = std::collections::BTreeMap; @@ -97,7 +94,7 @@ fn update_index_stats( fn update_date_format( index_uid: &str, - index_env: &Env, + index_env: &Env, index_wtxn: &mut RwTxn, ) -> anyhow::Result<()> { let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) @@ -111,7 +108,7 @@ fn update_date_format( fn find_rest_embedders( index_uid: &str, - index_env: &Env, + index_env: &Env, index_txn: &RoTxn, ) -> anyhow::Result> { let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN) @@ -156,7 +153,12 @@ fn date_round_trip( Ok(()) } -pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { +pub fn v1_9_to_v1_10( + db_path: &Path, + _origin_major: u32, + _origin_minor: u32, + _origin_patch: u32, +) -> anyhow::Result<()> { println!("Upgrading from v1.9.0 to v1.10.0"); // 2 changes here @@ -164,8 +166,10 @@ pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { // 2. REST embedders. We don't support this case right now, so bail let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; let mut sched_wtxn = env.write_txn()?; @@ -205,9 +209,13 @@ pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { let index_env = unsafe { // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? + EnvOpenOptions::new() + .read_txn_without_tls() + .max_dbs(25) + .open(&index_path) + .with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? }; let index_txn = index_env.read_txn().with_context(|| { @@ -252,9 +260,13 @@ pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { let index_env = unsafe { // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? + EnvOpenOptions::new() + .read_txn_without_tls() + .max_dbs(25) + .open(&index_path) + .with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
}; let mut index_wtxn = index_env.write_txn().with_context(|| { diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index 0c84d3842..76d2fc24f 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -7,19 +7,26 @@ use std::path::Path; use anyhow::Context; -use meilisearch_types::{ - heed::{types::Str, Database, EnvOpenOptions}, - milli::index::db_name, -}; +use meilisearch_types::heed::types::Str; +use meilisearch_types::heed::{Database, EnvOpenOptions}; +use meilisearch_types::milli::index::db_name; -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; +use crate::uuid_codec::UuidCodec; +use crate::{try_opening_database, try_opening_poly_database}; -pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { +pub fn v1_10_to_v1_11( + db_path: &Path, + _origin_major: u32, + _origin_minor: u32, + _origin_patch: u32, +) -> anyhow::Result<()> { println!("Upgrading from v1.10.0 to v1.11.0"); let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; let sched_rtxn = env.read_txn()?; @@ -45,9 +52,13 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { ); let index_env = unsafe { - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? + EnvOpenOptions::new() + .read_txn_without_tls() + .max_dbs(25) + .open(&index_path) + .with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? }; let index_rtxn = index_env.read_txn().with_context(|| { @@ -71,11 +82,11 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) .with_context(|| format!("while updating date format for index `{uid}`"))?; - arroy_v04_to_v05::ugrade_from_prev_version( + meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5( &index_rtxn, - index_read_database, + index_read_database.remap_types(), &mut index_wtxn, - index_write_database, + index_write_database.remap_types(), )?; index_wtxn.commit()?; diff --git a/crates/meilitool/src/upgrade/v1_12.rs b/crates/meilitool/src/upgrade/v1_12.rs new file mode 100644 index 000000000..1dd679eb9 --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_12.rs @@ -0,0 +1,302 @@ +//! The breaking changes that happened between the v1.11 and the v1.12 are: +//! - The new indexer changed the update files format from OBKV to ndjson. 
https://github.com/meilisearch/meilisearch/pull/4900 + +use std::borrow::Cow; +use std::io::BufWriter; +use std::path::Path; +use std::sync::atomic::AtomicBool; + +use anyhow::Context; +use file_store::FileStore; +use indexmap::IndexMap; +use meilisearch_types::milli::documents::DocumentsBatchReader; +use meilisearch_types::milli::heed::types::{SerdeJson, Str}; +use meilisearch_types::milli::heed::{Database, EnvOpenOptions, RoTxn, RwTxn}; +use meilisearch_types::milli::progress::Step; +use meilisearch_types::milli::{FieldDistribution, Index}; +use serde::Serialize; +use serde_json::value::RawValue; +use tempfile::NamedTempFile; +use time::OffsetDateTime; +use uuid::Uuid; + +use crate::try_opening_database; +use crate::uuid_codec::UuidCodec; + +pub fn v1_11_to_v1_12( + db_path: &Path, + _origin_major: u32, + _origin_minor: u32, + _origin_patch: u32, +) -> anyhow::Result<()> { + println!("Upgrading from v1.11.0 to v1.12.0"); + + convert_update_files(db_path)?; + + Ok(()) +} + +pub fn v1_12_to_v1_12_3( + db_path: &Path, + origin_major: u32, + origin_minor: u32, + origin_patch: u32, +) -> anyhow::Result<()> { + println!("Upgrading from v1.12.{{0, 1, 2}} to v1.12.3"); + + if origin_minor == 12 { + rebuild_field_distribution(db_path)?; + } else { + println!("Not rebuilding field distribution as it wasn't corrupted coming from v{origin_major}.{origin_minor}.{origin_patch}"); + } + + Ok(()) +} + +/// Convert the update files from OBKV to ndjson format. +/// +/// 1) List all the update files using the file store. +/// 2) For each update file, read the update file into a DocumentsBatchReader. +/// 3) For each document in the update file, convert the document to a JSON object. +/// 4) Write the JSON object to a tmp file in the update files directory. +/// 5) Persist the tmp file replacing the old update file. +fn convert_update_files(db_path: &Path) -> anyhow::Result<()> { + let update_files_dir_path = db_path.join("update_files"); + let file_store = FileStore::new(&update_files_dir_path).with_context(|| { + format!("while creating file store for update files dir {update_files_dir_path:?}") + })?; + + for uuid in file_store.all_uuids().context("while retrieving uuids from file store")? { + let uuid = uuid.context("while retrieving uuid from file store")?; + let update_file_path = file_store.get_update_path(uuid); + let update_file = file_store + .get_update(uuid) + .with_context(|| format!("while getting update file for uuid {uuid:?}"))?; + + let mut file = + NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new).with_context( + || format!("while creating bufwriter for update file {update_file_path:?}"), + )?; + + let reader = DocumentsBatchReader::from_reader(update_file).with_context(|| { + format!("while creating documents batch reader for update file {update_file_path:?}") + })?; + let (mut cursor, index) = reader.into_cursor_and_fields_index(); + + while let Some(document) = cursor.next_document().with_context(|| { + format!( + "while reading documents from batch reader for update file {update_file_path:?}" + ) + })? 
{ + let mut json_document = IndexMap::new(); + for (fid, value) in document { + let field_name = index + .name(fid) + .with_context(|| format!("while getting field name for fid {fid} for update file {update_file_path:?}"))?; + let value: &RawValue = serde_json::from_slice(value)?; + json_document.insert(field_name, value); + } + + serde_json::to_writer(&mut file, &json_document)?; + } + + let file = file.into_inner().map_err(|e| e.into_error()).context(format!( + "while flushing update file bufwriter for update file {update_file_path:?}" + ))?; + let _ = file + // atomically replace the obkv file with the rewritten NDJSON file + .persist(&update_file_path) + .with_context(|| format!("while persisting update file {update_file_path:?}"))?; + } + + Ok(()) +} + +/// Rebuild field distribution as it was wrongly computed in v1.12.x if x < 3 +fn rebuild_field_distribution(db_path: &Path) -> anyhow::Result<()> { + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { + EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path) + } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let mut sched_wtxn = env.write_txn()?; + + let index_mapping: Database = + try_opening_database(&env, &sched_wtxn, "index-mapping")?; + let stats_db: Database> = + try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { + format!("While trying to open {:?}", index_scheduler_path.display()) + })?; + + let index_count = + index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; + + // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn + // 1. immutably for the iteration + // 2. mutably for updating index stats + let indexes: Vec<_> = index_mapping + .iter(&sched_wtxn)? 
+ .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) + .collect(); + + let progress = meilisearch_types::milli::progress::Progress::default(); + let finished = AtomicBool::new(false); + + std::thread::scope(|scope| { + let display_progress = std::thread::Builder::new() + .name("display_progress".into()) + .spawn_scoped(scope, || { + while !finished.load(std::sync::atomic::Ordering::Relaxed) { + std::thread::sleep(std::time::Duration::from_secs(5)); + let view = progress.as_progress_view(); + let Ok(view) = serde_json::to_string(&view) else { + continue; + }; + println!("{view}"); + } + }) + .unwrap(); + + for (index_index, result) in indexes.into_iter().enumerate() { + let (uid, uuid) = result?; + progress.update_progress(VariableNameStep::new( + &uid, + index_index as u32, + index_count as u32, + )); + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Updating index `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + println!("\t- Rebuilding field distribution"); + + let index = meilisearch_types::milli::Index::new( + EnvOpenOptions::new().read_txn_without_tls(), + &index_path, + false, + ) + .with_context(|| format!("while opening index {uid} at '{}'", index_path.display()))?; + + let mut index_txn = index.write_txn()?; + + meilisearch_types::milli::update::new::reindex::field_distribution( + &index, + &mut index_txn, + &progress, + ) + .context("while rebuilding field distribution")?; + + let stats = IndexStats::new(&index, &index_txn) + .with_context(|| format!("computing stats for index `{uid}`"))?; + store_stats_of(stats_db, uuid, &mut sched_wtxn, &uid, &stats)?; + + index_txn.commit().context("while committing the write txn for the updated index")?; + } + + sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; + + finished.store(true, std::sync::atomic::Ordering::Relaxed); + + if let Err(panic) = display_progress.join() { + let msg = match panic.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match panic.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + eprintln!("WARN: the display thread panicked with {msg}"); + } + + println!("Upgrading database succeeded"); + Ok(()) + }) +} + +pub struct VariableNameStep { + name: String, + current: u32, + total: u32, +} + +impl VariableNameStep { + pub fn new(name: impl Into, current: u32, total: u32) -> Self { + Self { name: name.into(), current, total } + } +} + +impl Step for VariableNameStep { + fn name(&self) -> Cow<'static, str> { + self.name.clone().into() + } + + fn current(&self) -> u32 { + self.current + } + + fn total(&self) -> u32 { + self.total + } +} + +pub fn store_stats_of( + stats_db: Database>, + index_uuid: Uuid, + sched_wtxn: &mut RwTxn, + index_uid: &str, + stats: &IndexStats, +) -> anyhow::Result<()> { + stats_db + .put(sched_wtxn, &index_uuid, stats) + .with_context(|| format!("storing stats for index `{index_uid}`"))?; + Ok(()) +} + +/// The statistics that can be computed from an `Index` object. +#[derive(Serialize, Debug)] +pub struct IndexStats { + /// Number of documents in the index. + pub number_of_documents: u64, + /// Size taken up by the index' DB, in bytes. + /// + /// This includes the size taken by both the used and free pages of the DB, and as the free pages + /// are not returned to the disk after a deletion, this number is typically larger than + /// `used_database_size` that only includes the size of the used pages. 
+ pub database_size: u64, + /// Size taken by the used pages of the index' DB, in bytes. + /// + /// As the DB backend does not return to the disk the pages that are not currently used by the DB, + /// this value is typically smaller than `database_size`. + pub used_database_size: u64, + /// Association of every field name with the number of times it occurs in the documents. + pub field_distribution: FieldDistribution, + /// Creation date of the index. + #[serde(with = "time::serde::rfc3339")] + pub created_at: OffsetDateTime, + /// Date of the last update of the index. + #[serde(with = "time::serde::rfc3339")] + pub updated_at: OffsetDateTime, +} + +impl IndexStats { + /// Compute the stats of an index + /// + /// # Parameters + /// + /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. + pub fn new(index: &Index, rtxn: &RoTxn) -> meilisearch_types::milli::Result { + Ok(IndexStats { + number_of_documents: index.number_of_documents(rtxn)?, + database_size: index.on_disk_size()?, + used_database_size: index.used_size()?, + field_distribution: index.field_distribution(rtxn)?, + created_at: index.created_at(rtxn)?, + updated_at: index.updated_at(rtxn)?, + }) + } +} diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index c47a0a354..a2a020587 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -15,72 +15,71 @@ license.workspace = true big_s = "1.0.2" bimap = { version = "0.6.3", features = ["serde"] } bincode = "1.3.3" -bstr = "1.9.1" -bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } +bstr = "1.11.3" +bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -# charabia = { version = "0.9.0", default-features = false } -charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false } +charabia = { version = "0.9.3", default-features = false } concat-arrays = "0.1.2" -crossbeam-channel = "0.5.13" -deserr = "0.6.2" +convert_case = "0.6.0" +crossbeam-channel = "0.5.14" +deserr = "0.6.3" either = { version = "1.13.0", features = ["serde"] } flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" -grenad = { version = "0.4.7", default-features = false, features = [ - "rayon", # TODO Should we keep this feature +grenad = { version = "0.5.0", default-features = false, features = [ + "rayon", "tempfile", -], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } -heed = { version = "0.20.3", default-features = false, features = [ +] } +heed = { version = "0.22.0", default-features = false, features = [ "serde-json", "serde-bincode", - "read-txn-no-tls", ] } -indexmap = { version = "2.2.6", features = ["serde"] } +indexmap = { version = "2.7.0", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -memchr = "2.5.0" -memmap2 = "0.9.4" -obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } -once_cell = "1.19.0" -ordered-float = "4.2.1" +memchr = "2.7.4" +memmap2 = "0.9.5" +obkv = "0.3.0" +once_cell = "1.20.2" +ordered-float = "4.6.0" rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } -rstar = { version = "0.12.0", features = ["serde"] } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } +roaring = { version = "0.10.10", features 
= ["serde"] } +rstar = { version = "0.12.2", features = ["serde"] } +serde = { version = "1.0.217", features = ["derive"] } +serde_json = { version = "1.0.135", features = ["preserve_order", "raw_value"] } slice-group-by = "0.3.1" smallstr = { version = "0.3.0", features = ["serde"] } smallvec = "1.13.2" smartstring = "1.0.1" -tempfile = "3.10.1" -thiserror = "1.0.61" -time = { version = "0.3.36", features = [ +tempfile = "3.15.0" +thiserror = "2.0.9" +time = { version = "0.3.37", features = [ "serde-well-known", "formatting", "parsing", "macros", ] } -uuid = { version = "1.10.0", features = ["v4"] } +uuid = { version = "1.11.0", features = ["v4"] } filter-parser = { path = "../filter-parser" } # documents words self-join -itertools = "0.13.0" +itertools = "0.14.0" -csv = "1.3.0" -candle-core = { version = "0.6.0" } -candle-transformers = { version = "0.6.0" } -candle-nn = { version = "0.6.0" } +csv = "1.3.1" +candle-core = { version = "0.8.2" } +candle-transformers = { version = "0.8.2" } +candle-nn = { version = "0.8.2" } tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [ "onig", ] } hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [ "online", ] } -tiktoken-rs = "0.5.9" -liquid = "0.26.6" +tiktoken-rs = "0.6.0" +liquid = "0.26.9" rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [ "serde", "no_module", @@ -88,33 +87,42 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838 "no_time", "sync", ] } -arroy = "0.5.0" +arroy = "0.6.1" rand = "0.8.5" -tracing = "0.1.40" -ureq = { version = "2.10.0", features = ["json"] } -url = "2.5.2" +tracing = "0.1.41" +ureq = { version = "2.12.1", features = ["json"] } +url = "2.5.4" rayon-par-bridge = "0.1.0" -hashbrown = "0.15.0" -raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } +hashbrown = "0.15.2" bumpalo = "3.16.0" +bumparaw-collections = "0.1.4" thread_local = "1.1.8" -allocator-api2 = "0.2.18" -rustc-hash = "2.0.0" +allocator-api2 = "0.2.21" +rustc-hash = "2.1.0" uell = "0.1.0" enum-iterator = "2.1.0" +bbqueue = { git = "https://github.com/meilisearch/bbqueue" } +flume = { version = "0.11.1", default-features = false } +utoipa = { version = "5.3.1", features = [ + "non_strict_integers", + "preserve_order", + "uuid", + "time", + "openapi_extensions", +] } +lru = "0.13.0" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } -insta = "1.39.0" +# fixed version due to format breakages in v1.40 +insta = "=1.39.0" maplit = "1.0.2" md5 = "0.7.0" meili-snap = { path = "../meili-snap" } rand = { version = "0.8.5", features = ["small_rng"] } [features] -all-tokenizations = [ - "charabia/default", -] +all-tokenizations = ["charabia/default"] # Use POSIX semaphores instead of SysV semaphores in LMDB # For more information on this feature, see heed's Cargo.toml diff --git a/crates/milli/README.md b/crates/milli/README.md index 8f04d04dc..101da0684 100644 --- a/crates/milli/README.md +++ b/crates/milli/README.md @@ -1,5 +1,5 @@

- [image: the milli logo]
+ [image: the milli logo]

a concurrent indexer combined with fast and relevant search algorithms

diff --git a/crates/milli/src/asc_desc.rs b/crates/milli/src/asc_desc.rs index bde0dd440..e75adf83d 100644 --- a/crates/milli/src/asc_desc.rs +++ b/crates/milli/src/asc_desc.rs @@ -6,6 +6,7 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use thiserror::Error; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::is_reserved_keyword; use crate::search::facet::BadGeoError; use crate::{CriterionError, Error, UserError}; @@ -175,7 +176,7 @@ impl From for SortError { AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { SortError::BadGeoPointUsage { name } } - AscDescError::ReservedKeyword { name } if &name == "_geo" => { + AscDescError::ReservedKeyword { name } if name == RESERVED_GEO_FIELD_NAME => { SortError::ReservedNameForSettings { name } } AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { diff --git a/crates/milli/src/attribute_patterns.rs b/crates/milli/src/attribute_patterns.rs new file mode 100644 index 000000000..00caa2a6d --- /dev/null +++ b/crates/milli/src/attribute_patterns.rs @@ -0,0 +1,152 @@ +use deserr::Deserr; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +use crate::is_faceted_by; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[repr(transparent)] +#[serde(transparent)] +pub struct AttributePatterns { + #[schema(example = json!(["title", "overview_*", "release_date"]))] + pub patterns: Vec, +} + +impl Deserr for AttributePatterns { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + Vec::::deserialize_from_value(value, location).map(|patterns| Self { patterns }) + } +} + +impl From> for AttributePatterns { + fn from(patterns: Vec) -> Self { + Self { patterns } + } +} + +impl AttributePatterns { + /// Match a string against the attribute patterns using the match_pattern function. + pub fn match_str(&self, str: &str) -> PatternMatch { + let mut pattern_match = PatternMatch::NoMatch; + for pattern in &self.patterns { + match match_pattern(pattern, str) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => pattern_match = PatternMatch::Parent, + PatternMatch::NoMatch => {} + } + } + pattern_match + } +} + +/// Match a string against a pattern. +/// +/// The pattern can be a wildcard, a prefix, a suffix or an exact match. +/// +/// # Arguments +/// +/// * `pattern` - The pattern to match against. +/// * `str` - The string to match against the pattern. 
+fn match_pattern(pattern: &str, str: &str) -> PatternMatch { + // If the pattern is a wildcard, return Match + if pattern == "*" { + return PatternMatch::Match; + } else if pattern.starts_with('*') && pattern.ends_with('*') { + // If the pattern starts and ends with a wildcard, return Match if the string contains the pattern without the wildcards + if str.contains(&pattern[1..pattern.len() - 1]) { + return PatternMatch::Match; + } + } else if let Some(pattern) = pattern.strip_prefix('*') { + // If the pattern starts with a wildcard, return Match if the string ends with the pattern without the wildcard + if str.ends_with(pattern) { + return PatternMatch::Match; + } + } else if let Some(pattern) = pattern.strip_suffix('*') { + // If the pattern ends with a wildcard, return Match if the string starts with the pattern without the wildcard + if str.starts_with(pattern) { + return PatternMatch::Match; + } + } else if pattern == str { + // If the pattern is exactly the string, return Match + return PatternMatch::Match; + } + + // If the field is a parent field of the pattern, return Parent + if is_faceted_by(pattern, str) { + PatternMatch::Parent + } else { + PatternMatch::NoMatch + } +} + +/// Match a field against a pattern using the legacy behavior. +/// +/// A field matches a pattern if it is a parent of the pattern or if it is the pattern itself. +/// This behavior is used to match the sortable attributes, the searchable attributes and the filterable attributes rules `Field`. +/// +/// # Arguments +/// +/// * `pattern` - The pattern to match against. +/// * `field` - The field to match against the pattern. +pub fn match_field_legacy(pattern: &str, field: &str) -> PatternMatch { + if is_faceted_by(field, pattern) { + // If the field matches the pattern or is a nested field of the pattern, return Match (legacy behavior) + PatternMatch::Match + } else if is_faceted_by(pattern, field) { + // If the field is a parent field of the pattern, return Parent + PatternMatch::Parent + } else { + // If the field does not match the pattern and is not a parent of a nested field that matches the pattern, return NoMatch + PatternMatch::NoMatch + } +} + +/// Match a field against a distinct field. 
+pub fn match_distinct_field(distinct_field: Option<&str>, field: &str) -> PatternMatch { + if let Some(distinct_field) = distinct_field { + if field == distinct_field { + // If the field matches exactly the distinct field, return Match + return PatternMatch::Match; + } else if is_faceted_by(distinct_field, field) { + // If the field is a parent field of the distinct field, return Parent + return PatternMatch::Parent; + } + } + // If the field does not match the distinct field and is not a parent of a nested field that matches the distinct field, return NoMatch + PatternMatch::NoMatch +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum PatternMatch { + /// The field is a parent of a nested field that matches the pattern + /// For example, the field is `toto`, and the pattern is `toto.titi` + Parent, + /// The field matches the pattern + Match, + /// The field does not match the pattern + NoMatch, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_match_pattern() { + assert_eq!(match_pattern("*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "test"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atesta"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "atest"), PatternMatch::Match); + assert_eq!(match_pattern("*test*", "testa"), PatternMatch::Match); + assert_eq!(match_pattern("test*test", "test"), PatternMatch::NoMatch); + assert_eq!(match_pattern("*test", "testa"), PatternMatch::NoMatch); + assert_eq!(match_pattern("test*", "atest"), PatternMatch::NoMatch); + } +} diff --git a/crates/milli/src/constants.rs b/crates/milli/src/constants.rs new file mode 100644 index 000000000..39b449661 --- /dev/null +++ b/crates/milli/src/constants.rs @@ -0,0 +1,6 @@ +pub static VERSION_MAJOR: &str = env!("CARGO_PKG_VERSION_MAJOR"); +pub static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR"); +pub static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH"); + +pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; +pub const RESERVED_GEO_FIELD_NAME: &str = "_geo"; diff --git a/crates/milli/src/criterion.rs b/crates/milli/src/criterion.rs index 45cbfe63d..a1487fa79 100644 --- a/crates/milli/src/criterion.rs +++ b/crates/milli/src/criterion.rs @@ -113,6 +113,7 @@ mod tests { use CriterionError::*; use super::*; + use crate::constants::RESERVED_GEO_FIELD_NAME; #[test] fn parse_criterion() { @@ -153,7 +154,7 @@ mod tests { ("price:aasc", InvalidName { name: S("price:aasc") }), ("price:asc and desc", InvalidName { name: S("price:asc and desc") }), ("price:asc:truc", InvalidName { name: S("price:asc:truc") }), - ("_geo:asc", ReservedName { name: S("_geo") }), + ("_geo:asc", ReservedName { name: S(RESERVED_GEO_FIELD_NAME) }), ("_geoDistance:asc", ReservedName { name: S("_geoDistance") }), ("_geoPoint:asc", ReservedNameForSort { name: S("_geoPoint") }), ("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }), diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs new file mode 100644 index 000000000..d97dc13ba --- /dev/null +++ b/crates/milli/src/database_stats.rs @@ -0,0 +1,96 @@ +use heed::types::Bytes; +use heed::Database; +use heed::RoTxn; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, 
Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// The stats of a database. +pub struct DatabaseStats { + /// The number of entries in the database. + number_of_entries: u64, + /// The total size of the keys in the database. + total_key_size: u64, + /// The total size of the values in the database. + total_value_size: u64, +} + +impl DatabaseStats { + /// Returns the stats of the database. + /// + /// This function iterates over the whole database and computes the stats. + /// It is not efficient and should be cached somewhere. + pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { + let mut database_stats = + Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; + + let mut iter = database.iter(rtxn)?; + while let Some((key, value)) = iter.next().transpose()? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + database_stats.total_key_size += key_size; + database_stats.total_value_size += value_size; + } + + database_stats.number_of_entries = database.len(rtxn)?; + + Ok(database_stats) + } + + /// Recomputes the stats of the database and returns the new stats. + /// + /// This function is used to update the stats of the database when some keys are modified. + /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. + pub(crate) fn recompute( + mut stats: Self, + database: Database, + before_rtxn: &RoTxn<'_>, + after_rtxn: &RoTxn<'_>, + modified_keys: I, + ) -> heed::Result + where + I: IntoIterator, + K: AsRef<[u8]>, + { + for key in modified_keys { + let key = key.as_ref(); + if let Some(value) = database.get(after_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_add(key_size); + stats.total_value_size = stats.total_value_size.saturating_add(value_size); + } + + if let Some(value) = database.get(before_rtxn, key)? { + let key_size = key.len() as u64; + let value_size = value.len() as u64; + stats.total_key_size = stats.total_key_size.saturating_sub(key_size); + stats.total_value_size = stats.total_value_size.saturating_sub(value_size); + } + } + + stats.number_of_entries = database.len(after_rtxn)?; + + Ok(stats) + } + + pub fn average_key_size(&self) -> u64 { + self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0) + } + + pub fn average_value_size(&self) -> u64 { + self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0) + } + + pub fn number_of_entries(&self) -> u64 { + self.number_of_entries + } + + pub fn total_key_size(&self) -> u64 { + self.total_key_size + } + + pub fn total_value_size(&self) -> u64 { + self.total_value_size + } +} diff --git a/crates/milli/src/documents/mod.rs b/crates/milli/src/documents/mod.rs index 88fa38d30..f43f7e842 100644 --- a/crates/milli/src/documents/mod.rs +++ b/crates/milli/src/documents/mod.rs @@ -80,9 +80,13 @@ impl DocumentsBatchIndex { let mut map = Object::new(); for (k, v) in document.iter() { - // TODO: TAMO: update the error type - let key = - self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone(); + let key = self + .0 + .get_by_left(&k) + .ok_or(crate::error::InternalError::FieldIdMapMissingEntry( + FieldIdMapMissingEntry::FieldId { field_id: k, process: "recreate_json" }, + ))? 
+ .clone(); let value = serde_json::from_slice::(v) .map_err(crate::error::InternalError::SerdeJson)?; map.insert(key, value); diff --git a/crates/milli/src/documents/primary_key.rs b/crates/milli/src/documents/primary_key.rs index fb8b3d027..415453349 100644 --- a/crates/milli/src/documents/primary_key.rs +++ b/crates/milli/src/documents/primary_key.rs @@ -271,7 +271,7 @@ fn fetch_matching_values_in_object( } fn starts_with(selector: &str, key: &str) -> bool { - selector.strip_prefix(key).map_or(false, |tail| { + selector.strip_prefix(key).is_some_and(|tail| { tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) }) } @@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool { pub fn validate_document_id_str(document_id: &str) -> Option<&str> { if document_id.is_empty() - || document_id.len() > 512 + || document_id.len() >= 512 || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') { None diff --git a/crates/milli/src/documents/serde_impl.rs b/crates/milli/src/documents/serde_impl.rs index e9fc541e5..55eeb52f1 100644 --- a/crates/milli/src/documents/serde_impl.rs +++ b/crates/milli/src/documents/serde_impl.rs @@ -27,7 +27,7 @@ impl<'a, W> DocumentVisitor<'a, W> { } } -impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { +impl<'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'_, W> { /// This Visitor value is nothing, since it write the value to a file. type Value = Result<(), Error>; @@ -61,7 +61,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { } } -impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +impl<'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'_, W> where W: Write, { diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 6c60dcecc..e0d48e0ac 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -1,20 +1,25 @@ use std::collections::BTreeSet; +use std::collections::HashMap; use std::convert::Infallible; use std::fmt::Write; use std::{io, str}; +use bstr::BString; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use rhai::EvalAltResult; use serde_json::Value; use thiserror::Error; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::{self, DocumentsBatchCursorError}; use crate::thread_pool_no_abort::PanicCatched; +use crate::vector::settings::EmbeddingSettings; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { - ["_geo", "_geoDistance", "_geoPoint", "_geoRadius", "_geoBoundingBox"].contains(&keyword) + [RESERVED_GEO_FIELD_NAME, "_geoDistance", "_geoPoint", "_geoRadius", "_geoBoundingBox"] + .contains(&keyword) } #[derive(Error, Debug)] @@ -29,8 +34,6 @@ pub enum Error { #[derive(Error, Debug)] pub enum InternalError { - #[error("{}", HeedError::DatabaseClosing)] - DatabaseClosing, #[error("missing {} in the {db_name} database", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, #[error("missing {key} in the fieldids weights mapping")] @@ -61,12 +64,18 @@ pub enum InternalError { Serialization(#[from] SerializationError), #[error(transparent)] Store(#[from] MdbError), + #[error("Cannot delete {key:?} from database {database_name}: {error}")] + StoreDeletion { database_name: &'static str, key: BString, error: heed::Error }, + #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] + StorePut { 
database_name: &'static str, key: BString, value_length: usize, error: heed::Error }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] AbortedIndexation, #[error("The matching words list contains at least one invalid member")] InvalidMatchingWords, + #[error("Cannot upgrade to the following version: v{0}.{1}.{2}.")] + CannotUpgradeToVersion(u32, u32, u32), #[error(transparent)] ArroyError(#[from] arroy::Error), #[error(transparent)] @@ -109,16 +118,40 @@ pub enum UserError { "Document identifier `{}` is invalid. \ A document identifier can be of type integer or string, \ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ -and can not be more than 512 bytes.", .document_id.to_string() +and can not be more than 511 bytes.", .document_id.to_string() )] InvalidDocumentId { document_id: Value }, - #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] + #[error("Invalid facet distribution: {}", + if .invalid_facets_name.len() == 1 { + let field = .invalid_facets_name.iter().next().unwrap(); + match .matching_rule_indices.get(field) { + Some(rule_index) => format!("Attribute `{}` matched rule #{} in filterableAttributes, but this rule does not enable filtering.\nHint: enable filtering in rule #{} by modifying the features.filter object\nHint: prepend another rule matching `{}` with appropriate filter features before rule #{}", + field, rule_index, rule_index, field, rule_index), + None => match .valid_patterns.is_empty() { + true => format!("Attribute `{}` is not filterable. This index does not have configured filterable attributes.", field), + false => format!("Attribute `{}` is not filterable. Available filterable attributes patterns are: `{}`.", + field, + .valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", ")), + } + } + } else { + format!("Attributes `{}` are not filterable. {}", + .invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "), + match .valid_patterns.is_empty() { + true => "This index does not have configured filterable attributes.".to_string(), + false => format!("Available filterable attributes patterns are: `{}`.", + .valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", ")), + } + ) + } + )] InvalidFacetsDistribution { invalid_facets_name: BTreeSet, - valid_facets_name: BTreeSet, + valid_patterns: BTreeSet, + matching_rule_indices: HashMap, }, #[error(transparent)] - InvalidGeoField(#[from] GeoError), + InvalidGeoField(#[from] Box), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. 
Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] @@ -127,8 +160,20 @@ and can not be more than 512 bytes.", .document_id.to_string() InvalidVectorsEmbedderConf { document_id: String, error: String }, #[error("{0}")] InvalidFilter(String), - #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] + #[error("Invalid type for filter subexpression: expected: {}, found: {}.", .0.join(", "), .1)] InvalidFilterExpression(&'static [&'static str], Value), + #[error("Filter operator `{operator}` is not allowed for the attribute `{field}`.\n - Note: allowed operators: {}.\n - Note: field `{field}` matched rule #{rule_index} in `filterableAttributes`\n - Hint: enable {} in rule #{rule_index} by modifying the features.filter object\n - Hint: prepend another rule matching `{field}` with appropriate filter features before rule #{rule_index}", + allowed_operators.join(", "), + if operator == "=" || operator == "!=" || operator == "IN" {"equality"} + else if operator == "<" || operator == ">" || operator == "<=" || operator == ">=" || operator == "TO" {"comparison"} + else {"the appropriate filter operators"} + )] + FilterOperatorNotAllowed { + field: String, + allowed_operators: Vec, + operator: String, + rule_index: usize, + }, #[error("Attribute `{}` is not sortable. {}", .field, match .valid_fields.is_empty() { @@ -142,29 +187,51 @@ and can not be more than 512 bytes.", .document_id.to_string() InvalidSortableAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}", .field, - match .valid_fields.is_empty() { - true => "This index does not have configured filterable attributes.".to_string(), - false => format!("Available filterable attributes are: `{}{}`.", - valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + match (.valid_patterns.is_empty(), .matching_rule_index) { + // No rules match and no filterable attributes + (true, None) => "This index does not have configured filterable attributes.".to_string(), + + // No rules match but there are some filterable attributes + (false, None) => format!("Available filterable attributes patterns are: `{}{}`.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", "), .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), ), + + // A rule matched but filtering isn't enabled + (_, Some(rule_index)) => format!("Note: this attribute matches rule #{} in filterableAttributes, but this rule does not enable filtering.\nHint: enable filtering in rule #{} by adding appropriate filter features.\nHint: prepend another rule matching {} with filter features before rule #{}", + rule_index, rule_index, .field, rule_index + ), } )] - InvalidDistinctAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, + InvalidDistinctAttribute { + field: String, + valid_patterns: BTreeSet, + hidden_fields: bool, + matching_rule_index: Option, + }, #[error("Attribute `{}` is not facet-searchable. {}", .field, - match .valid_fields.is_empty() { - true => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(), - false => format!("Available facet-searchable attributes are: `{}{}`. 
To make it facet-searchable add it to the `filterableAttributes` index settings.", - valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "), + match (.valid_patterns.is_empty(), .matching_rule_index) { + // No rules match and no facet searchable attributes + (true, None) => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(), + + // No rules match but there are some facet searchable attributes + (false, None) => format!("Available facet-searchable attributes patterns are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", "), .hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""), ), + + // A rule matched but facet search isn't enabled + (_, Some(rule_index)) => format!("Note: this attribute matches rule #{} in filterableAttributes, but this rule does not enable facetSearch.\nHint: enable facetSearch in rule #{} by adding `\"facetSearch\": true` to the rule.\nHint: prepend another rule matching {} with facetSearch: true before rule #{}", + rule_index, rule_index, .field, rule_index + ), } )] InvalidFacetSearchFacetName { field: String, - valid_fields: BTreeSet, + valid_patterns: BTreeSet, hidden_fields: bool, + matching_rule_index: Option, }, #[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.", .field, @@ -176,8 +243,8 @@ and can not be more than 512 bytes.", .document_id.to_string() valid_fields: BTreeSet, hidden_fields: bool, }, - #[error("an environment is already opened with different options")] - InvalidLmdbOpenOptions, + #[error("An LMDB environment is already opened")] + EnvAlreadyOpened, #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")] SortRankingRuleMissing, #[error("The database file is in an invalid state.")] @@ -215,31 +282,57 @@ and can not be more than 512 bytes.", .document_id.to_string() #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), #[error("Cannot find embedder with name `{0}`.")] - InvalidEmbedder(String), + InvalidSearchEmbedder(String), + #[error("Cannot find embedder with name `{0}`.")] + InvalidSimilarEmbedder(String), #[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")] TooManyVectors(String, usize), - #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). 
Available fields: {}", - allowed_sources_for_field - .iter() - .map(|accepted| format!("`{}`", accepted)) - .collect::>() - .join(", "), - allowed_fields_for_source - .iter() - .map(|accepted| format!("`{}`", accepted)) - .collect::>() - .join(", ") + #[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}`{for_context}.{available_sources}{available_fields}{available_contexts}", + field=field.name(), + for_context={ + context.in_context() + }, + available_sources={ + let allowed_sources_for_field = EmbeddingSettings::allowed_sources_for_field(*field, *context); + if allowed_sources_for_field.is_empty() { + String::new() + } else { + format!("\n - note: `{}` is available for sources: {}", + field.name(), + allowed_sources_for_field + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", "), + ) + } + }, + available_fields={ + let allowed_fields_for_source = EmbeddingSettings::allowed_fields_for_source(*source_, *context); + format!("\n - note: available fields for source `{source_}`{}: {}",context.in_context(), allowed_fields_for_source + .iter() + .map(|accepted| format!("`{}`", accepted)) + .collect::>() + .join(", "),) + }, + available_contexts={ + let available_not_nested = !matches!(EmbeddingSettings::field_status(*source_, *field, crate::vector::settings::NestingContext::NotNested), crate::vector::settings::FieldStatus::Disallowed); + if available_not_nested { + format!("\n - note: `{}` is available when source `{source_}` is not{}", field.name(), context.in_context()) + } else { + String::new() + } + } )] InvalidFieldForSource { embedder_name: String, source_: crate::vector::settings::EmbedderSource, - field: &'static str, - allowed_fields_for_source: &'static [&'static str], - allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource], + context: crate::vector::settings::NestingContext, + field: crate::vector::settings::MetaEmbeddingSetting, }, #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())] InvalidOpenAiModel { embedder_name: String, model: String }, - #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")] + #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source `{source_}`)")] MissingFieldForSource { field: &'static str, source_: crate::vector::settings::EmbedderSource, @@ -259,6 +352,15 @@ and can not be more than 512 bytes.", .document_id.to_string() dimensions: usize, max_dimensions: usize, }, + #[error("`.embedders.{embedder_name}.source`: Source `{source_}` is not available in a nested embedder")] + InvalidSourceForNested { + embedder_name: String, + source_: crate::vector::settings::EmbedderSource, + }, + #[error("`.embedders.{embedder_name}`: Missing field `source`.\n - note: this field is mandatory for nested embedders")] + MissingSourceForNested { embedder_name: String }, + #[error("`.embedders.{embedder_name}`: {message}")] + InvalidSettingsEmbedder { embedder_name: String, message: String }, #[error("`.embedders.{embedder_name}.dimensions`: `dimensions` cannot be zero")] InvalidSettingsDimensions { embedder_name: String }, #[error( @@ -306,7 +408,8 @@ impl From for Error { | arroy::Error::UnmatchingDistance { .. } | arroy::Error::NeedBuild(_) | arroy::Error::MissingKey { .. 
} - | arroy::Error::MissingMetadata(_) => { + | arroy::Error::MissingMetadata(_) + | arroy::Error::CannotDecodeKeyMode { .. } => { Error::InternalError(InternalError::ArroyError(value)) } } @@ -333,45 +436,53 @@ pub enum GeoError { BadLongitude { document_id: Value, value: Value }, } +#[allow(dead_code)] fn format_invalid_filter_distribution( invalid_facets_name: &BTreeSet, - valid_facets_name: &BTreeSet, + valid_patterns: &BTreeSet, ) -> String { - if valid_facets_name.is_empty() { - return "this index does not have configured filterable attributes.".into(); - } - let mut result = String::new(); - match invalid_facets_name.len() { - 0 => (), - 1 => write!( - result, - "attribute `{}` is not filterable.", - invalid_facets_name.first().unwrap() - ) - .unwrap(), - _ => write!( - result, - "attributes `{}` are not filterable.", - invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") - ) - .unwrap(), - }; + if invalid_facets_name.is_empty() { + if valid_patterns.is_empty() { + return "this index does not have configured filterable attributes.".into(); + } + } else { + match invalid_facets_name.len() { + 1 => write!( + result, + "Attribute `{}` is not filterable.", + invalid_facets_name.first().unwrap() + ) + .unwrap(), + _ => write!( + result, + "Attributes `{}` are not filterable.", + invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") + ) + .unwrap(), + }; + } - match valid_facets_name.len() { - 1 => write!( - result, - " The available filterable attribute is `{}`.", - valid_facets_name.first().unwrap() - ) - .unwrap(), - _ => write!( - result, - " The available filterable attributes are `{}`.", - valid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") - ) - .unwrap(), + if valid_patterns.is_empty() { + if !invalid_facets_name.is_empty() { + write!(result, " This index does not have configured filterable attributes.").unwrap(); + } + } else { + match valid_patterns.len() { + 1 => write!( + result, + " Available filterable attributes patterns are: `{}`.", + valid_patterns.first().unwrap() + ) + .unwrap(), + _ => write!( + result, + " Available filterable attributes patterns are: `{}`.", + valid_patterns.iter().map(AsRef::as_ref).collect::>().join(", ") + ) + .unwrap(), + } } result @@ -383,7 +494,7 @@ fn format_invalid_filter_distribution( /// ```ignore /// impl From for Error { /// fn from(error: FieldIdMapMissingEntry) -> Error { -/// Error::from(InternalError::from(error)) +/// Error::from(::from(error)) /// } /// } /// ``` @@ -408,7 +519,7 @@ error_from_sub_error! { str::Utf8Error => InternalError, ThreadPoolBuildError => InternalError, SerializationError => InternalError, - GeoError => UserError, + Box => UserError, CriterionError => UserError, } @@ -460,8 +571,7 @@ impl From for Error { // TODO use the encoding HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), - HeedError::DatabaseClosing => InternalError(DatabaseClosing), - HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), + HeedError::EnvAlreadyOpened { .. } => UserError(EnvAlreadyOpened), } } } diff --git a/crates/milli/src/external_documents_ids.rs b/crates/milli/src/external_documents_ids.rs index f47df0762..755b801ec 100644 --- a/crates/milli/src/external_documents_ids.rs +++ b/crates/milli/src/external_documents_ids.rs @@ -25,7 +25,7 @@ impl ExternalDocumentsIds { /// Returns `true` if hard and soft external documents lists are empty. 
pub fn is_empty(&self, rtxn: &RoTxn<'_>) -> heed::Result { - self.0.is_empty(rtxn).map_err(Into::into) + self.0.is_empty(rtxn) } pub fn get>( diff --git a/crates/milli/src/fieldids_weights_map.rs b/crates/milli/src/fieldids_weights_map.rs index 13f2f8afc..0c57ba109 100644 --- a/crates/milli/src/fieldids_weights_map.rs +++ b/crates/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] @@ -43,11 +43,6 @@ impl FieldidsWeightsMap { self.map.get(&fid).copied() } - /// Returns highest weight contained in the map if any. - pub fn max_weight(&self) -> Option { - self.map.values().copied().max() - } - /// Return an iterator visiting all field ids in arbitrary order. pub fn ids(&self) -> impl Iterator + '_ { self.map.keys().copied() diff --git a/crates/milli/src/fields_ids_map/global.rs b/crates/milli/src/fields_ids_map/global.rs index 2ffc45eb7..6d7cf6caf 100644 --- a/crates/milli/src/fields_ids_map/global.rs +++ b/crates/milli/src/fields_ids_map/global.rs @@ -105,9 +105,21 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> { self.local.name(id) } + + /// Get the metadata of a field based on its id. + pub fn metadata(&mut self, id: FieldId) -> Option { + if self.local.metadata(id).is_none() { + let global = self.global.read().unwrap(); + + let (name, metadata) = global.name_with_metadata(id)?; + self.local.insert(name, id, metadata); + } + + self.local.metadata(id) + } } -impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> { +impl MutFieldIdMapper for GlobalFieldsIdsMap<'_> { fn insert(&mut self, name: &str) -> Option { self.id_or_insert(name) } diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 65a1111fa..89b0a446b 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -5,14 +5,29 @@ use charabia::Language; use heed::RoTxn; use super::FieldsIdsMap; -use crate::{FieldId, Index, LocalizedAttributesRule, Result}; +use crate::attribute_patterns::{match_field_legacy, PatternMatch}; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::{ + is_faceted_by, FieldId, FilterableAttributesFeatures, FilterableAttributesRule, Index, + LocalizedAttributesRule, Result, Weight, +}; #[derive(Debug, Clone, Copy)] pub struct Metadata { - pub searchable: bool, - pub filterable: bool, + /// The weight as defined in the FieldidsWeightsMap of the searchable attribute if it is searchable. + pub searchable: Option, + /// The field is part of the sortable attributes. pub sortable: bool, - localized_attributes_rule_id: Option, + /// The field is defined as the distinct attribute. + pub distinct: bool, + /// The field has been defined as asc/desc in the ranking rules. + pub asc_desc: bool, + /// The field is a geo field (`_geo`, `_geo.lat`, `_geo.lng`). + pub geo: bool, + /// The id of the localized attributes rule if the field is localized. + pub localized_attributes_rule_id: Option, + /// The id of the filterable attributes rule if the field is filterable. 
+ pub filterable_attributes_rule_id: Option, } #[derive(Debug, Clone)] @@ -106,76 +121,227 @@ impl Metadata { let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap(); Some(rule.locales()) } + + pub fn filterable_attributes<'rules>( + &self, + rules: &'rules [FilterableAttributesRule], + ) -> Option<&'rules FilterableAttributesRule> { + self.filterable_attributes_with_rule_index(rules).map(|(_, rule)| rule) + } + + pub fn filterable_attributes_with_rule_index<'rules>( + &self, + rules: &'rules [FilterableAttributesRule], + ) -> Option<(usize, &'rules FilterableAttributesRule)> { + let filterable_attributes_rule_id = self.filterable_attributes_rule_id?.get(); + let rule_id = (filterable_attributes_rule_id - 1) as usize; + let rule = rules.get(rule_id).unwrap(); + Some((rule_id, rule)) + } + + pub fn filterable_attributes_features( + &self, + rules: &[FilterableAttributesRule], + ) -> FilterableAttributesFeatures { + let (_, features) = self.filterable_attributes_features_with_rule_index(rules); + features + } + + pub fn filterable_attributes_features_with_rule_index( + &self, + rules: &[FilterableAttributesRule], + ) -> (Option, FilterableAttributesFeatures) { + self.filterable_attributes_with_rule_index(rules) + .map(|(rule_index, rule)| (Some(rule_index), rule.features())) + // if there is no filterable attributes rule, return no features + .unwrap_or_else(|| (None, FilterableAttributesFeatures::no_features())) + } + + pub fn is_sortable(&self) -> bool { + self.sortable + } + + pub fn is_searchable(&self) -> bool { + self.searchable.is_some() + } + + pub fn searchable_weight(&self) -> Option { + self.searchable + } + + pub fn is_distinct(&self) -> bool { + self.distinct + } + + pub fn is_asc_desc(&self) -> bool { + self.asc_desc + } + + pub fn is_geo(&self) -> bool { + self.geo + } + + /// Returns `true` if the field is part of the facet databases. (sortable, distinct, asc_desc, filterable or facet searchable) + pub fn is_faceted(&self, rules: &[FilterableAttributesRule]) -> bool { + if self.is_distinct() || self.is_sortable() || self.is_asc_desc() { + return true; + } + + let features = self.filterable_attributes_features(rules); + if features.is_filterable() || features.is_facet_searchable() { + return true; + } + + false + } + + pub fn require_facet_level_database(&self, rules: &[FilterableAttributesRule]) -> bool { + let features = self.filterable_attributes_features(rules); + + self.is_sortable() || self.is_asc_desc() || features.is_filterable_comparison() + } } #[derive(Debug, Clone)] pub struct MetadataBuilder { - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, } impl MetadataBuilder { pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result { - let searchable_attributes = - index.searchable_fields(rtxn)?.into_iter().map(|s| s.to_string()).collect(); - let filterable_attributes = index.filterable_fields(rtxn)?; + let searchable_attributes = index + .user_defined_searchable_fields(rtxn)? 
+ .map(|fields| fields.into_iter().map(|s| s.to_string()).collect()); + let filterable_attributes = index.filterable_attributes_rules(rtxn)?; let sortable_attributes = index.sortable_fields(rtxn)?; let localized_attributes = index.localized_attributes_rules(rtxn)?; + let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string()); + let asc_desc_attributes = index.asc_desc_fields(rtxn)?; - Ok(Self { + Ok(Self::new( searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, - }) + distinct_attribute, + asc_desc_attributes, + )) } + /// Build a new `MetadataBuilder` from the given parameters. + /// + /// This is used for testing, prefer using `MetadataBuilder::from_index` instead. pub fn new( - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, ) -> Self { + let searchable_attributes = match searchable_attributes { + Some(fields) if fields.iter().any(|f| f == "*") => None, + Some(fields) => Some(fields), + None => None, + }; + Self { searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, + distinct_attribute, + asc_desc_attributes, } } pub fn metadata_for_field(&self, field: &str) -> Metadata { - let searchable = self - .searchable_attributes + if is_faceted_by(field, RESERVED_VECTORS_FIELD_NAME) { + // Vectors fields are not searchable, filterable, distinct or asc_desc + return Metadata { + searchable: None, + sortable: false, + distinct: false, + asc_desc: false, + geo: false, + localized_attributes_rule_id: None, + filterable_attributes_rule_id: None, + }; + } + + // A field is sortable if it is faceted by a sortable attribute + let sortable = self + .sortable_attributes .iter() - .any(|attribute| attribute == "*" || attribute == field); + .any(|pattern| match_field_legacy(pattern, field) == PatternMatch::Match); - let filterable = self.filterable_attributes.contains(field); + let filterable_attributes_rule_id = self + .filterable_attributes + .iter() + .position(|attribute| attribute.match_str(field) == PatternMatch::Match) + // saturating_add(1): make `id` `NonZero` + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - let sortable = self.sortable_attributes.contains(field); + if match_field_legacy(RESERVED_GEO_FIELD_NAME, field) == PatternMatch::Match { + // Geo fields are not searchable, distinct or asc_desc + return Metadata { + searchable: None, + sortable, + distinct: false, + asc_desc: false, + geo: true, + localized_attributes_rule_id: None, + filterable_attributes_rule_id, + }; + } + + let searchable = match &self.searchable_attributes { + // A field is searchable if it is faceted by a searchable attribute + Some(attributes) => attributes + .iter() + .enumerate() + .find(|(_i, pattern)| is_faceted_by(field, pattern)) + .map(|(i, _)| i as u16), + None => Some(0), + }; + + let distinct = + self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field); + let asc_desc = self.asc_desc_attributes.contains(field); let localized_attributes_rule_id = self .localized_attributes .iter() .flat_map(|v| v.iter()) - .position(|rule| rule.match_str(field)) + .position(|rule| rule.match_str(field) == PatternMatch::Match) // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - Metadata { searchable, 
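Both `localized_attributes_rule_id` and `filterable_attributes_rule_id` store the matching rule's position plus one, so the value fits a `NonZeroU16` and `Option<NonZeroU16>` takes no more space than a bare `u16`. A small self-contained sketch of that offset encoding; the helper names are illustrative:

```rust
use std::num::NonZeroU16;

// Store a rule's position in the rules array as `position + 1`, so the value
// is never zero and `Option<NonZeroU16>` costs no extra space.
fn encode_rule_id(position: usize) -> NonZeroU16 {
    NonZeroU16::new(position.saturating_add(1).try_into().unwrap()).unwrap()
}

// Recover the original array index by undoing the +1 offset.
fn decode_rule_id(id: NonZeroU16) -> usize {
    (id.get() - 1) as usize
}

fn main() {
    // The niche optimization keeps the optional id as small as a plain u16.
    assert_eq!(std::mem::size_of::<Option<NonZeroU16>>(), std::mem::size_of::<u16>());
    for position in [0usize, 1, 41] {
        assert_eq!(decode_rule_id(encode_rule_id(position)), position);
    }
}
```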
filterable, sortable, localized_attributes_rule_id } + Metadata { + searchable, + sortable, + distinct, + asc_desc, + geo: false, + localized_attributes_rule_id, + filterable_attributes_rule_id, + } } - pub fn searchable_attributes(&self) -> &[String] { - self.searchable_attributes.as_slice() + pub fn searchable_attributes(&self) -> Option<&[String]> { + self.searchable_attributes.as_deref() } pub fn sortable_attributes(&self) -> &HashSet { &self.sortable_attributes } - pub fn filterable_attributes(&self) -> &HashSet { + pub fn filterable_attributes(&self) -> &[FilterableAttributesRule] { &self.filterable_attributes } diff --git a/crates/milli/src/filterable_attributes_rules.rs b/crates/milli/src/filterable_attributes_rules.rs new file mode 100644 index 000000000..53af30fd6 --- /dev/null +++ b/crates/milli/src/filterable_attributes_rules.rs @@ -0,0 +1,368 @@ +use deserr::{DeserializeError, Deserr, ValuePointerRef}; +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeSet, HashSet}; +use utoipa::ToSchema; + +use crate::{ + attribute_patterns::{match_distinct_field, match_field_legacy, PatternMatch}, + constants::RESERVED_GEO_FIELD_NAME, + AttributePatterns, +}; + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, ToSchema)] +#[serde(untagged)] +pub enum FilterableAttributesRule { + Field(String), + Pattern(FilterableAttributesPatterns), +} + +impl FilterableAttributesRule { + /// Match a field against the filterable attributes rule. + pub fn match_str(&self, field: &str) -> PatternMatch { + match self { + // If the rule is a field, match the field against the pattern using the legacy behavior + FilterableAttributesRule::Field(pattern) => match_field_legacy(pattern, field), + // If the rule is a pattern, match the field against the pattern using the new behavior + FilterableAttributesRule::Pattern(patterns) => patterns.match_str(field), + } + } + + /// Check if the rule is a geo field. + /// + /// prefer using `index.is_geo_enabled`, `index.is_geo_filtering_enabled` or `index.is_geo_sorting_enabled` + /// to check if the geo feature is enabled. + pub fn has_geo(&self) -> bool { + matches!(self, FilterableAttributesRule::Field(field_name) if field_name == RESERVED_GEO_FIELD_NAME) + } + + /// Get the features of the rule. 
+ pub fn features(&self) -> FilterableAttributesFeatures { + match self { + // If the rule is a field, return the legacy default features + FilterableAttributesRule::Field(_) => FilterableAttributesFeatures::legacy_default(), + // If the rule is a pattern, return the features of the pattern + FilterableAttributesRule::Pattern(patterns) => patterns.features(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterableAttributesPatterns { + pub attribute_patterns: AttributePatterns, + #[serde(default)] + #[deserr(default)] + pub features: FilterableAttributesFeatures, +} + +impl FilterableAttributesPatterns { + pub fn match_str(&self, field: &str) -> PatternMatch { + self.attribute_patterns.match_str(field) + } + + pub fn features(&self) -> FilterableAttributesFeatures { + self.features + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +#[derive(Default)] +pub struct FilterableAttributesFeatures { + #[serde(default)] + #[deserr(default)] + facet_search: bool, + #[serde(default)] + #[deserr(default)] + filter: FilterFeatures, +} + +impl FilterableAttributesFeatures { + /// Create a new `FilterableAttributesFeatures` with the legacy default features. + /// + /// This is the default behavior for `FilterableAttributesRule::Field`. + /// This will set the facet search to true and activate all the filter operators. + pub fn legacy_default() -> Self { + Self { facet_search: true, filter: FilterFeatures::legacy_default() } + } + + /// Create a new `FilterableAttributesFeatures` with no features. 
+ pub fn no_features() -> Self { + Self { facet_search: false, filter: FilterFeatures::no_features() } + } + + pub fn is_filterable(&self) -> bool { + self.filter.is_filterable() + } + + /// Check if `IS EMPTY` is allowed + pub fn is_filterable_empty(&self) -> bool { + self.filter.is_filterable_empty() + } + + /// Check if `=` and `IN` are allowed + pub fn is_filterable_equality(&self) -> bool { + self.filter.is_filterable_equality() + } + + /// Check if `IS NULL` is allowed + pub fn is_filterable_null(&self) -> bool { + self.filter.is_filterable_null() + } + + /// Check if `IS EXISTS` is allowed + pub fn is_filterable_exists(&self) -> bool { + self.filter.is_filterable_exists() + } + + /// Check if `<`, `>`, `<=`, `>=` or `TO` are allowed + pub fn is_filterable_comparison(&self) -> bool { + self.filter.is_filterable_comparison() + } + + /// Check if the facet search is allowed + pub fn is_facet_searchable(&self) -> bool { + self.facet_search + } + + pub fn allowed_filter_operators(&self) -> Vec { + self.filter.allowed_operators() + } +} + +impl Deserr for FilterableAttributesRule { + fn deserialize_from_value( + value: deserr::Value, + location: ValuePointerRef, + ) -> Result { + if value.kind() == deserr::ValueKind::Map { + Ok(Self::Pattern(FilterableAttributesPatterns::deserialize_from_value( + value, location, + )?)) + } else { + Ok(Self::Field(String::deserialize_from_value(value, location)?)) + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct FilterFeatures { + #[serde(default = "default_true")] + #[deserr(default = true)] + equality: bool, + #[serde(default)] + #[deserr(default)] + comparison: bool, +} + +fn default_true() -> bool { + true +} + +impl FilterFeatures { + /// Get the allowed operators for the filter. + pub fn allowed_operators(&self) -> Vec { + if !self.is_filterable() { + return vec![]; + } + + let mut operators = vec!["OR", "AND", "NOT"]; + if self.is_filterable_equality() { + operators.extend_from_slice(&["=", "!=", "IN"]); + } + if self.is_filterable_comparison() { + operators.extend_from_slice(&["<", ">", "<=", ">=", "TO"]); + } + if self.is_filterable_empty() { + operators.push("IS EMPTY"); + } + if self.is_filterable_null() { + operators.push("IS NULL"); + } + if self.is_filterable_exists() { + operators.push("EXISTS"); + } + + operators.into_iter().map(String::from).collect() + } + + pub fn is_filterable(&self) -> bool { + self.equality || self.comparison + } + + pub fn is_filterable_equality(&self) -> bool { + self.equality + } + + /// Check if `<`, `>`, `<=`, `>=` or `TO` are allowed + pub fn is_filterable_comparison(&self) -> bool { + self.comparison + } + + /// Check if `IS EMPTY` is allowed + pub fn is_filterable_empty(&self) -> bool { + self.is_filterable() + } + + /// Check if `IS EXISTS` is allowed + pub fn is_filterable_exists(&self) -> bool { + self.is_filterable() + } + + /// Check if `IS NULL` is allowed + pub fn is_filterable_null(&self) -> bool { + self.is_filterable() + } + + /// Create a new `FilterFeatures` with the legacy default features. + /// + /// This is the default behavior for `FilterableAttributesRule::Field`. + /// This will set the equality and comparison to true. + pub fn legacy_default() -> Self { + Self { equality: true, comparison: true } + } + + /// Create a new `FilterFeatures` with no features. 
+ pub fn no_features() -> Self { + Self { equality: false, comparison: false } + } +} + +impl Default for FilterFeatures { + fn default() -> Self { + Self { equality: true, comparison: false } + } +} + +/// Match a field against a set of filterable attributes rules. +/// +/// This function will return the set of patterns that match the given filter. +/// +/// # Arguments +/// +/// * `filterable_attributes` - The set of filterable attributes rules to match against. +/// * `filter` - The filter function to apply to the filterable attributes rules. +pub fn filtered_matching_patterns<'patterns>( + filterable_attributes: &'patterns [FilterableAttributesRule], + filter: &impl Fn(FilterableAttributesFeatures) -> bool, +) -> BTreeSet<&'patterns str> { + let mut result = BTreeSet::new(); + + for rule in filterable_attributes { + if filter(rule.features()) { + match rule { + FilterableAttributesRule::Field(field) => { + result.insert(field.as_str()); + } + FilterableAttributesRule::Pattern(patterns) => { + patterns.attribute_patterns.patterns.iter().for_each(|pattern| { + result.insert(pattern); + }); + } + } + } + } + + result +} + +/// Match a field against a set of filterable attributes rules. +/// +/// This function will return the features that match the given field name. +/// +/// # Arguments +/// +/// * `field_name` - The field name to match against. +/// * `filterable_attributes` - The set of filterable attributes rules to match against. +/// +/// # Returns +/// +/// * `Some((rule_index, features))` - The features of the matching rule and the index of the rule in the `filterable_attributes` array. +/// * `None` - No matching rule was found. +pub fn matching_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesRule], +) -> Option<(usize, FilterableAttributesFeatures)> { + for (id, filterable_attribute) in filterable_attributes.iter().enumerate() { + if filterable_attribute.match_str(field_name) == PatternMatch::Match { + return Some((id, filterable_attribute.features())); + } + } + None +} + +/// Match a field against a set of filterable, facet searchable fields, distinct field, sortable fields, and asc_desc fields. 
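To make the two rule shapes concrete, here is a sketch of how the new filterable attribute rules behave, assuming the types re-exported by this diff (`FilterableAttributesRule`, `FilterableAttributesPatterns`, `FilterableAttributesFeatures`, `AttributePatterns`, `PatternMatch`); the attribute names are examples only:

```rust
use milli::{
    AttributePatterns, FilterableAttributesFeatures, FilterableAttributesPatterns,
    FilterableAttributesRule, PatternMatch,
};

fn main() {
    // A plain string rule keeps the legacy behavior: facet search enabled and
    // every filter operator allowed.
    let legacy = FilterableAttributesRule::Field("genre".to_string());
    assert!(legacy.match_str("genre") == PatternMatch::Match);
    assert!(legacy.features().is_facet_searchable());
    assert!(legacy.features().is_filterable_comparison());

    // `has_geo` only reports the plain string form of the reserved `_geo` field.
    assert!(FilterableAttributesRule::Field("_geo".to_string()).has_geo());

    // A pattern rule opts into per-feature control; the derived defaults only
    // allow equality filters and disable facet search.
    let pattern = FilterableAttributesRule::Pattern(FilterableAttributesPatterns {
        attribute_patterns: AttributePatterns::from(vec!["price*".to_string()]),
        features: FilterableAttributesFeatures::default(),
    });
    assert!(pattern.match_str("price_usd") == PatternMatch::Match);
    assert!(!pattern.features().is_facet_searchable());
    assert!(pattern.features().is_filterable_equality());
    assert!(!pattern.features().is_filterable_comparison());
}
```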
+pub fn match_faceted_field( + field_name: &str, + filterable_fields: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, +) -> PatternMatch { + // Check if the field matches any filterable or facet searchable field + let mut selection = match_pattern_by_features(field_name, filterable_fields, &|features| { + features.is_facet_searchable() || features.is_filterable() + }); + + // If the field matches the pattern, return Match + if selection == PatternMatch::Match { + return selection; + } + + match match_distinct_field(distinct_field.as_deref(), field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + + // Otherwise, check if the field matches any sortable/asc_desc field + for pattern in sortable_fields.iter().chain(asc_desc_fields.iter()) { + match match_field_legacy(pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), + } + } + + selection +} + +fn match_pattern_by_features( + field_name: &str, + filterable_attributes: &[FilterableAttributesRule], + filter: &impl Fn(FilterableAttributesFeatures) -> bool, +) -> PatternMatch { + let mut selection = PatternMatch::NoMatch; + + // `can_match` becomes false if the field name matches (PatternMatch::Match) any pattern that is not facet searchable or filterable, + // this ensures that the field doesn't match a pattern with a lower priority, however it can still match a pattern for a nested field as a parent (PatternMatch::Parent). + // See the test `search::filters::test_filterable_attributes_priority` for more details. + let mut can_match = true; + + // Check if the field name matches any pattern that is facet searchable or filterable + for pattern in filterable_attributes { + match pattern.match_str(field_name) { + PatternMatch::Match => { + let features = pattern.features(); + if filter(features) && can_match { + return PatternMatch::Match; + } else { + can_match = false; + } + } + PatternMatch::Parent => { + let features = pattern.features(); + if filter(features) { + selection = PatternMatch::Parent; + } + } + PatternMatch::NoMatch => (), + } + } + + selection +} diff --git a/crates/milli/src/heed_codec/facet/mod.rs b/crates/milli/src/heed_codec/facet/mod.rs index a8bb5055e..c0870c9fd 100644 --- a/crates/milli/src/heed_codec/facet/mod.rs +++ b/crates/milli/src/heed_codec/facet/mod.rs @@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![value.size]; - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v); Ok(Cow::Owned(v)) } } diff --git a/crates/milli/src/heed_codec/mod.rs b/crates/milli/src/heed_codec/mod.rs index 575b886bd..45f7f7075 100644 --- a/crates/milli/src/heed_codec/mod.rs +++ b/crates/milli/src/heed_codec/mod.rs @@ -10,6 +10,7 @@ mod roaring_bitmap_length; mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; +pub mod version; pub use byte_slice_ref::BytesRefCodec; use heed::BoxedError; diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 257d5bd0a..0ab162880 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ 
b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -27,18 +27,27 @@ impl CboRoaringBitmapCodec { } } - pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) { + pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec) { + Self::serialize_into_writer(roaring, vec).unwrap() + } + + pub fn serialize_into_writer( + roaring: &RoaringBitmap, + mut writer: W, + ) -> io::Result<()> { if roaring.len() <= THRESHOLD as u64 { // If the number of items (u32s) to encode is less than or equal to the threshold // it means that it would weigh the same or less than the RoaringBitmap // header, so we directly encode them using ByteOrder instead. for integer in roaring { - vec.write_u32::(integer).unwrap(); + writer.write_u32::(integer)?; } } else { // Otherwise, we use the classic RoaringBitmapCodec that writes a header. - roaring.serialize_into(vec).unwrap(); + roaring.serialize_into(writer)?; } + + Ok(()) } pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { @@ -143,7 +152,7 @@ impl CboRoaringBitmapCodec { return Ok(None); } - Self::serialize_into(&previous, buffer); + Self::serialize_into_vec(&previous, buffer); Ok(Some(&buffer[..])) } } @@ -169,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec); + Self::serialize_into_vec(item, &mut vec); Ok(Cow::Owned(vec)) } } diff --git a/crates/milli/src/heed_codec/version.rs b/crates/milli/src/heed_codec/version.rs new file mode 100644 index 000000000..c73b581e3 --- /dev/null +++ b/crates/milli/src/heed_codec/version.rs @@ -0,0 +1,43 @@ +use std::borrow::Cow; +use std::mem::{size_of, size_of_val}; + +use byteorder::{BigEndian, ByteOrder}; +use heed::{BoxedError, BytesDecode, BytesEncode}; + +const VERSION_SIZE: usize = std::mem::size_of::() * 3; + +#[derive(thiserror::Error, Debug)] +#[error( + "Could not decode the version: Expected {VERSION_SIZE} bytes but instead received {0} bytes" +)] +pub struct DecodeVersionError(usize); + +pub struct VersionCodec; +impl<'a> BytesEncode<'a> for VersionCodec { + type EItem = (u32, u32, u32); + + fn bytes_encode(item: &'a Self::EItem) -> Result, BoxedError> { + let mut ret = Vec::with_capacity(size_of::() * 3); + ret.extend(&item.0.to_be_bytes()); + ret.extend(&item.1.to_be_bytes()); + ret.extend(&item.2.to_be_bytes()); + Ok(Cow::Owned(ret)) + } +} +impl<'a> BytesDecode<'a> for VersionCodec { + type DItem = (u32, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Result { + if bytes.len() != VERSION_SIZE { + Err(Box::new(DecodeVersionError(bytes.len()))) + } else { + let major = BigEndian::read_u32(bytes); + let bytes = &bytes[size_of_val(&major)..]; + let minor = BigEndian::read_u32(bytes); + let bytes = &bytes[size_of_val(&major)..]; + let patch = BigEndian::read_u32(bytes); + + Ok((major, minor, patch)) + } + } +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 89f965b7c..e2b6d857b 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1,38 +1,42 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use std::convert::TryInto; use std::fs::File; use std::path::Path; -use heed::types::*; +use heed::{types::*, WithoutTls}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; +use crate::constants::{self, RESERVED_GEO_FIELD_NAME, 
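`serialize_into_writer` above keeps the codec's size-adaptive layout: small sets are written as raw integers, larger ones fall back to the regular RoaringBitmap serialization. A standalone sketch of the same idea; the `THRESHOLD` value and little-endian byte order are illustrative choices, not milli's actual constants:

```rust
use std::io::{self, Write};

use byteorder::{LittleEndian, WriteBytesExt};
use roaring::RoaringBitmap;

// Illustrative cut-off: below it, raw u32s are no larger than a roaring header.
const THRESHOLD: u64 = 7;

fn serialize_compact<W: Write>(bitmap: &RoaringBitmap, mut writer: W) -> io::Result<()> {
    if bitmap.len() <= THRESHOLD {
        // Few enough values: write them back to back as plain u32s.
        for value in bitmap {
            writer.write_u32::<LittleEndian>(value)?;
        }
        Ok(())
    } else {
        // Otherwise fall back to the regular roaring serialization.
        bitmap.serialize_into(writer)
    }
}

fn main() -> io::Result<()> {
    let small: RoaringBitmap = (0..4).collect();
    let large: RoaringBitmap = (0..1_000).collect();

    let mut small_buf = Vec::new();
    serialize_compact(&small, &mut small_buf)?;
    assert_eq!(small_buf.len(), 4 * std::mem::size_of::<u32>());

    let mut large_buf = Vec::new();
    serialize_compact(&large, &mut large_buf)?;
    assert_eq!(large_buf.len(), large.serialized_size());
    Ok(())
}
```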
RESERVED_VECTORS_FIELD_NAME}; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, }; +use crate::heed_codec::version::VersionCodec; use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; +use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, - FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec, - RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, + FieldidsWeightsMap, FilterableAttributesRule, GeoPoint, LocalizedAttributesRule, ObkvCodec, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, + BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; pub mod main_key { + pub const VERSION_KEY: &str = "version"; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; @@ -70,6 +74,9 @@ pub mod main_key { pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; pub const SEARCH_CUTOFF: &str = "search_cutoff"; pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; + pub const FACET_SEARCH: &str = "facet_search"; + pub const PREFIX_SEARCH: &str = "prefix_search"; + pub const DOCUMENTS_STATS: &str = "documents_stats"; } pub mod db_name { @@ -103,7 +110,7 @@ pub mod db_name { #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. - pub(crate) env: heed::Env, + pub(crate) env: heed::Env, /// Contains many different types (e.g. the fields ids map). 
pub(crate) main: Database, @@ -170,10 +177,11 @@ pub struct Index { impl Index { pub fn new_with_creation_dates>( - mut options: heed::EnvOpenOptions, + mut options: heed::EnvOpenOptions, path: P, created_at: time::OffsetDateTime, updated_at: time::OffsetDateTime, + creation: bool, ) -> Result { use db_name::*; @@ -221,12 +229,9 @@ impl Index { let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; - wtxn.commit()?; - Index::set_creation_dates(&env, main, created_at, updated_at)?; - - Ok(Index { - env, + let this = Index { + env: env.clone(), main, external_documents_ids, word_docids, @@ -251,16 +256,35 @@ impl Index { vector_arroy, embedder_category_id, documents, - }) + }; + if this.get_version(&wtxn)?.is_none() && creation { + this.put_version( + &mut wtxn, + ( + constants::VERSION_MAJOR.parse().unwrap(), + constants::VERSION_MINOR.parse().unwrap(), + constants::VERSION_PATCH.parse().unwrap(), + ), + )?; + } + wtxn.commit()?; + + Index::set_creation_dates(&this.env, this.main, created_at, updated_at)?; + + Ok(this) } - pub fn new>(options: heed::EnvOpenOptions, path: P) -> Result { + pub fn new>( + options: heed::EnvOpenOptions, + path: P, + creation: bool, + ) -> Result { let now = time::OffsetDateTime::now_utc(); - Self::new_with_creation_dates(options, path, now, now) + Self::new_with_creation_dates(options, path, now, now, creation) } fn set_creation_dates( - env: &heed::Env, + env: &heed::Env, main: Database, created_at: time::OffsetDateTime, updated_at: time::OffsetDateTime, @@ -282,12 +306,12 @@ impl Index { } /// Create a read transaction to be able to read the index. - pub fn read_txn(&self) -> heed::Result> { + pub fn read_txn(&self) -> heed::Result> { self.env.read_txn() } /// Create a static read transaction to be able to read the index without keeping a reference to it. - pub fn static_read_txn(&self) -> heed::Result> { + pub fn static_read_txn(&self) -> heed::Result> { self.env.clone().static_read_txn() } @@ -316,8 +340,12 @@ impl Index { self.env.info().map_size } - pub fn copy_to_file>(&self, path: P, option: CompactionOption) -> Result { - self.env.copy_to_file(path, option).map_err(Into::into) + pub fn copy_to_file(&self, file: &mut File, option: CompactionOption) -> Result<()> { + self.env.copy_to_file(file, option).map_err(Into::into) + } + + pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { + self.env.copy_to_path(path, option).map_err(Into::into) } /// Returns an `EnvClosingEvent` that can be used to wait for the closing event, @@ -329,6 +357,26 @@ impl Index { self.env.prepare_for_closing() } + /* version */ + + /// Writes the version of the database. + pub(crate) fn put_version( + &self, + wtxn: &mut RwTxn<'_>, + (major, minor, patch): (u32, u32, u32), + ) -> heed::Result<()> { + self.main.remap_types::().put( + wtxn, + main_key::VERSION_KEY, + &(major, minor, patch), + ) + } + + /// Get the version of the database. `None` if it was never set. + pub(crate) fn get_version(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + self.main.remap_types::().get(rtxn, main_key::VERSION_KEY) + } + /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. @@ -362,6 +410,58 @@ impl Index { Ok(count.unwrap_or_default()) } + /// Updates the stats of the documents database based on the previous stats and the modified docids. 
+ pub fn update_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + modified_docids: roaring::RoaringBitmap, + ) -> Result<()> { + let before_rtxn = self.read_txn()?; + let document_stats = match self.documents_stats(&before_rtxn)? { + Some(before_stats) => DatabaseStats::recompute( + before_stats, + self.documents.remap_types(), + &before_rtxn, + wtxn, + modified_docids.iter().map(|docid| docid.to_be_bytes()), + )?, + None => { + // This should never happen when there are already documents in the index, the documents stats should be present. + // If it happens, it means that the index was not properly initialized/upgraded. + debug_assert_eq!( + self.documents.len(&before_rtxn)?, + 0, + "The documents stats should be present when there are documents in the index" + ); + tracing::warn!("No documents stats found, creating new ones"); + DatabaseStats::new(self.documents.remap_types(), &*wtxn)? + } + }; + + self.put_documents_stats(wtxn, document_stats)?; + Ok(()) + } + + /// Writes the stats of the documents database. + pub fn put_documents_stats( + &self, + wtxn: &mut RwTxn<'_>, + stats: DatabaseStats, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + wtxn, + main_key::DOCUMENTS_STATS, + &stats, + ) + } + + /// Returns the stats of the documents database. + pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + self.main + .remap_types::>() + .get(rtxn, main_key::DOCUMENTS_STATS) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. @@ -418,6 +518,16 @@ impl Index { .unwrap_or_default()) } + /// Returns the fields ids map with metadata. + /// + /// This structure is not yet stored in the index, and is generated on the fly. + pub fn fields_ids_map_with_metadata(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(FieldIdMapWithMetadata::new( + self.fields_ids_map(rtxn)?, + MetadataBuilder::from_index(self, rtxn)?, + )) + } + /* fieldids weights map */ // This maps the fields ids to their weights. // Their weights is defined by the ordering of the searchable attributes. @@ -453,6 +563,17 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY) } + pub fn max_searchable_attribute_weight(&self, rtxn: &RoTxn<'_>) -> Result> { + let user_defined_searchable_fields = self.user_defined_searchable_fields(rtxn)?; + if let Some(user_defined_searchable_fields) = user_defined_searchable_fields { + if !user_defined_searchable_fields.contains(&"*") { + return Ok(Some(user_defined_searchable_fields.len().saturating_sub(1) as Weight)); + } + } + + Ok(None) + } + pub fn searchable_fields_and_weights<'a>( &self, rtxn: &'a RoTxn<'a>, @@ -643,8 +764,7 @@ impl Index { &self, wtxn: &mut RwTxn<'_>, user_fields: &[&str], - non_searchable_fields_ids: &[FieldId], - fields_ids_map: &FieldsIdsMap, + fields_ids_map: &FieldIdMapWithMetadata, ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -652,29 +772,17 @@ impl Index { let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: - // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. - // 2. Iterate over the user defined searchable fields. - // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. 
let mut real_fields = Vec::new(); - - for (id, field_from_map) in fields_ids_map.iter() { - for (weight, user_field) in user_fields.iter().enumerate() { - if crate::is_faceted_by(field_from_map, user_field) - && !real_fields.contains(&field_from_map) - && !non_searchable_fields_ids.contains(&id) - { - real_fields.push(field_from_map); - - let weight: u16 = - weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight); - } + for (id, field_from_map, metadata) in fields_ids_map.iter() { + if let Some(weight) = metadata.searchable_weight() { + real_fields.push(field_from_map); + weights.insert(id, weight); } } self.put_searchable_fields(wtxn, &real_fields)?; self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } @@ -781,26 +889,32 @@ impl Index { /* filterable fields */ - /// Writes the filterable fields names in the database. - pub(crate) fn put_filterable_fields( + /// Writes the filterable attributes rules in the database. + pub(crate) fn put_filterable_attributes_rules( &self, wtxn: &mut RwTxn<'_>, - fields: &HashSet, + fields: &[FilterableAttributesRule], ) -> heed::Result<()> { self.main.remap_types::>().put( wtxn, main_key::FILTERABLE_FIELDS_KEY, - fields, + &fields, ) } - /// Deletes the filterable fields ids in the database. - pub(crate) fn delete_filterable_fields(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + /// Deletes the filterable attributes rules in the database. + pub(crate) fn delete_filterable_attributes_rules( + &self, + wtxn: &mut RwTxn<'_>, + ) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::FILTERABLE_FIELDS_KEY) } - /// Returns the filterable fields names. - pub fn filterable_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + /// Returns the filterable attributes rules. + pub fn filterable_attributes_rules( + &self, + rtxn: &RoTxn<'_>, + ) -> heed::Result> { Ok(self .main .remap_types::>() @@ -808,21 +922,6 @@ impl Index { .unwrap_or_default()) } - /// Identical to `filterable_fields`, but returns ids instead. - pub fn filterable_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.filterable_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) - } - /* sortable fields */ /// Writes the sortable fields names in the database. @@ -859,83 +958,37 @@ impl Index { Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) } - /* faceted fields */ - - /// Writes the faceted fields in the database. - pub(crate) fn put_faceted_fields( - &self, - wtxn: &mut RwTxn<'_>, - fields: &HashSet, - ) -> heed::Result<()> { - self.main.remap_types::>().put( - wtxn, - main_key::HIDDEN_FACETED_FIELDS_KEY, - fields, - ) + /// Returns true if the geo feature is enabled. + pub fn is_geo_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = self.is_geo_filtering_enabled(rtxn)?; + let geo_sortable = self.is_geo_sorting_enabled(rtxn)?; + Ok(geo_filter || geo_sortable) } - /// Returns the faceted fields names. - pub fn faceted_fields(&self, rtxn: &RoTxn<'_>) -> heed::Result> { - Ok(self - .main - .remap_types::>() - .get(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? - .unwrap_or_default()) + /// Returns true if the geo sorting feature is enabled. 
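With the rewrite above, a searchable field's weight is simply the position of the user-defined searchable attribute it is faceted by, and an absent (or `*`) list makes every field searchable with weight 0. A self-contained sketch of that assignment; the helpers mirror the logic shown in `metadata_for_field` but are illustrative only:

```rust
// Returns true when `field` is `facet` itself or a dotted sub-field of it,
// e.g. `doggo.name` is faceted by `doggo`.
fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.')
}

// Weight of a field: the index of the first user-defined searchable attribute
// it is faceted by, or None when the field is not searchable at all.
fn searchable_weight(field: &str, user_defined: Option<&[&str]>) -> Option<u16> {
    match user_defined {
        Some(attributes) => attributes
            .iter()
            .enumerate()
            .find(|(_, pattern)| is_faceted_by(field, pattern))
            .map(|(i, _)| i as u16),
        // No user-defined list (or `*`): everything is searchable with weight 0.
        None => Some(0),
    }
}

fn main() {
    let user_defined = Some(&["title", "overview"][..]);
    assert_eq!(searchable_weight("title", user_defined), Some(0));
    assert_eq!(searchable_weight("overview.fr", user_defined), Some(1));
    assert_eq!(searchable_weight("release_date", user_defined), None);
    assert_eq!(searchable_weight("anything", None), Some(0));
}
```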
+ pub fn is_geo_sorting_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_sortable = self.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); + Ok(geo_sortable) } - /// Identical to `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) + /// Returns true if the geo filtering feature is enabled. + pub fn is_geo_filtering_enabled(&self, rtxn: &RoTxn<'_>) -> Result { + let geo_filter = + self.filterable_attributes_rules(rtxn)?.iter().any(|field| field.has_geo()); + Ok(geo_filter) } - /* faceted documents ids */ - - /// Returns the user defined faceted fields names. - /// - /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. - pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn<'_>) -> Result> { - let filterable_fields = self.filterable_fields(rtxn)?; - let sortable_fields = self.sortable_fields(rtxn)?; - let distinct_field = self.distinct_field(rtxn)?; - let asc_desc_fields = - self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { + pub fn asc_desc_fields(&self, rtxn: &RoTxn<'_>) -> Result> { + let asc_desc_fields = self + .criteria(rtxn)? + .into_iter() + .filter_map(|criterion| match criterion { Criterion::Asc(field) | Criterion::Desc(field) => Some(field), _otherwise => None, - }); + }) + .collect(); - let mut faceted_fields = filterable_fields; - faceted_fields.extend(sortable_fields); - faceted_fields.extend(asc_desc_fields); - if let Some(field) = distinct_field { - faceted_fields.insert(field.to_owned()); - } - - Ok(faceted_fields) - } - - /// Identical to `user_defined_faceted_fields`, but returns ids instead. - pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn<'_>) -> Result> { - let fields = self.user_defined_faceted_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - - let mut fields_ids = HashSet::new(); - for name in fields { - if let Some(field_id) = fields_ids_map.id(&name) { - fields_ids.insert(field_id); - } - } - - Ok(fields_ids) + Ok(asc_desc_fields) } /* faceted documents ids */ @@ -1233,6 +1286,10 @@ impl Index { ) } + pub(crate) fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY) + } + /// Returns the FST which is the words prefixes dictionary of the engine. pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn<'t>) -> Result>> { match self.main.remap_types::().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? 
{ @@ -1562,6 +1619,41 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::PROXIMITY_PRECISION) } + pub fn prefix_search(&self, txn: &RoTxn<'_>) -> heed::Result> { + self.main.remap_types::>().get(txn, main_key::PREFIX_SEARCH) + } + + pub(crate) fn put_prefix_search( + &self, + txn: &mut RwTxn<'_>, + val: PrefixSearch, + ) -> heed::Result<()> { + self.main.remap_types::>().put( + txn, + main_key::PREFIX_SEARCH, + &val, + ) + } + + pub(crate) fn delete_prefix_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::PREFIX_SEARCH) + } + + pub fn facet_search(&self, txn: &RoTxn<'_>) -> heed::Result { + self.main + .remap_types::>() + .get(txn, main_key::FACET_SEARCH) + .map(|v| v.unwrap_or(true)) + } + + pub(crate) fn put_facet_search(&self, txn: &mut RwTxn<'_>, val: bool) -> heed::Result<()> { + self.main.remap_types::>().put(txn, main_key::FACET_SEARCH, &val) + } + + pub(crate) fn delete_facet_search(&self, txn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(txn, main_key::FACET_SEARCH) + } + pub fn localized_attributes_rules( &self, rtxn: &RoTxn<'_>, @@ -1647,12 +1739,21 @@ impl Index { Ok(res) } - pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result { - Ok(PrefixSettings { - compute_prefixes: true, - max_prefix_length: 4, - prefix_count_threshold: 100, - }) + pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result { + let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); + Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) + } + + pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { + let mut stats = ArroyStats::default(); + let embedding_configs = self.embedding_configs(rtxn)?; + for config in embedding_configs { + let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let reader = + ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + reader.aggregate_stats(rtxn, &mut stats)?; + } + Ok(stats) } } @@ -1665,9 +1766,17 @@ pub struct IndexEmbeddingConfig { #[derive(Debug, Deserialize, Serialize)] pub struct PrefixSettings { - pub prefix_count_threshold: u64, + pub prefix_count_threshold: usize, pub max_prefix_length: usize, - pub compute_prefixes: bool, + pub compute_prefixes: PrefixSearch, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub enum PrefixSearch { + #[default] + IndexingTime, + Disabled, } #[derive(Serialize, Deserialize)] @@ -1682,12 +1791,14 @@ pub(crate) mod tests { use big_s::S; use bumpalo::Bump; use heed::{EnvOpenOptions, RwTxn}; - use maplit::{btreemap, hashset}; + use maplit::btreemap; use memmap2::Mmap; use tempfile::TempDir; + use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + use crate::progress::Progress; use crate::update::new::indexer; use crate::update::settings::InnerIndexSettings; use crate::update::{ @@ -1696,7 +1807,8 @@ pub(crate) mod tests { use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::vector::EmbeddingConfigs; use crate::{ - db_snap, obkv_to_json, Filter, Index, Search, SearchResult, ThreadPoolNoAbortBuilder, + db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult, + ThreadPoolNoAbortBuilder, }; pub(crate) struct TempIndex { @@ -1717,10 +1829,11 @@ pub(crate) mod tests { impl TempIndex { /// 
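The new `prefix_search` and `facet_search` settings both fall back to permissive defaults when they were never written: prefixes keep being computed at indexing time and facet search stays enabled. A tiny standalone sketch of that defaulting, using a local copy of the enum for illustration:

```rust
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
enum PrefixSearch {
    #[default]
    IndexingTime,
    Disabled,
}

// Mirrors `self.prefix_search(rtxn)?.unwrap_or_default()` in `prefix_settings`.
fn compute_prefixes(stored: Option<PrefixSearch>) -> PrefixSearch {
    stored.unwrap_or_default()
}

// Mirrors the `.map(|v| v.unwrap_or(true))` read in `facet_search`.
fn facet_search(stored: Option<bool>) -> bool {
    stored.unwrap_or(true)
}

fn main() {
    assert_eq!(compute_prefixes(None), PrefixSearch::IndexingTime);
    assert_eq!(compute_prefixes(Some(PrefixSearch::Disabled)), PrefixSearch::Disabled);
    assert!(facet_search(None));
    assert!(!facet_search(Some(false)));
}
```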
Creates a temporary index pub fn new_with_map_size(size: usize) -> Self { - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(size); let _tempdir = TempDir::new_in(".").unwrap(); - let inner = Index::new(options, _tempdir.path()).unwrap(); + let inner = Index::new(options, _tempdir.path(), true).unwrap(); let indexer_config = IndexerConfig::default(); let index_documents_config = IndexDocumentsConfig::default(); Self { inner, indexer_config, index_documents_config, _tempdir } @@ -1752,9 +1865,15 @@ pub(crate) mod tests { let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; - let mut indexer = - indexer::DocumentOperation::new(self.index_documents_config.update_method); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + match self.index_documents_config.update_method { + IndexDocumentsMethod::ReplaceDocuments => { + indexer.replace_documents(&documents).unwrap() + } + IndexDocumentsMethod::UpdateDocuments => { + indexer.update_documents(&documents).unwrap() + } + } let indexer_alloc = Bump::new(); let (document_changes, operation_stats, primary_key) = indexer.into_changes( @@ -1764,7 +1883,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1775,6 +1894,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1782,7 +1902,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1840,8 +1960,7 @@ pub(crate) mod tests { let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; - let mut indexer = - indexer::DocumentOperation::new(self.index_documents_config.update_method); + let mut indexer = indexer::DocumentOperation::new(); let external_document_ids: Vec<_> = external_document_ids.iter().map(AsRef::as_ref).collect(); indexer.delete_documents(external_document_ids.as_slice()); @@ -1854,7 +1973,7 @@ pub(crate) mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), )?; if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { @@ -1865,6 +1984,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1872,7 +1992,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) }) .unwrap()?; @@ -1917,13 +2037,13 @@ pub(crate) mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let payload = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 }, ]); - indexer.add_documents(&payload).unwrap(); + indexer.replace_documents(&payload).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -1934,7 +2054,7 @@ pub(crate) mod tests { None, &mut 
new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1945,6 +2065,7 @@ pub(crate) mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1952,7 +2073,7 @@ pub(crate) mod tests { &document_changes, embedders, &|| should_abort.load(Relaxed), - &|_| (), + &Progress::default(), ) }) .unwrap() @@ -2095,7 +2216,7 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let real = index.searchable_fields(&rtxn).unwrap(); - assert_eq!(real, &["doggo", "name"]); + assert!(real.is_empty()); let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); @@ -2123,16 +2244,18 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("_geo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": "0", "lng": "0" } }, - { "id": 1, "_geo": { "lat": 0, "lng": "-175" } }, - { "id": 2, "_geo": { "lat": "0", "lng": 175 } }, - { "id": 3, "_geo": { "lat": 85, "lng": 0 } }, - { "id": 4, "_geo": { "lat": "-85", "lng": "0" } }, + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": "0", "lng": "0" } }, + { "id": 1, RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": "-175" } }, + { "id": 2, RESERVED_GEO_FIELD_NAME: { "lat": "0", "lng": 175 } }, + { "id": 3, RESERVED_GEO_FIELD_NAME: { "lat": 85, "lng": 0 } }, + { "id": 4, RESERVED_GEO_FIELD_NAME: { "lat": "-85", "lng": "0" } }, ])) .unwrap(); @@ -2231,7 +2354,9 @@ pub(crate) mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "doggo".to_string(), + )]); }) .unwrap(); index @@ -2268,15 +2393,14 @@ pub(crate) mod tests { #[test] fn replace_documents_external_ids_and_soft_deletion_check() { - use big_s::S; - use maplit::hashset; - let index = TempIndex::new(); index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("doggo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "doggo".to_string(), + )]); }) .unwrap(); @@ -2809,25 +2933,36 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings.set_filterable_fields(HashSet::from(["_geo".to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); // happy path - index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap(); + index + .add_documents( + documents!({ "id" : 5, RESERVED_GEO_FIELD_NAME: {"lat": 12.0, "lng": 11.0}}), + ) + .unwrap(); db_snap!(index, geo_faceted_documents_ids); // both are unparseable, we expect GeoError::BadLatitudeAndLongitude let err1 = index .add_documents( - documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}), + documents!({ "id" : 6, RESERVED_GEO_FIELD_NAME: {"lat": "unparseable", "lng": "unparseable"}}), ) .unwrap_err(); - assert!(matches!( - err1, - Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. 
})) - )); + match err1 { + Error::UserError(UserError::InvalidGeoField(err)) => match *err { + GeoError::BadLatitudeAndLongitude { .. } => (), + otherwise => { + panic!("err1 is not a BadLatitudeAndLongitude error but rather a {otherwise:?}") + } + }, + _ => panic!("err1 is not a BadLatitudeAndLongitude error but rather a {err1:?}"), + } db_snap!(index, geo_faceted_documents_ids); // ensure that no more document was inserted } @@ -2839,13 +2974,15 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_string()); - settings.set_filterable_fields(HashSet::from(["_geo".to_string()])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); let err = index .add_documents( - documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}), + documents!({ "id" : "doggo", RESERVED_GEO_FIELD_NAME: { "lat": 1, "lng": 2, "doggo": "are the best" }}), ) .unwrap_err(); insta::assert_snapshot!(err, @r###"The `_geo` field in the document with the id: `"doggo"` contains the following unexpected fields: `{"doggo":"are the best"}`."###); @@ -2855,7 +2992,7 @@ pub(crate) mod tests { // multiple fields and complex values let err = index .add_documents( - documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }), + documents!({ "id" : "doggo", RESERVED_GEO_FIELD_NAME: { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }), ) .unwrap_err(); insta::assert_snapshot!(err, @r###"The `_geo` field in the document with the id: `"doggo"` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###); @@ -2872,7 +3009,9 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "age".to_string(), + )]); }) .unwrap(); @@ -2880,35 +3019,37 @@ pub(crate) mod tests { .add_documents(documents!({ "id": 1, "name": "Many", "age": 28, "realName": "Maxime" })) .unwrap(); db_snap!(index, fields_ids_map, @r###" - 0 name | - 1 id | + 0 id | + 1 name | 2 age | 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | + 1 0 | "###); index .update_settings(|settings| { settings.set_searchable_fields(vec![S("name"), S("realName")]); - settings.set_filterable_fields(HashSet::from([S("age")])); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "age".to_string(), + )]); }) .unwrap(); // The order of the field id map shouldn't change db_snap!(index, fields_ids_map, @r###" - 0 name | - 1 id | + 0 id | + 1 name | 2 age | 3 realName | "###); db_snap!(index, searchable_fields, @r###"["name", "realName"]"###); db_snap!(index, fieldids_weights_map, @r###" fid weight - 0 0 | + 1 0 | 3 1 | "###); } @@ -2993,14 +3134,16 @@ pub(crate) mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); - settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("_vectors".to_string()), + FilterableAttributesRule::Field("_vectors.doggo".to_string()), + ]); }) .unwrap(); db_snap!(index, fields_ids_map, 
@r###" 0 id | 1 _vectors | - 2 _vectors.doggo | "###); db_snap!(index, searchable_fields, @"[]"); db_snap!(index, fieldids_weights_map, @r###" @@ -3033,7 +3176,6 @@ pub(crate) mod tests { db_snap!(index, fields_ids_map, @r###" 0 id | 1 _vectors | - 2 _vectors.doggo | "###); db_snap!(index, searchable_fields, @"[]"); db_snap!(index, fieldids_weights_map, @r###" diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 48b03b6cc..516e6d31b 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -1,6 +1,6 @@ -#![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::type_complexity)] +#[cfg(not(windows))] #[cfg(test)] #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -9,11 +9,14 @@ pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; pub mod documents; mod asc_desc; +mod attribute_patterns; mod criterion; +pub mod database_stats; mod error; mod external_documents_ids; pub mod facet; mod fields_ids_map; +mod filterable_attributes_rules; pub mod heed_codec; pub mod index; mod localized_attributes_rules; @@ -29,7 +32,9 @@ pub mod vector; #[cfg(test)] #[macro_use] pub mod snapshot_tests; +pub mod constants; mod fieldids_weights_map; +pub mod progress; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; @@ -49,6 +54,8 @@ pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbor pub use {charabia as tokenizer, heed, rhai}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; +pub use self::attribute_patterns::AttributePatterns; +pub use self::attribute_patterns::PatternMatch; pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, @@ -56,6 +63,10 @@ pub use self::error::{ pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fieldids_weights_map::FieldidsWeightsMap; pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap}; +pub use self::filterable_attributes_rules::{ + FilterFeatures, FilterableAttributesFeatures, FilterableAttributesPatterns, + FilterableAttributesRule, +}; pub use self::heed_codec::{ BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, @@ -64,13 +75,15 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::localized_attributes_rules::LocalizedAttributesRule; -use self::localized_attributes_rules::LocalizedFieldIds; pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; pub use self::search::similar::Similar; pub use self::search::{ FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy, Search, SearchResult, SemanticSearch, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; +pub use self::update::ChannelCongestion; + +pub use arroy; pub type Result = std::result::Result; @@ -191,7 +204,7 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative // Compute the absolute word position with the field id of the attribute and relative position in the attribute. pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { - (field_id as u32) << 16 | (relative as u32) + ((field_id as u32) << 16) | (relative as u32) } // TODO: this is wrong, but will do for now /// Compute the "bucketed" absolute position from the field id and relative position in the field. 
@@ -359,7 +372,7 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator bool { - field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.') + field.starts_with(facet) && field[facet.len()..].chars().next().is_none_or(|c| c == '.') } pub fn normalize_facet(original: &str) -> String { diff --git a/crates/milli/src/localized_attributes_rules.rs b/crates/milli/src/localized_attributes_rules.rs index 3c421ca6b..81015c458 100644 --- a/crates/milli/src/localized_attributes_rules.rs +++ b/crates/milli/src/localized_attributes_rules.rs @@ -2,9 +2,11 @@ use std::collections::HashMap; use charabia::Language; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; +use crate::attribute_patterns::PatternMatch; use crate::fields_ids_map::FieldsIdsMap; -use crate::FieldId; +use crate::{AttributePatterns, FieldId}; /// A rule that defines which locales are supported for a given attribute. /// @@ -14,19 +16,20 @@ use crate::FieldId; /// The pattern `attribute_name*` matches any attribute name that starts with `attribute_name`. /// The pattern `*attribute_name` matches any attribute name that ends with `attribute_name`. /// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] pub struct LocalizedAttributesRule { - pub attribute_patterns: Vec, + pub attribute_patterns: AttributePatterns, + #[schema(value_type = Vec)] pub locales: Vec, } impl LocalizedAttributesRule { pub fn new(attribute_patterns: Vec, locales: Vec) -> Self { - Self { attribute_patterns, locales } + Self { attribute_patterns: AttributePatterns::from(attribute_patterns), locales } } - pub fn match_str(&self, str: &str) -> bool { - self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str)) + pub fn match_str(&self, str: &str) -> PatternMatch { + self.attribute_patterns.match_str(str) } pub fn locales(&self) -> &[Language] { @@ -34,20 +37,6 @@ impl LocalizedAttributesRule { } } -fn match_pattern(pattern: &str, str: &str) -> bool { - if pattern == "*" { - true - } else if pattern.starts_with('*') && pattern.ends_with('*') { - str.contains(&pattern[1..pattern.len() - 1]) - } else if let Some(pattern) = pattern.strip_prefix('*') { - str.ends_with(pattern) - } else if let Some(pattern) = pattern.strip_suffix('*') { - str.starts_with(pattern) - } else { - pattern == str - } -} - #[derive(Debug, Clone, PartialEq, Eq)] pub struct LocalizedFieldIds { field_id_to_locales: HashMap>, @@ -63,13 +52,13 @@ impl LocalizedFieldIds { if let Some(rules) = rules { let fields = fields_ids.filter_map(|field_id| { - fields_ids_map.name(field_id).map(|field_name| (field_id, field_name)) + fields_ids_map.name(field_id).map(|field_name: &str| (field_id, field_name)) }); for (field_id, field_name) in fields { let mut locales = Vec::new(); for rule in rules { - if rule.match_str(field_name) { + if rule.match_str(field_name) == PatternMatch::Match { locales.extend(rule.locales.iter()); // Take the first rule that matches break; @@ -87,10 +76,6 @@ impl LocalizedFieldIds { Self { field_id_to_locales } } - pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> { - self.field_id_to_locales.get(&fields_id).map(Vec::as_slice) - } - pub fn all_locales(&self) -> Vec { let mut locales = Vec::new(); for field_locales in self.field_id_to_locales.values() { @@ -106,24 +91,3 @@ impl LocalizedFieldIds { locales } } - -#[cfg(test)] 
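For reference, since the `match_pattern` helper and its unit tests are removed in this diff (the logic now sits behind `AttributePatterns::match_str`, which returns a `PatternMatch` rather than a bool), here is a self-contained sketch of the wildcard semantics documented on `LocalizedAttributesRule`; it mirrors the removed function and is not the milli API itself:

```rust
/// Wildcard matching as described on `LocalizedAttributesRule`:
/// `*` matches everything, `prefix*` matches a prefix, `*suffix` matches
/// a suffix, and `*infix*` matches any field name containing `infix`.
fn match_pattern(pattern: &str, field: &str) -> bool {
    if pattern == "*" {
        true
    } else if pattern.starts_with('*') && pattern.ends_with('*') {
        field.contains(&pattern[1..pattern.len() - 1])
    } else if let Some(suffix) = pattern.strip_prefix('*') {
        field.ends_with(suffix)
    } else if let Some(prefix) = pattern.strip_suffix('*') {
        field.starts_with(prefix)
    } else {
        pattern == field
    }
}

fn main() {
    assert!(match_pattern("*", "name"));
    assert!(match_pattern("test*", "testa"));
    assert!(match_pattern("*test", "atest"));
    assert!(match_pattern("*test*", "atesta"));
    assert!(!match_pattern("test*", "atest"));
    assert!(!match_pattern("*test", "testa"));
}
```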
-mod tests { - use super::*; - - #[test] - fn test_match_pattern() { - assert!(match_pattern("*", "test")); - assert!(match_pattern("test*", "test")); - assert!(match_pattern("test*", "testa")); - assert!(match_pattern("*test", "test")); - assert!(match_pattern("*test", "atest")); - assert!(match_pattern("*test*", "test")); - assert!(match_pattern("*test*", "atesta")); - assert!(match_pattern("*test*", "atest")); - assert!(match_pattern("*test*", "testa")); - assert!(!match_pattern("test*test", "test")); - assert!(!match_pattern("*test", "testa")); - assert!(!match_pattern("test*", "atest")); - } -} diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs new file mode 100644 index 000000000..7eb0cbd6b --- /dev/null +++ b/crates/milli/src/progress.rs @@ -0,0 +1,290 @@ +use enum_iterator::Sequence; +use std::any::TypeId; +use std::borrow::Cow; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; +use std::time::{Duration, Instant}; + +use indexmap::IndexMap; +use itertools::Itertools; +use serde::Serialize; +use utoipa::ToSchema; + +pub trait Step: 'static + Send + Sync { + fn name(&self) -> Cow<'static, str>; + fn current(&self) -> u32; + fn total(&self) -> u32; +} + +#[derive(Clone, Default)] +pub struct Progress { + steps: Arc>, +} + +#[derive(Default)] +struct InnerProgress { + /// The hierarchy of steps. + steps: Vec<(TypeId, Box, Instant)>, + /// The durations associated to each steps. + durations: Vec<(String, Duration)>, +} + +impl Progress { + pub fn update_progress(&self, sub_progress: P) { + let mut inner = self.steps.write().unwrap(); + let InnerProgress { steps, durations } = &mut *inner; + + let now = Instant::now(); + let step_type = TypeId::of::
<P>
(); + if let Some(idx) = steps.iter().position(|(id, _, _)| *id == step_type) { + push_steps_durations(steps, durations, now, idx); + steps.truncate(idx); + } + + steps.push((step_type, Box::new(sub_progress), now)); + } + + // TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types + pub fn as_progress_view(&self) -> ProgressView { + let inner = self.steps.read().unwrap(); + let InnerProgress { steps, .. } = &*inner; + + let mut percentage = 0.0; + let mut prev_factors = 1.0; + + let mut step_view = Vec::with_capacity(steps.len()); + for (_, step, _) in steps.iter() { + prev_factors *= step.total() as f32; + percentage += step.current() as f32 / prev_factors; + + step_view.push(ProgressStepView { + current_step: step.name(), + finished: step.current(), + total: step.total(), + }); + } + + ProgressView { steps: step_view, percentage: percentage * 100.0 } + } + + pub fn accumulated_durations(&self) -> IndexMap { + let mut inner = self.steps.write().unwrap(); + let InnerProgress { steps, durations, .. } = &mut *inner; + + let now = Instant::now(); + push_steps_durations(steps, durations, now, 0); + + durations.drain(..).map(|(name, duration)| (name, format!("{duration:.2?}"))).collect() + } + + // TODO: ideally we should expose the progress in a way that let arroy use it directly + pub(crate) fn update_progress_from_arroy(&self, progress: arroy::WriterProgress) { + self.update_progress(progress.main); + if let Some(sub) = progress.sub { + self.update_progress(sub); + } + } +} + +/// Generate the names associated with the durations and push them. +fn push_steps_durations( + steps: &[(TypeId, Box, Instant)], + durations: &mut Vec<(String, Duration)>, + now: Instant, + idx: usize, +) { + for (i, (_, _, started_at)) in steps.iter().skip(idx).enumerate().rev() { + let full_name = steps.iter().take(idx + i + 1).map(|(_, s, _)| s.name()).join(" > "); + durations.push((full_name, now.duration_since(*started_at))); + } +} + +/// This trait lets you use the AtomicSubStep defined right below. +/// The name must be a const that never changed but that can't be enforced by the type system because it make the trait non object-safe. +/// By forcing the Default trait + the &'static str we make it harder to miss-use the trait. +pub trait NamedStep: 'static + Send + Sync + Default { + fn name(&self) -> &'static str; +} + +/// Structure to quickly define steps that need very quick, lockless updating of their current step. +/// You can use this struct if: +/// - The name of the step doesn't change +/// - The total number of steps doesn't change +pub struct AtomicSubStep { + unit_name: Name, + current: Arc, + total: u32, +} + +impl AtomicSubStep { + pub fn new(total: u32) -> (Arc, Self) { + let current = Arc::new(AtomicU32::new(0)); + (current.clone(), Self { current, total, unit_name: Name::default() }) + } +} + +impl Step for AtomicSubStep { + fn name(&self) -> Cow<'static, str> { + self.unit_name.name().into() + } + + fn current(&self) -> u32 { + self.current.load(Ordering::Relaxed) + } + + fn total(&self) -> u32 { + self.total + } +} + +#[doc(hidden)] +pub use convert_case as _private_convert_case; +#[doc(hidden)] +pub use enum_iterator as _private_enum_iterator; + +#[macro_export] +macro_rules! 
make_enum_progress { + ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { + #[repr(u8)] + #[derive(Debug, Clone, Copy, PartialEq, Eq, $crate::progress::_private_enum_iterator::Sequence)] + #[allow(clippy::enum_variant_names)] + $visibility enum $name { + $($variant),+ + } + + impl $crate::progress::Step for $name { + fn name(&self) -> std::borrow::Cow<'static, str> { + use $crate::progress::_private_convert_case::Casing; + + match self { + $( + $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() + ),+ + } + } + + fn current(&self) -> u32 { + *self as u32 + } + + fn total(&self) -> u32 { + use $crate::progress::_private_enum_iterator::Sequence; + Self::CARDINALITY as u32 + } + } + }; +} + +#[macro_export] +macro_rules! make_atomic_progress { + ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { + #[derive(Default, Debug, Clone, Copy)] + pub struct $struct_name {} + impl NamedStep for $struct_name { + fn name(&self) -> &'static str { + $step_name + } + } + pub type $atomic_struct_name = AtomicSubStep<$struct_name>; + }; +} + +make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); + +#[derive(Debug, Serialize, Clone, ToSchema)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct ProgressView { + pub steps: Vec, + pub percentage: f32, +} + +#[derive(Debug, Serialize, Clone, ToSchema)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct ProgressStepView { + pub current_step: Cow<'static, str>, + pub finished: u32, + pub total: u32, +} + +/// Used when the name can change but it's still the same step. 
+/// To avoid conflicts on the `TypeId`, create a unique type every time you use this step: +/// ```text +/// enum UpgradeVersion {} +/// +/// progress.update_progress(VariableNameStep::::new( +/// "v1 to v2", +/// 0, +/// 10, +/// )); +/// ``` +pub struct VariableNameStep { + name: String, + current: u32, + total: u32, + phantom: PhantomData, +} + +impl VariableNameStep { + pub fn new(name: impl Into, current: u32, total: u32) -> Self { + Self { name: name.into(), current, total, phantom: PhantomData } + } +} + +impl Step for VariableNameStep { + fn name(&self) -> Cow<'static, str> { + self.name.clone().into() + } + + fn current(&self) -> u32 { + self.current + } + + fn total(&self) -> u32 { + self.total + } +} + +impl Step for arroy::MainStep { + fn name(&self) -> Cow<'static, str> { + match self { + arroy::MainStep::PreProcessingTheItems => "pre processing the items", + arroy::MainStep::WritingTheDescendantsAndMetadata => { + "writing the descendants and metadata" + } + arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items", + arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes", + arroy::MainStep::UpdatingTheTrees => "updating the trees", + arroy::MainStep::CreateNewTrees => "create new trees", + arroy::MainStep::WritingNodesToDatabase => "writing nodes to database", + arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees", + arroy::MainStep::WriteTheMetadata => "write the metadata", + } + .into() + } + + fn current(&self) -> u32 { + *self as u32 + } + + fn total(&self) -> u32 { + Self::CARDINALITY as u32 + } +} + +impl Step for arroy::SubStep { + fn name(&self) -> Cow<'static, str> { + self.unit.into() + } + + fn current(&self) -> u32 { + self.current.load(Ordering::Relaxed) + } + + fn total(&self) -> u32 { + self.max + } +} diff --git a/crates/milli/src/prompt/context.rs b/crates/milli/src/prompt/context.rs index 02258d067..84523333a 100644 --- a/crates/milli/src/prompt/context.rs +++ b/crates/milli/src/prompt/context.rs @@ -15,7 +15,7 @@ impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> { } } -impl<'a, D: ObjectView, F: ArrayView> ObjectView for Context<'a, D, F> { +impl ObjectView for Context<'_, D, F> { fn as_value(&self) -> &dyn ValueView { self } @@ -52,7 +52,7 @@ impl<'a, D: ObjectView, F: ArrayView> ObjectView for Context<'a, D, F> { } } -impl<'a, D: ObjectView, F: ArrayView> ValueView for Context<'a, D, F> { +impl ValueView for Context<'_, D, F> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index dea7946da..b00c4cb42 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -3,12 +3,13 @@ use std::collections::BTreeMap; use std::fmt::{self, Debug}; use bumpalo::Bump; +use bumparaw_collections::{RawMap, RawVec, Value}; use liquid::model::{ ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, Value as LiquidValue, }; use liquid::{ObjectView, ValueView}; -use raw_collections::{RawMap, RawVec}; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; @@ -66,7 +67,7 @@ impl<'a> Document<'a> { } } -impl<'a> ObjectView for Document<'a> { +impl ObjectView for Document<'_> { fn as_value(&self) -> &dyn ValueView { self } @@ -97,7 +98,7 @@ impl<'a> ObjectView for Document<'a> { } } -impl<'a> ValueView for Document<'a> { +impl ValueView for Document<'_> { fn as_debug(&self) -> &dyn Debug { 
self } @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { - fn as_debug(&self) -> &dyn fmt::Debug { + fn as_debug(&self) -> &dyn Debug { self } fn render(&self) -> liquid::model::DisplayCow<'_> { @@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, } } -#[derive(Debug)] struct ParseableValue<'doc> { - value: raw_collections::Value<'doc>, + value: Value<'doc, FxBuildHasher>, } impl<'doc> ParseableValue<'doc> { pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { - let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); + let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap(); Self { value } } @@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> { } // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] -struct ParseableMap<'doc>(RawMap<'doc>); +#[repr(transparent)] +struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>); // transparent newtype for implementing ValueView -#[repr(transparent)] #[derive(Debug)] +#[repr(transparent)] struct ParseableArray<'doc>(RawVec<'doc>); impl<'doc> ParseableMap<'doc> { - pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { + pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> { // SAFETY: repr(transparent) - unsafe { &*(map as *const RawMap as *const Self) } + unsafe { &*(map as *const RawMap as *const Self) } } } @@ -283,7 +283,7 @@ impl<'doc> ParseableArray<'doc> { } } -impl<'doc> ArrayView for ParseableArray<'doc> { +impl ArrayView for ParseableArray<'_> { fn as_value(&self) -> &dyn ValueView { self } @@ -311,7 +311,7 @@ impl<'doc> ArrayView for ParseableArray<'doc> { } } -impl<'doc> ValueView for ParseableArray<'doc> { +impl ValueView for ParseableArray<'_> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -353,7 +353,7 @@ impl<'doc> ValueView for ParseableArray<'doc> { } } -impl<'doc> ObjectView for ParseableMap<'doc> { +impl ObjectView for ParseableMap<'_> { fn as_value(&self) -> &dyn ValueView { self } @@ -392,7 +392,7 @@ impl<'doc> ObjectView for ParseableMap<'doc> { } } -impl<'doc> ValueView for ParseableMap<'doc> { +impl ValueView for ParseableMap<'_> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -441,14 +441,15 @@ impl<'doc> ValueView for ParseableMap<'doc> { } } -impl<'doc> ValueView for ParseableValue<'doc> { +impl ValueView for ParseableValue<'_> { fn as_debug(&self) -> &dyn Debug { self } fn render(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.render(), Value::Bool(v) => v.render(), @@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn source(&self) -> DisplayCow<'_> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil.source(), Value::Bool(v) => ValueView::source(v), @@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn type_name(&self) -> &'static str { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { 
Value::Null => LiquidValue::Nil.type_name(), Value::Bool(v) => v.type_name(), @@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn query_state(&self, state: State) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::query_state(&LiquidValue::Nil, state), Value::Bool(v) => ValueView::query_state(v, state), @@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_kstr(&self) -> KStringCow<'_> { - use raw_collections::Value; + use bumparaw_collections::Value; + match &self.value { Value::Null => ValueView::to_kstr(&LiquidValue::Nil), Value::Bool(v) => ValueView::to_kstr(v), @@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn to_value(&self) -> LiquidValue { - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Null => LiquidValue::Nil, Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { - raw_collections::value::Number::PosInt(number) => { + Number::PosInt(number) => { let number: i64 = match (*number).try_into() { Ok(number) => number, Err(_) => { @@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { }; LiquidValue::Scalar(ScalarCow::new(number)) } - raw_collections::value::Number::NegInt(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } - raw_collections::value::Number::Finite(number) => { - LiquidValue::Scalar(ScalarCow::new(*number)) - } + Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)), + Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)), }, Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), @@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn as_scalar(&self) -> Option> { - use raw_collections::value::Number; - use raw_collections::Value; + use bumparaw_collections::value::Number; + use bumparaw_collections::Value; + match &self.value { Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), Value::Number(number) => match number { @@ -576,34 +580,41 @@ impl<'doc> ValueView for ParseableValue<'doc> { } fn is_scalar(&self) -> bool { - use raw_collections::Value; + use bumparaw_collections::Value; + matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) } fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { - if let raw_collections::Value::Array(array) = &self.value { + if let Value::Array(array) = &self.value { return Some(ParseableArray::as_parseable(array) as _); } None } fn is_array(&self) -> bool { - matches!(&self.value, raw_collections::Value::Array(_)) + matches!(&self.value, bumparaw_collections::Value::Array(_)) } fn as_object(&self) -> Option<&dyn ObjectView> { - if let raw_collections::Value::Object(object) = &self.value { + if let Value::Object(object) = &self.value { return Some(ParseableMap::as_parseable(object) as _); } None } fn is_object(&self) -> bool { - matches!(&self.value, raw_collections::Value::Object(_)) + matches!(&self.value, bumparaw_collections::Value::Object(_)) } fn is_nil(&self) -> bool { - matches!(&self.value, raw_collections::Value::Null) + matches!(&self.value, bumparaw_collections::Value::Null) + } +} + +impl Debug for ParseableValue<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + 
f.debug_struct("ParseableValue").field("value", &self.value).finish() } } @@ -611,7 +622,7 @@ struct ArraySource<'s, 'doc> { s: &'s RawVec<'doc>, } -impl<'s, 'doc> fmt::Display for ArraySource<'s, 'doc> { +impl fmt::Display for ArraySource<'_, '_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[")?; for item in self.s { @@ -627,7 +638,7 @@ struct ArrayRender<'s, 'doc> { s: &'s RawVec<'doc>, } -impl<'s, 'doc> fmt::Display for ArrayRender<'s, 'doc> { +impl fmt::Display for ArrayRender<'_, '_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for item in self.s { let v = ParseableValue::new(item, self.s.bump()); diff --git a/crates/milli/src/prompt/error.rs b/crates/milli/src/prompt/error.rs index 8a762b60a..a92e2fdc3 100644 --- a/crates/milli/src/prompt/error.rs +++ b/crates/milli/src/prompt/error.rs @@ -38,6 +38,16 @@ pub struct RenderPromptError { pub fault: FaultSource, } impl RenderPromptError { + pub(crate) fn missing_context_with_external_docid( + external_docid: String, + inner: liquid::Error, + ) -> RenderPromptError { + Self { + kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner), + fault: FaultSource::User, + } + } + pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError { Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User } } @@ -47,6 +57,8 @@ impl RenderPromptError { pub enum RenderPromptErrorKind { #[error("missing field in document: {0}")] MissingContext(liquid::Error), + #[error("missing field in document `{0}`: {1}")] + MissingContextWithExternalDocid(String, liquid::Error), } impl From for crate::Error { diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index ab15c31b0..8d006f0b7 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -7,17 +7,17 @@ use liquid::model::{ }; use liquid::{ObjectView, ValueView}; -use super::{FieldMetadata, FieldsIdsMapWithMetadata}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, Metadata}; use crate::GlobalFieldsIdsMap; #[derive(Debug, Clone, Copy)] pub struct FieldValue<'a, D: ObjectView> { name: &'a str, document: &'a D, - metadata: FieldMetadata, + metadata: Metadata, } -impl<'a, D: ObjectView> ValueView for FieldValue<'a, D> { +impl ValueView for FieldValue<'_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -67,7 +67,10 @@ impl<'a, D: ObjectView> FieldValue<'a, D> { } pub fn is_searchable(&self) -> &bool { - &self.metadata.searchable + match self.metadata.is_searchable() { + true => &true, + false => &false, + } } pub fn is_empty(&self) -> bool { @@ -75,7 +78,7 @@ impl<'a, D: ObjectView> FieldValue<'a, D> { } } -impl<'a, D: ObjectView> ObjectView for FieldValue<'a, D> { +impl ObjectView for FieldValue<'_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -125,15 +128,11 @@ pub struct BorrowedFields<'a, 'map, D: ObjectView> { } impl<'a, D: ObjectView> OwnedFields<'a, D> { - pub fn new(document: &'a D, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { + pub fn new(document: &'a D, field_id_map: &'a FieldIdMapWithMetadata) -> Self { Self( std::iter::repeat(document) .zip(field_id_map.iter()) - .map(|(document, (fid, name))| FieldValue { - document, - name, - metadata: field_id_map.metadata(fid).unwrap_or_default(), - }) + .map(|(document, (_fid, name, metadata))| FieldValue { document, name, metadata }) .collect(), ) } @@ -149,7 +148,7 @@ impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 'map, D> { } } -impl<'a, D: 
ObjectView> ArrayView for OwnedFields<'a, D> { +impl ArrayView for OwnedFields<'_, D> { fn as_value(&self) -> &dyn ValueView { self.0.as_value() } @@ -171,7 +170,7 @@ impl<'a, D: ObjectView> ArrayView for OwnedFields<'a, D> { } } -impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { +impl ArrayView for BorrowedFields<'_, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -187,7 +186,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&k), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); fv as _ })) @@ -207,13 +206,13 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&key), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); Some(fv as _) } } -impl<'a, 'map, D: ObjectView> ValueView for BorrowedFields<'a, 'map, D> { +impl ValueView for BorrowedFields<'_, '_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -255,7 +254,7 @@ impl<'a, 'map, D: ObjectView> ValueView for BorrowedFields<'a, 'map, D> { } } -impl<'a, D: ObjectView> ValueView for OwnedFields<'a, D> { +impl ValueView for OwnedFields<'_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -293,7 +292,7 @@ struct ArraySource<'a, 'map, D: ObjectView> { s: &'a BorrowedFields<'a, 'map, D>, } -impl<'a, 'map, D: ObjectView> fmt::Display for ArraySource<'a, 'map, D> { +impl fmt::Display for ArraySource<'_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[")?; for item in self.s.values() { @@ -308,7 +307,7 @@ struct ArrayRender<'a, 'map, D: ObjectView> { s: &'a BorrowedFields<'a, 'map, D>, } -impl<'a, 'map, D: ObjectView> fmt::Display for ArrayRender<'a, 'map, D> { +impl fmt::Display for ArrayRender<'_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for item in self.s.values() { write!(f, "{}", item.render())?; diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index bbcf054e6..a5cb8de48 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -5,11 +5,9 @@ mod fields; mod template_checker; use std::cell::RefCell; -use std::collections::BTreeMap; use std::convert::TryFrom; use std::fmt::Debug; use std::num::NonZeroUsize; -use std::ops::Deref; use bumpalo::Bump; use document::ParseableDocument; @@ -18,8 +16,9 @@ use fields::{BorrowedFields, OwnedFields}; use self::context::Context; use self::document::Document; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; -use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap}; +use crate::GlobalFieldsIdsMap; pub struct Prompt { template: liquid::Template, @@ -119,6 +118,7 @@ impl Prompt { 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents >( &self, + external_docid: &str, document: impl crate::update::new::document::Document<'a> + Debug, field_id_map: &RefCell, doc_alloc: &'doc Bump, @@ -130,9 +130,12 @@ impl Prompt { self.max_bytes.unwrap_or_else(default_max_bytes).get(), doc_alloc, ); - self.template - .render_to(&mut rendered, &context) - .map_err(RenderPromptError::missing_context)?; + self.template.render_to(&mut rendered, &context).map_err(|liquid_error| { + RenderPromptError::missing_context_with_external_docid( + external_docid.to_owned(), + liquid_error, + ) + })?; 
Ok(std::str::from_utf8(rendered.into_bump_slice()) .expect("render can only write UTF-8 because all inputs and processing preserve utf-8")) } @@ -141,9 +144,9 @@ impl Prompt { &self, document: &obkv::KvReaderU16, side: DelAdd, - field_id_map: &FieldsIdsMapWithMetadata, + field_id_map: &FieldIdMapWithMetadata, ) -> Result { - let document = Document::new(document, side, field_id_map); + let document = Document::new(document, side, field_id_map.as_fields_ids_map()); let fields = OwnedFields::new(&document, field_id_map); let context = Context::new(&document, &fields); @@ -168,40 +171,6 @@ fn truncate(s: &mut String, max_bytes: usize) { } } -pub struct FieldsIdsMapWithMetadata<'a> { - fields_ids_map: &'a FieldsIdsMap, - metadata: BTreeMap, -} - -impl<'a> FieldsIdsMapWithMetadata<'a> { - pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self { - let mut metadata: BTreeMap = - fields_ids_map.ids().map(|id| (id, Default::default())).collect(); - for searchable_field_id in searchable_fields_ids { - let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue }; - metadata.searchable = true; - } - Self { fields_ids_map, metadata } - } - - pub fn metadata(&self, field_id: FieldId) -> Option { - self.metadata.get(&field_id).copied() - } -} - -impl<'a> Deref for FieldsIdsMapWithMetadata<'a> { - type Target = FieldsIdsMap; - - fn deref(&self) -> &Self::Target { - self.fields_ids_map - } -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct FieldMetadata { - pub searchable: bool, -} - #[cfg(test)] mod test { use super::Prompt; diff --git a/crates/milli/src/score_details.rs b/crates/milli/src/score_details.rs index 1efa3b8e6..940e5f395 100644 --- a/crates/milli/src/score_details.rs +++ b/crates/milli/src/score_details.rs @@ -1,7 +1,7 @@ use std::cmp::Ordering; use itertools::Itertools; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use crate::distance_between_two_points; @@ -36,6 +36,15 @@ enum RankOrValue<'a> { Score(f64), } +#[derive(Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub enum WeightedScoreValue { + WeightedScore(f64), + Sort { asc: bool, value: serde_json::Value }, + GeoSort { asc: bool, distance: Option }, + VectorSort(f64), +} + impl ScoreDetails { pub fn local_score(&self) -> Option { self.rank().map(Rank::local_score) @@ -87,6 +96,30 @@ impl ScoreDetails { }) } + pub fn weighted_score_values<'a>( + details: impl Iterator + 'a, + weight: f64, + ) -> impl Iterator + 'a { + details + .map(ScoreDetails::rank_or_value) + .coalesce(|left, right| match (left, right) { + (RankOrValue::Rank(left), RankOrValue::Rank(right)) => { + Ok(RankOrValue::Rank(Rank::merge(left, right))) + } + (left, right) => Err((left, right)), + }) + .map(move |rank_or_value| match rank_or_value { + RankOrValue::Rank(r) => WeightedScoreValue::WeightedScore(r.local_score() * weight), + RankOrValue::Sort(s) => { + WeightedScoreValue::Sort { asc: s.ascending, value: s.value.clone() } + } + RankOrValue::GeoSort(g) => { + WeightedScoreValue::GeoSort { asc: g.ascending, distance: g.distance() } + } + RankOrValue::Score(s) => WeightedScoreValue::VectorSort(s * weight), + }) + } + fn rank_or_value(&self) -> RankOrValue<'_> { match self { ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()), @@ -423,34 +456,58 @@ pub struct Sort { pub value: serde_json::Value, } +pub fn compare_sort_values( + ascending: bool, + left: &serde_json::Value, + right: &serde_json::Value, +) -> Ordering { + use serde_json::Value::*; + match (left, right) { + 
(Null, Null) => Ordering::Equal, + (Null, _) => Ordering::Less, + (_, Null) => Ordering::Greater, + // numbers are always before strings + (Number(_), String(_)) => Ordering::Greater, + (String(_), Number(_)) => Ordering::Less, + (Number(left), Number(right)) => { + // FIXME: unwrap permitted here? + let order = left + .as_f64() + .unwrap() + .partial_cmp(&right.as_f64().unwrap()) + .unwrap_or(Ordering::Equal); + // 12 < 42, and when ascending, we want to see 12 first, so the smallest. + // Hence, when ascending, smaller is better + if ascending { + order.reverse() + } else { + order + } + } + (String(left), String(right)) => { + let order = left.cmp(right); + // Taking e.g. "a" and "z" + // "a" < "z", and when ascending, we want to see "a" first, so the smallest. + // Hence, when ascending, smaller is better + if ascending { + order.reverse() + } else { + order + } + } + (left, right) => { + tracing::warn!(%left, %right, "sort values that are neither numbers, strings or null, handling as equal"); + Ordering::Equal + } + } +} + impl PartialOrd for Sort { fn partial_cmp(&self, other: &Self) -> Option { if self.ascending != other.ascending { return None; } - match (&self.value, &other.value) { - (serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal), - (serde_json::Value::Null, _) => Some(Ordering::Less), - (_, serde_json::Value::Null) => Some(Ordering::Greater), - // numbers are always before strings - (serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater), - (serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less), - (serde_json::Value::Number(left), serde_json::Value::Number(right)) => { - // FIXME: unwrap permitted here? - let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?; - // 12 < 42, and when ascending, we want to see 12 first, so the smallest. - // Hence, when ascending, smaller is better - Some(if self.ascending { order.reverse() } else { order }) - } - (serde_json::Value::String(left), serde_json::Value::String(right)) => { - let order = left.cmp(right); - // Taking e.g. "a" and "z" - // "a" < "z", and when ascending, we want to see "a" first, so the smallest. 
- // Hence, when ascending, smaller is better - Some(if self.ascending { order.reverse() } else { order }) - } - _ => None, - } + Some(compare_sort_values(self.ascending, &self.value, &other.value)) } } diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index 027f8b176..2e74c309f 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fmt::Display; use std::ops::ControlFlow; use std::{fmt, mem}; @@ -9,8 +9,9 @@ use indexmap::IndexMap; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use crate::error::UserError; +use crate::attribute_patterns::match_field_legacy; use crate::facet::FacetType; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; @@ -18,7 +19,7 @@ use crate::heed_codec::{BytesRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter::{ count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, }; -use crate::{FieldId, Index, Result}; +use crate::{Error, FieldId, FilterableAttributesRule, Index, PatternMatch, Result, UserError}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -219,12 +220,19 @@ impl<'a> FacetDistribution<'a> { let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); - let original_string = self - .index - .field_id_docid_facet_strings - .get(self.rtxn, &key)? - .unwrap() - .to_owned(); + let optional_original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?; + + let original_string = match optional_original_string { + Some(original_string) => original_string.to_owned(), + None => { + tracing::error!( + "Missing original facet string. 
Using the normalized facet {} instead", + facet_key + ); + facet_key.to_string() + } + }; distribution.insert(original_string, nbr_docids); if distribution.len() == self.max_values_per_facet { @@ -280,37 +288,19 @@ impl<'a> FacetDistribution<'a> { } pub fn compute_stats(&self) -> Result> { - let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; let candidates = if let Some(candidates) = self.candidates.clone() { candidates } else { return Ok(Default::default()); }; - let fields = match &self.facets { - Some(facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + if self.select_field(name, &filterable_attributes_rules) { let min_value = if let Some(min_value) = crate::search::facet::facet_min_value( self.index, self.rtxn, @@ -341,31 +331,12 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; - - let fields = match self.facets { - Some(ref facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + if self.select_field(name, &filterable_attributes_rules) { let order_by = self .facets .as_ref() @@ -378,6 +349,71 @@ impl<'a> FacetDistribution<'a> { Ok(distribution) } + + /// Select a field if it is filterable and in the facets. + fn select_field( + &self, + name: &str, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> bool { + // If the field is not filterable, we don't want to compute the facet distribution. + if !matching_features(name, filterable_attributes_rules) + .is_some_and(|(_, features)| features.is_filterable()) + { + return false; + } + + match &self.facets { + Some(facets) => { + // The list of facets provided by the user is a legacy pattern ("dog.age" must be selected with "dog"). + facets.keys().any(|key| match_field_legacy(key, name) == PatternMatch::Match) + } + None => true, + } + } + + /// Check if the fields in the facets are valid filterable fields. 
+ fn check_faceted_fields( + &self, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> Result<()> { + let mut invalid_facets = BTreeSet::new(); + let mut matching_rule_indices = HashMap::new(); + + if let Some(facets) = &self.facets { + for field in facets.keys() { + let matched_rule = matching_features(field, filterable_attributes_rules); + let is_filterable = matched_rule.is_some_and(|(_, f)| f.is_filterable()); + + if !is_filterable { + invalid_facets.insert(field.to_string()); + + // If the field matched a rule but that rule doesn't enable filtering, + // store the rule index for better error messages + if let Some((rule_index, _)) = matched_rule { + matching_rule_indices.insert(field.to_string(), rule_index); + } + } + } + } + + if !invalid_facets.is_empty() { + let valid_patterns = + filtered_matching_patterns(filterable_attributes_rules, &|features| { + features.is_filterable() + }) + .into_iter() + .map(String::from) + .collect(); + return Err(Error::UserError(UserError::InvalidFacetsDistribution { + invalid_facets_name: invalid_facets, + valid_patterns, + matching_rule_indices, + })); + } + + Ok(()) + } } impl fmt::Debug for FacetDistribution<'_> { @@ -405,11 +441,10 @@ mod tests { use std::iter; use big_s::S; - use maplit::hashset; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::{milli_snap, FacetDistribution, OrderBy}; + use crate::{milli_snap, FacetDistribution, FilterableAttributesRule, OrderBy}; #[test] fn few_candidates_few_facet_values() { @@ -419,7 +454,9 @@ mod tests { let index = TempIndex::new(); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let documents = documents!([ @@ -490,7 +527,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; @@ -575,7 +614,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::>(); @@ -634,7 +675,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -685,7 +728,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -736,7 +781,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! 
{ S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); @@ -787,7 +834,9 @@ mod tests { let index = TempIndex::new_with_map_size(4096 * 10_000); index - .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("colour"))]) + }) .unwrap(); let facet_values = (0..1000).collect::>(); diff --git a/crates/milli/src/search/facet/facet_range_search.rs b/crates/milli/src/search/facet/facet_range_search.rs index 0f8f58771..5fe2366a1 100644 --- a/crates/milli/src/search/facet/facet_range_search.rs +++ b/crates/milli/src/search/facet/facet_range_search.rs @@ -79,7 +79,7 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { docids: &'bitmap mut RoaringBitmap, } -impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { +impl<'t> FacetRangeSearch<'t, '_, '_> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; @@ -132,12 +132,12 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { /// /// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to - /// the roaring bitmap. + /// the roaring bitmap. /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively - /// on the children of the element from the level below. + /// on the children of the element from the level below. /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating. 
- /// Note that the right bound is found through either the `left_bound` of the *next* element, - /// or from the `rightmost_bound` argument + /// Note that the right bound is found through either the `left_bound` of the *next* element, + /// or from the `rightmost_bound` argument /// /// ## Arguments /// - `level`: the level being visited diff --git a/crates/milli/src/search/facet/facet_sort_ascending.rs b/crates/milli/src/search/facet/facet_sort_ascending.rs index 59a95e5bd..115f920ab 100644 --- a/crates/milli/src/search/facet/facet_sort_ascending.rs +++ b/crates/milli/src/search/facet/facet_sort_ascending.rs @@ -62,7 +62,7 @@ struct AscendingFacetSort<'t, 'e> { )>, } -impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { +impl<'t> Iterator for AscendingFacetSort<'t, '_> { type Item = Result<(RoaringBitmap, &'t [u8])>; fn next(&mut self) -> Option { diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c059d2d27..3505f7d4a 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; @@ -10,13 +10,16 @@ use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, UserError}; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, OrderedF64Codec, }; use crate::index::db_name::FACET_ID_STRING_DOCIDS; use crate::{ - distance_between_two_points, lat_lng_to_xyz, FieldId, Index, InternalError, Result, + distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap, + FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result, SerializationError, }; @@ -59,29 +62,29 @@ impl Display for BadGeoError { #[derive(Debug)] enum FilterError<'a> { - AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet }, + AttributeNotFilterable { attribute: &'a str, filterable_patterns: BTreeSet<&'a str> }, ParseGeoError(BadGeoError), TooDeep, } -impl<'a> std::error::Error for FilterError<'a> {} +impl std::error::Error for FilterError<'_> {} -impl<'a> From for FilterError<'a> { +impl From for FilterError<'_> { fn from(geo_error: BadGeoError) -> Self { FilterError::ParseGeoError(geo_error) } } -impl<'a> Display for FilterError<'a> { +impl Display for FilterError<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::AttributeNotFilterable { attribute, filterable_fields } => { + Self::AttributeNotFilterable { attribute, filterable_patterns } => { write!(f, "Attribute `{attribute}` is not filterable.")?; - if filterable_fields.is_empty() { + if filterable_patterns.is_empty() { write!(f, " This index does not have configured filterable attributes.") } else { - write!(f, " Available filterable attributes are: ")?; + write!(f, " Available filterable attribute patterns are: ")?; let mut filterables_list = - filterable_fields.iter().map(AsRef::as_ref).collect::>(); + filterable_patterns.iter().map(AsRef::as_ref).collect::>(); filterables_list.sort_unstable(); for (idx, filterable) in filterables_list.iter().enumerate() { write!(f, "`{filterable}`")?; @@ -229,8 +232,27 @@ impl<'a> Filter<'a> { impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: 
&heed::RoTxn<'_>, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let filterable_fields = index.filterable_fields(rtxn)?; - self.inner_evaluate(rtxn, index, &filterable_fields, None) + let fields_ids_map = index.fields_ids_map(rtxn)?; + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + for fid in self.condition.fids(MAX_FILTER_DEPTH) { + let attribute = fid.value(); + if matching_features(attribute, &filterable_attributes_rules) + .is_some_and(|(_, features)| features.is_filterable()) + { + continue; + } + + // If the field is not filterable, return an error + return Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute, + filterable_patterns: filtered_matching_patterns( + &filterable_attributes_rules, + &|features| features.is_filterable(), + ), + }))?; + } + + self.inner_evaluate(rtxn, index, &fields_ids_map, &filterable_attributes_rules, None) } fn evaluate_operator( @@ -239,6 +261,8 @@ impl<'a> Filter<'a> { field_id: FieldId, universe: Option<&RoaringBitmap>, operator: &Condition<'a>, + features: &FilterableAttributesFeatures, + rule_index: usize, ) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; @@ -248,6 +272,38 @@ impl<'a> Filter<'a> { // field id and the level. let (left, right) = match operator { + // return an error if the filter is not allowed for this field + Condition::GreaterThan(_) + | Condition::GreaterThanOrEqual(_) + | Condition::LowerThan(_) + | Condition::LowerThanOrEqual(_) + | Condition::Between { .. } + if !features.is_filterable_comparison() => + { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Empty if !features.is_filterable_empty() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Null if !features.is_filterable_null() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Exists if !features.is_filterable_exists() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } + Condition::Equal(_) | Condition::NotEqual(_) if !features.is_filterable_equality() => { + return Err(generate_filter_error( + rtxn, index, field_id, operator, features, rule_index, + )); + } Condition::GreaterThan(val) => { (Excluded(val.parse_finite_float()?), Included(f64::MAX)) } @@ -297,7 +353,9 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; + let docids = Self::evaluate_operator( + rtxn, index, field_id, None, &operator, features, rule_index, + )?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } @@ -399,10 +457,11 @@ impl<'a> Filter<'a> { &self, rtxn: &heed::RoTxn<'_>, index: &Index, - filterable_fields: &HashSet, + field_ids_map: &FieldsIdsMap, + filterable_attribute_rules: &[FilterableAttributesRule], universe: Option<&RoaringBitmap>, ) -> Result { - if universe.map_or(false, |u| u.is_empty()) { + if universe.is_some_and(|u| u.is_empty()) { return Ok(RoaringBitmap::new()); } @@ -412,7 +471,8 @@ impl<'a> Filter<'a> { &(f.as_ref().clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; match universe { @@ -424,42 +484,49 @@ impl<'a> Filter<'a> { } } 
FilterCondition::In { fid, els } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - els.iter() - .map(|el| Condition::Equal(el.clone())) - .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) - .union() - } else { - Ok(RoaringBitmap::new()) - } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? - } + let Some(field_id) = field_ids_map.id(fid.value()) else { + return Ok(RoaringBitmap::new()); + }; + let Some((rule_index, features)) = + matching_features(fid.value(), filterable_attribute_rules) + else { + return Ok(RoaringBitmap::new()); + }; + + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| { + Self::evaluate_operator( + rtxn, index, field_id, universe, &op, &features, rule_index, + ) + }) + .union() } FilterCondition::Condition { fid, op } => { - if crate::is_faceted(fid.value(), filterable_fields) { - let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, universe, op) - } else { - Ok(RoaringBitmap::new()) - } - } else { - Err(fid.as_external_error(FilterError::AttributeNotFilterable { - attribute: fid.value(), - filterable_fields: filterable_fields.clone(), - }))? - } + let Some(field_id) = field_ids_map.id(fid.value()) else { + return Ok(RoaringBitmap::new()); + }; + let Some((rule_index, features)) = + matching_features(fid.value(), filterable_attribute_rules) + else { + return Ok(RoaringBitmap::new()); + }; + + Self::evaluate_operator(rtxn, index, field_id, universe, op, &features, rule_index) } FilterCondition::Or(subfilters) => subfilters .iter() .cloned() - .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) + .map(|f| { + Self::inner_evaluate( + &f.into(), + rtxn, + index, + field_ids_map, + filterable_attribute_rules, + universe, + ) + }) .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); @@ -468,7 +535,8 @@ impl<'a> Filter<'a> { &(first_subfilter.clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; for f in subfilters_iter { @@ -482,7 +550,8 @@ impl<'a> Filter<'a> { &(f.clone()).into(), rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, Some(&bitmap), )?; } @@ -492,7 +561,7 @@ impl<'a> Filter<'a> { } } FilterCondition::GeoLowerThan { point, radius } => { - if filterable_fields.contains("_geo") { + if index.is_geo_filtering_enabled(rtxn)? { let base_point: [f64; 2] = [point[0].parse_finite_float()?, point[1].parse_finite_float()?]; if !(-90.0..=90.0).contains(&base_point[0]) { @@ -521,13 +590,16 @@ impl<'a> Filter<'a> { Ok(result) } else { Err(point[0].as_external_error(FilterError::AttributeNotFilterable { - attribute: "_geo", - filterable_fields: filterable_fields.clone(), + attribute: RESERVED_GEO_FIELD_NAME, + filterable_patterns: filtered_matching_patterns( + filterable_attribute_rules, + &|features| features.is_filterable(), + ), }))? } } FilterCondition::GeoBoundingBox { top_right_point, bottom_left_point } => { - if filterable_fields.contains("_geo") { + if index.is_geo_filtering_enabled(rtxn)? 
{ let top_right: [f64; 2] = [ top_right_point[0].parse_finite_float()?, top_right_point[1].parse_finite_float()?, @@ -582,7 +654,8 @@ impl<'a> Filter<'a> { let selected_lat = Filter { condition: condition_lat }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -615,7 +688,8 @@ impl<'a> Filter<'a> { let left = Filter { condition: condition_left }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -629,7 +703,8 @@ impl<'a> Filter<'a> { let right = Filter { condition: condition_right }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )?; @@ -645,7 +720,8 @@ impl<'a> Filter<'a> { Filter { condition: condition_lng }.inner_evaluate( rtxn, index, - filterable_fields, + field_ids_map, + filterable_attribute_rules, universe, )? }; @@ -654,8 +730,11 @@ impl<'a> Filter<'a> { } else { Err(top_right_point[0].as_external_error( FilterError::AttributeNotFilterable { - attribute: "_geo", - filterable_fields: filterable_fields.clone(), + attribute: RESERVED_GEO_FIELD_NAME, + filterable_patterns: filtered_matching_patterns( + filterable_attribute_rules, + &|features| features.is_filterable(), + ), }, ))? } @@ -664,6 +743,28 @@ impl<'a> Filter<'a> { } } +fn generate_filter_error( + rtxn: &heed::RoTxn<'_>, + index: &Index, + field_id: FieldId, + operator: &Condition<'_>, + features: &FilterableAttributesFeatures, + rule_index: usize, +) -> Error { + match index.fields_ids_map(rtxn) { + Ok(fields_ids_map) => { + let field = fields_ids_map.name(field_id).unwrap_or_default(); + Error::UserError(UserError::FilterOperatorNotAllowed { + field: field.to_string(), + allowed_operators: features.allowed_filter_operators(), + operator: operator.operator().to_string(), + rule_index, + }) + } + Err(e) => e.into(), + } +} + impl<'a> From> for Filter<'a> { fn from(fc: FilterCondition<'a>) -> Self { Self { condition: fc } @@ -677,11 +778,12 @@ mod tests { use big_s::S; use either::Either; - use maplit::hashset; + use meili_snap::snapshot; use roaring::RoaringBitmap; + use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{Filter, FilterableAttributesRule}; #[test] fn empty_db() { @@ -689,7 +791,9 @@ mod tests { //Set the filterable fields to be the channel. index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("PrIcE") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "PrIcE".to_string(), + )]); }) .unwrap(); @@ -773,27 +877,32 @@ mod tests { let rtxn = index.read_txn().unwrap(); let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. This index does not have configured filterable attributes. + 12:14 _geoRadius(42, 150, 10) + "###); let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. 
This index does not have configured filterable attributes. + 18:20 _geoBoundingBox([42, 150], [30, 10]) + "###); let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `dog` is not filterable. This index does not have configured filterable attributes." - )); + snapshot!(error.to_string(), @r###" + Attribute `dog` is not filterable. This index does not have configured filterable attributes. + 1:4 dog = "bernese mountain" + "###); drop(rtxn); index .update_settings(|settings| { settings.set_searchable_fields(vec![S("title")]); - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -801,21 +910,45 @@ mod tests { let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. Available filterable attribute patterns are: `title`. + 12:16 _geoRadius(-100, 150, 10) + "###); let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `_geo` is not filterable. Available filterable attribute patterns are: `title`. + 18:20 _geoBoundingBox([42, 150], [30, 10]) + "###); let filter = Filter::from_str("name = 12").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); - assert!(error.to_string().starts_with( - "Attribute `name` is not filterable. Available filterable attributes are: `title`." - )); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 1:5 name = 12 + "###); + + let filter = Filter::from_str("title = \"test\" AND name = 12").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name = 12 + "###); + + let filter = Filter::from_str("title = \"test\" AND name IN [12]").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name IN [12] + "###); + + let filter = Filter::from_str("title = \"test\" AND name != 12").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + snapshot!(error.to_string(), @r###" + Attribute `name` is not filterable. Available filterable attribute patterns are: `title`. + 20:24 title = "test" AND name != 12 + "###); } #[test] @@ -841,7 +974,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("monitor_diagonal"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "monitor_diagonal".to_string(), + )]); }) .unwrap(); @@ -872,7 +1007,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! 
{ S("_geo") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S( + RESERVED_GEO_FIELD_NAME, + ))]); }) .unwrap(); @@ -884,7 +1021,7 @@ mod tests { "address": "Viale Vittorio Veneto, 30, 20124, Milan, Italy", "type": "pizza", "rating": 9, - "_geo": { + RESERVED_GEO_FIELD_NAME: { "lat": 45.4777599, "lng": 9.1967508 } @@ -895,7 +1032,7 @@ mod tests { "address": "Via Dogana, 1, 20123 Milan, Italy", "type": "ice cream", "rating": 10, - "_geo": { + RESERVED_GEO_FIELD_NAME: { "lat": 45.4632046, "lng": 9.1719421 } @@ -918,8 +1055,11 @@ mod tests { index .update_settings(|settings| { - settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S("_geo"), S("price") }); + settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field("price".to_string()), + ]); }) .unwrap(); @@ -968,8 +1108,11 @@ mod tests { index .update_settings(|settings| { - settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S("_geo"), S("price") }); + settings.set_searchable_fields(vec![S(RESERVED_GEO_FIELD_NAME), S("price")]); // to keep the fields order + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field("price".to_string()), + ]); }) .unwrap(); @@ -1079,7 +1222,9 @@ mod tests { index .update_settings(|settings| { settings.set_searchable_fields(vec![S("price")]); // to keep the fields order - settings.set_filterable_fields(hashset! { S("price") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "price".to_string(), + )]); }) .unwrap(); index @@ -1135,7 +1280,11 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! 
{ S("id"), S("one"), S("two") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("id".to_string()), + FilterableAttributesRule::Field("one".to_string()), + FilterableAttributesRule::Field("two".to_string()), + ]); }) .unwrap(); diff --git a/crates/milli/src/search/facet/search.rs b/crates/milli/src/search/facet/search.rs index cdba7ee16..3e5fc62f2 100644 --- a/crates/milli/src/search/facet/search.rs +++ b/crates/milli/src/search/facet/search.rs @@ -10,6 +10,7 @@ use roaring::RoaringBitmap; use tracing::error; use crate::error::UserError; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::search::build_dfa; use crate::{DocumentId, FieldId, OrderBy, Result, Search}; @@ -73,25 +74,34 @@ impl<'a> SearchForFacetValues<'a> { let index = self.search_query.index; let rtxn = self.search_query.rtxn; - let filterable_fields = index.filterable_fields(rtxn)?; - if !filterable_fields.contains(&self.facet) { - let (valid_fields, hidden_fields) = - index.remove_hidden_fields(rtxn, filterable_fields)?; + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let matched_rule = matching_features(&self.facet, &filterable_attributes_rules); + let is_facet_searchable = + matched_rule.is_some_and(|(_, features)| features.is_facet_searchable()); + + if !is_facet_searchable { + let matching_field_names = + filtered_matching_patterns(&filterable_attributes_rules, &|features| { + features.is_facet_searchable() + }); + let (valid_patterns, hidden_fields) = + index.remove_hidden_fields(rtxn, matching_field_names)?; + + // Get the matching rule index if any rule matched the attribute + let matching_rule_index = matched_rule.map(|(rule_index, _)| rule_index); return Err(UserError::InvalidFacetSearchFacetName { field: self.facet.clone(), - valid_fields, + valid_patterns, hidden_fields, + matching_rule_index, } .into()); - } + }; let fields_ids_map = index.fields_ids_map(rtxn)?; - let fid = match fields_ids_map.id(&self.facet) { - Some(fid) => fid, - // we return an empty list of results when the attribute has been - // set as filterable but no document contains this field (yet). - None => return Ok(Vec::new()), + let Some(fid) = fields_ids_map.id(&self.facet) else { + return Ok(Vec::new()); }; let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? 
{ @@ -125,7 +135,7 @@ impl<'a> SearchForFacetValues<'a> { if authorize_typos && field_authorizes_typos { let exact_words_fst = self.search_query.index.exact_words(rtxn)?; - if exact_words_fst.map_or(false, |fst| fst.contains(query)) { + if exact_words_fst.is_some_and(|fst| fst.contains(query)) { if fst.contains(query) { self.fetch_original_facets_using_normalized( fid, diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index 5187b572b..81f74fdad 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -151,7 +151,7 @@ impl ScoreWithRatioResult { } } -impl<'a> Search<'a> { +impl Search<'_> { #[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")] pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<(SearchResult, Option)> { // TODO: find classier way to achieve that than to reset vector and query params @@ -203,11 +203,15 @@ impl<'a> Search<'a> { let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); - match embedder.embed_one(query, Some(deadline)) { + match embedder.embed_search(&query, Some(deadline)) { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); - return Ok((keyword_results, Some(0))); + return Ok(return_keyword_results( + self.limit, + self.offset, + keyword_results, + )); } } } diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index d5b05f515..0dd639c59 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -9,6 +9,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult}; +use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::vector::Embedder; use crate::{ @@ -187,14 +188,29 @@ impl<'a> Search<'a> { } if let Some(distinct) = &self.distinct { - let filterable_fields = ctx.index.filterable_fields(ctx.txn)?; - if !crate::is_faceted(distinct, &filterable_fields) { - let (valid_fields, hidden_fields) = - ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?; + let filterable_fields = ctx.index.filterable_attributes_rules(ctx.txn)?; + // check if the distinct field is in the filterable fields + let matched_rule = matching_features(distinct, &filterable_fields); + let is_filterable = matched_rule.is_some_and(|(_, features)| features.is_filterable()); + + if !is_filterable { + // if not, remove the hidden fields from the filterable fields to generate the error message + let matching_patterns = + filtered_matching_patterns(&filterable_fields, &|features| { + features.is_filterable() + }); + let (valid_patterns, hidden_fields) = + ctx.index.remove_hidden_fields(ctx.txn, matching_patterns)?; + + // Get the matching rule index if any rule matched the attribute + let matching_rule_index = matched_rule.map(|(rule_index, _)| rule_index); + + // and return the error return Err(Error::UserError(UserError::InvalidDistinctAttribute { field: distinct.clone(), - valid_fields, + valid_patterns, hidden_fields, + matching_rule_index, })); } } diff --git a/crates/milli/src/search/new/db_cache.rs b/crates/milli/src/search/new/db_cache.rs index 243303ba2..1db82e6fb 100644 --- a/crates/milli/src/search/new/db_cache.rs +++ b/crates/milli/src/search/new/db_cache.rs @@ 
-537,7 +537,7 @@ impl<'ctx> SearchContext<'ctx> { fid: u16, ) -> Result> { // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + if self.restricted_fids.as_ref().is_some_and(|fids| !fids.contains(&fid)) { return Ok(None); } @@ -558,7 +558,7 @@ impl<'ctx> SearchContext<'ctx> { fid: u16, ) -> Result> { // if the requested fid isn't in the restricted list, return None. - if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) { + if self.restricted_fids.as_ref().is_some_and(|fids| !fids.contains(&fid)) { return Ok(None); } diff --git a/crates/milli/src/search/new/distinct.rs b/crates/milli/src/search/new/distinct.rs index d2ace6ffb..17859b6f8 100644 --- a/crates/milli/src/search/new/distinct.rs +++ b/crates/milli/src/search/new/distinct.rs @@ -18,10 +18,10 @@ pub struct DistinctOutput { /// Return a [`DistinctOutput`] containing: /// - `remaining`: a set of docids built such that exactly one element from `candidates` -/// is kept for each distinct value inside the given field. If the field does not exist, it -/// is considered unique. +/// is kept for each distinct value inside the given field. If the field does not exist, it +/// is considered unique. /// - `excluded`: the set of document ids that contain a value for the given field that occurs -/// in the given candidates. +/// in the given candidates. pub fn apply_distinct_rule( ctx: &mut SearchContext<'_>, field_id: u16, diff --git a/crates/milli/src/search/new/matches/best_match_interval.rs b/crates/milli/src/search/new/matches/best_match_interval.rs index a6497f351..1a8914e98 100644 --- a/crates/milli/src/search/new/matches/best_match_interval.rs +++ b/crates/milli/src/search/new/matches/best_match_interval.rs @@ -72,7 +72,7 @@ pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; let interval_score = get_interval_score(&matches[interval_first..=interval_last]); let is_interval_score_better = &best_interval .as_ref() - .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score); + .is_none_or(|MatchIntervalWithScore { score, .. }| interval_score > *score); if *is_interval_score_better { best_interval = Some(MatchIntervalWithScore { diff --git a/crates/milli/src/search/new/matches/matching_words.rs b/crates/milli/src/search/new/matches/matching_words.rs index 1f30a17ad..64235298b 100644 --- a/crates/milli/src/search/new/matches/matching_words.rs +++ b/crates/milli/src/search/new/matches/matching_words.rs @@ -149,7 +149,7 @@ pub type WordId = u16; /// A given token can partially match a query word for several reasons: /// - split words /// - multi-word synonyms -/// In these cases we need to match consecutively several tokens to consider that the match is full. +/// In these cases we need to match consecutively several tokens to consider that the match is full. 
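// Illustrative toy sketch of the consecutive-token idea described in the doc
// comment above (not the crate's matcher; the helper name is made up): a partial
// match such as the split word "sun flower" for the query word "sunflower" only
// counts as a full match when every expected token is found consecutively and in
// order.
fn consecutive_tokens_match(tokens: &[&str], start: usize, expected: &[&str]) -> bool {
    tokens
        .get(start..start + expected.len())
        .is_some_and(|window| window == expected)
}
// consecutive_tokens_match(&["the", "sun", "flower"], 1, &["sun", "flower"]) == true
// consecutive_tokens_match(&["the", "sun", "set"], 1, &["sun", "flower"]) == false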
#[derive(Debug, PartialEq)] pub enum MatchType<'a> { Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive }, diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index d7bc27c94..e30f11e94 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -8,17 +8,19 @@ use std::cmp::{max, min}; use charabia::{Language, SeparatorKind, Token, Tokenizer}; use either::Either; +use itertools::Itertools; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch}; use r#match::{Match, MatchPosition}; -use serde::Serialize; +use serde::{Deserialize, Serialize}; use simple_token_kind::SimpleTokenKind; +use utoipa::ToSchema; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; -/// Structure used to build a Matcher allowing to customize formating tags. +/// Structure used to build a Matcher allowing to customize formatting tags. pub struct MatcherBuilder<'m> { matching_words: MatchingWords, tokenizer: Tokenizer<'m>, @@ -100,11 +102,11 @@ impl FormatOptions { } } -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, ToSchema)] pub struct MatchBounds { pub start: usize, pub length: usize, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(skip_serializing_if = "Option::is_none", default)] pub indices: Option>, } @@ -121,7 +123,7 @@ pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { matches: Option<(Vec>, Vec)>, } -impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { +impl<'t> Matcher<'t, '_, '_, '_> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { /// some words are counted as matches only if they are close together and in the good order, @@ -228,8 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .iter() .map(|m| MatchBounds { start: tokens[m.get_first_token_pos()].byte_start, - // TODO: Why is this in chars, while start is in bytes? - length: m.char_count, + length: self.calc_byte_length(tokens, m), indices: if array_indices.is_empty() { None } else { @@ -240,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } + fn calc_byte_length(&self, tokens: &[Token<'t>], m: &Match) -> usize { + (m.get_first_token_pos()..=m.get_last_token_pos()) + .flat_map(|i| match &tokens[i].char_map { + Some(char_map) => { + char_map.iter().map(|(original, _)| *original as usize).collect_vec() + } + None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(), + }) + .take(m.char_count) + .sum() + } + /// Returns the bounds in byte index of the crop window. 
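// Minimal sketch of the conversion `calc_byte_length` performs above, shown on a
// plain &str instead of charabia tokens (the helper name is hypothetical, and the
// real code prefers the token's `char_map` when one is available): the returned
// `MatchBounds::length` is the byte length of the first `char_count` characters,
// so multi-byte characters are counted correctly.
fn byte_len_of_first_chars(text: &str, char_count: usize) -> usize {
    text.chars().take(char_count).map(|c| c.len_utf8()).sum()
}
// byte_len_of_first_chars("été", 2) == 3  // 'é' is 2 bytes, 't' is 1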
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] { let ( @@ -274,7 +287,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { last_match_last_token_position_plus_one } else { // we have matched the end of possible tokens, there's nothing to advance - tokens.len() - 1 + tokens.len() } }; diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index f7c590360..b9161b417 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -49,6 +49,8 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; +use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::index::PrefixSearch; use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; @@ -68,6 +70,7 @@ pub struct SearchContext<'ctx> { pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, pub restricted_fids: Option, + pub prefix_search: PrefixSearch, } impl<'ctx> SearchContext<'ctx> { @@ -85,6 +88,8 @@ impl<'ctx> SearchContext<'ctx> { } } + let prefix_search = index.prefix_search(txn)?.unwrap_or_default(); + Ok(Self { index, txn, @@ -94,9 +99,14 @@ impl<'ctx> SearchContext<'ctx> { term_interner: <_>::default(), phrase_docids: <_>::default(), restricted_fids: None, + prefix_search, }) } + pub fn is_prefix_search_allowed(&self) -> bool { + self.prefix_search != PrefixSearch::Disabled + } + pub fn attributes_to_search_on( &mut self, attributes_to_search_on: &'ctx [String], @@ -553,7 +563,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( Ok(()) } -#[tracing::instrument(level = "trace", skip_all, target = "search::universe")] +#[tracing::instrument(level = "debug", skip_all, target = "search::universe")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>, @@ -854,12 +864,12 @@ fn check_sort_criteria( } .into()); } - Member::Geo(_) if !sortable_fields.contains("_geo") => { + Member::Geo(_) if !sortable_fields.contains(RESERVED_GEO_FIELD_NAME) => { let (valid_fields, hidden_fields) = ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; return Err(UserError::InvalidSortableAttribute { - field: "_geo".to_string(), + field: RESERVED_GEO_FIELD_NAME.to_string(), valid_fields, hidden_fields, } diff --git a/crates/milli/src/search/new/query_graph.rs b/crates/milli/src/search/new/query_graph.rs index 9ab5d9dad..4235614c3 100644 --- a/crates/milli/src/search/new/query_graph.rs +++ b/crates/milli/src/search/new/query_graph.rs @@ -21,9 +21,9 @@ use crate::Result; /// 1. `Start` : unique, represents the start of the query /// 2. `End` : unique, represents the end of a query /// 3. `Deleted` : represents a node that was deleted. -/// All deleted nodes are unreachable from the start node. +/// All deleted nodes are unreachable from the start node. /// 4. `Term` is a regular node representing a word or combination of words -/// from the user query. +/// from the user query. 
#[derive(Clone)] pub struct QueryNode { pub data: QueryNodeData, @@ -327,7 +327,7 @@ impl QueryGraph { let mut peekable = term_with_frequency.into_iter().peekable(); while let Some((idx, frequency)) = peekable.next() { term_weight.insert(idx, weight); - if peekable.peek().map_or(false, |(_, f)| frequency != *f) { + if peekable.peek().is_some_and(|(_, f)| frequency != *f) { weight += 1; } } diff --git a/crates/milli/src/search/new/query_term/compute_derivations.rs b/crates/milli/src/search/new/query_term/compute_derivations.rs index e2a136328..52a230b01 100644 --- a/crates/milli/src/search/new/query_term/compute_derivations.rs +++ b/crates/milli/src/search/new/query_term/compute_derivations.rs @@ -215,7 +215,7 @@ pub fn partially_initialized_term_from_word( let mut zero_typo = None; let mut prefix_of = BTreeSet::new(); - if fst.contains(word) { + if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() { zero_typo = Some(word_interned); } @@ -418,7 +418,7 @@ fn split_best_frequency( let right = ctx.word_interner.insert(right.to_owned()); if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? { - if best.map_or(true, |(old, _, _)| frequency > old) { + if best.is_none_or(|(old, _, _)| frequency > old) { best = Some((frequency, left, right)); } } diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index bb98f19ce..e492363f8 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -28,6 +28,7 @@ pub fn located_query_terms_from_tokens( words_limit: Option, ) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; + let allow_prefix_search = ctx.is_prefix_search_allowed(); let mut query_terms = Vec::new(); @@ -94,7 +95,7 @@ pub fn located_query_terms_from_tokens( ctx, word, nbr_typos(word), - true, + allow_prefix_search, false, )?; let located_term = LocatedQueryTerm { @@ -202,7 +203,7 @@ pub fn number_of_typos_allowed<'ctx>( Ok(Box::new(move |word: &str| { if !authorize_typos || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + || exact_words.as_ref().is_some_and(|fst| fst.contains(word)) { 0 } else if word.len() < min_len_two_typos as usize { diff --git a/crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index cacdcfa9f..65580bce5 100644 --- a/crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -8,7 +8,7 @@ with them, they are "unconditional". These kinds of edges are used to "skip" a n The algorithm uses a depth-first search. It benefits from two main optimisations: - The list of all possible costs to go from any node to the END node is precomputed - The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges -untraversable depending on what other edges were selected. + untraversable depending on what other edges were selected. These two optimisations are meant to avoid traversing edges that wouldn't lead to a valid path. In practically all cases, we avoid the exponential complexity @@ -24,6 +24,7 @@ For example, the DeadEndsCache could say the following: - if we take `g`, then `[f]` is also forbidden - etc. - etc. + As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden conditions in memory. 
Then, we know to avoid all edges which have a condition that is forbidden. diff --git a/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 67775ddea..5f0c37cc3 100644 --- a/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -57,6 +57,7 @@ impl RankingRuleGraphTrait for FidGraph { let term = to_term; let mut all_fields = FxHashSet::default(); + let mut current_max_weight = 0; for word in term.term_subset.all_single_words_except_prefix_db(ctx)? { let fields = ctx.get_db_word_fids(word.interned())?; all_fields.extend(fields); @@ -81,6 +82,9 @@ impl RankingRuleGraphTrait for FidGraph { let weight = weights_map .weight(fid) .ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?; + if weight > current_max_weight { + current_max_weight = weight; + } edges.push(( weight as u32 * term.term_ids.len() as u32, conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }), @@ -88,10 +92,10 @@ impl RankingRuleGraphTrait for FidGraph { } // always lookup the max_fid if we don't already and add an artificial condition for max scoring - let max_weight: Option = weights_map.max_weight(); + let max_weight = ctx.index.max_searchable_attribute_weight(ctx.txn)?; if let Some(max_weight) = max_weight { - if !all_fields.contains(&max_weight) { + if current_max_weight < max_weight { edges.push(( max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. conditions_interner.insert(FidCondition { diff --git a/crates/milli/src/search/new/ranking_rule_graph/mod.rs b/crates/milli/src/search/new/ranking_rule_graph/mod.rs index 670402bcb..041cb99b8 100644 --- a/crates/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/crates/milli/src/search/new/ranking_rule_graph/mod.rs @@ -58,7 +58,7 @@ pub struct ComputedCondition { /// 2. The cost of traversing this edge /// 3. The condition associated with it /// 4. The list of nodes that have to be skipped -/// if this edge is traversed. +/// if this edge is traversed. #[derive(Clone)] pub struct Edge { pub source_node: Interned, diff --git a/crates/milli/src/search/new/resolve_query_graph.rs b/crates/milli/src/search/new/resolve_query_graph.rs index 7a47b0a66..3bbe699b2 100644 --- a/crates/milli/src/search/new/resolve_query_graph.rs +++ b/crates/milli/src/search/new/resolve_query_graph.rs @@ -17,7 +17,7 @@ use crate::Result; pub struct PhraseDocIdsCache { pub cache: FxHashMap, RoaringBitmap>, } -impl<'ctx> SearchContext<'ctx> { +impl SearchContext<'_> { /// Get the document ids associated with the given phrase pub fn get_phrase_docids(&mut self, phrase: Interned) -> Result<&RoaringBitmap> { if self.phrase_docids.cache.contains_key(&phrase) { @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - let mut candidates = RoaringBitmap::new(); + let mut candidates = None; for word in words.iter().flatten().copied() { if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? 
{ - candidates |= word_docids; + if let Some(candidates) = candidates.as_mut() { + *candidates &= word_docids; + } else { + candidates = Some(word_docids); + } } else { return Ok(RoaringBitmap::new()); } } + let Some(mut candidates) = candidates else { + return Ok(RoaringBitmap::new()); + }; + let winsize = words.len().min(3); for win in words.windows(winsize) { diff --git a/crates/milli/src/search/new/small_bitmap.rs b/crates/milli/src/search/new/small_bitmap.rs index 3fe404622..174aa6d0b 100644 --- a/crates/milli/src/search/new/small_bitmap.rs +++ b/crates/milli/src/search/new/small_bitmap.rs @@ -263,7 +263,7 @@ impl SmallBitmapInternal { pub fn contains(&self, x: u16) -> bool { let (set, x) = self.get_set_index(x); - set & 0b1 << x != 0 + set & (0b1 << x) != 0 } pub fn insert(&mut self, x: u16) { @@ -381,7 +381,7 @@ pub enum SmallBitmapInternalIter<'b> { Tiny(u64), Small { cur: u64, next: &'b [u64], base: u16 }, } -impl<'b> Iterator for SmallBitmapInternalIter<'b> { +impl Iterator for SmallBitmapInternalIter<'_> { type Item = u16; fn next(&mut self) -> Option { diff --git a/crates/milli/src/search/new/tests/cutoff.rs b/crates/milli/src/search/new/tests/cutoff.rs index 63b67f2e7..f2dfb45d6 100644 --- a/crates/milli/src/search/new/tests/cutoff.rs +++ b/crates/milli/src/search/new/tests/cutoff.rs @@ -5,13 +5,11 @@ use std::time::Duration; -use big_s::S; -use maplit::hashset; use meili_snap::snapshot; use crate::index::tests::TempIndex; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::{Criterion, Filter, Search, TimeBudget}; +use crate::{Criterion, Filter, FilterableAttributesRule, Search, TimeBudget}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -20,7 +18,7 @@ fn create_index() -> TempIndex { .update_settings(|s| { s.set_primary_key("id".to_owned()); s.set_searchable_fields(vec!["text".to_owned()]); - s.set_filterable_fields(hashset! { S("id") }); + s.set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_owned())]); s.set_criteria(vec![Criterion::Words, Criterion::Typo]); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/distinct.rs b/crates/milli/src/search/new/tests/distinct.rs index dd27bfc8a..d3c453957 100644 --- a/crates/milli/src/search/new/tests/distinct.rs +++ b/crates/milli/src/search/new/tests/distinct.rs @@ -19,7 +19,10 @@ use maplit::hashset; use super::collect_field_values; use crate::index::tests::TempIndex; -use crate::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + AscDesc, Criterion, FilterableAttributesRule, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -236,7 +239,7 @@ fn test_distinct_placeholder_no_ranking_rules() { // Set the letter as filterable and unset the distinct attribute. index .update_settings(|s| { - s.set_filterable_fields(hashset! { S("letter") }); + s.set_filterable_fields(vec![FilterableAttributesRule::Field("letter".to_owned())]); s.reset_distinct_field(); }) .unwrap(); diff --git a/crates/milli/src/search/new/tests/exactness.rs b/crates/milli/src/search/new/tests/exactness.rs index c52006e3d..e1d6cc1ca 100644 --- a/crates/milli/src/search/new/tests/exactness.rs +++ b/crates/milli/src/search/new/tests/exactness.rs @@ -14,7 +14,7 @@ This module tests the following properties about the exactness ranking rule: 3. 
those that contain the most exact words from the remaining query - if it is followed by other graph-based ranking rules (`typo`, `proximity`, `attribute`). -Then these rules will only work with + Then these rules will only work with 1. the exact terms selected by `exactness 2. the full query term otherwise */ diff --git a/crates/milli/src/search/new/tests/geo_sort.rs b/crates/milli/src/search/new/tests/geo_sort.rs index 0d65b589a..2eda39ba1 100644 --- a/crates/milli/src/search/new/tests/geo_sort.rs +++ b/crates/milli/src/search/new/tests/geo_sort.rs @@ -6,6 +6,7 @@ use big_s::S; use heed::RoTxn; use maplit::hashset; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::index::tests::TempIndex; use crate::score_details::ScoreDetails; use crate::search::new::tests::collect_field_values; @@ -17,7 +18,7 @@ fn create_index() -> TempIndex { index .update_settings(|s| { s.set_primary_key("id".to_owned()); - s.set_sortable_fields(hashset! { S("_geo") }); + s.set_sortable_fields(hashset! { S(RESERVED_GEO_FIELD_NAME) }); s.set_criteria(vec![Criterion::Words, Criterion::Sort]); }) .unwrap(); @@ -68,12 +69,12 @@ fn test_geo_sort() { index .add_documents(documents!([ - { "id": 2, "_geo": { "lat": 2, "lng": -1 } }, - { "id": 3, "_geo": { "lat": -2, "lng": -2 } }, - { "id": 5, "_geo": { "lat": 6, "lng": -5 } }, - { "id": 4, "_geo": { "lat": 3, "lng": 5 } }, - { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, - { "id": 1, "_geo": { "lat": 1, "lng": 1 } }, + { "id": 2, RESERVED_GEO_FIELD_NAME: { "lat": 2, "lng": -1 } }, + { "id": 3, RESERVED_GEO_FIELD_NAME: { "lat": -2, "lng": -2 } }, + { "id": 5, RESERVED_GEO_FIELD_NAME: { "lat": 6, "lng": -5 } }, + { "id": 4, RESERVED_GEO_FIELD_NAME: { "lat": 3, "lng": 5 } }, + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, RESERVED_GEO_FIELD_NAME: { "lat": 1, "lng": 1 } }, { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 }, ])) .unwrap(); @@ -100,12 +101,12 @@ fn test_geo_sort_around_the_edge_of_the_flat_earth() { index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": 0, "lng": 0 } }, - { "id": 1, "_geo": { "lat": 88, "lng": 0 } }, - { "id": 2, "_geo": { "lat": -89, "lng": 0 } }, + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, - { "id": 3, "_geo": { "lat": 0, "lng": 178 } }, - { "id": 4, "_geo": { "lat": 0, "lng": -179 } }, + { "id": 3, RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, ])) .unwrap(); @@ -177,11 +178,11 @@ fn geo_sort_mixed_with_words() { index .add_documents(documents!([ - { "id": 0, "doggo": "jean", "_geo": { "lat": 0, "lng": 0 } }, - { "id": 1, "doggo": "intel", "_geo": { "lat": 88, "lng": 0 } }, - { "id": 2, "doggo": "jean bob", "_geo": { "lat": -89, "lng": 0 } }, - { "id": 3, "doggo": "jean michel", "_geo": { "lat": 0, "lng": 178 } }, - { "id": 4, "doggo": "bob marley", "_geo": { "lat": 0, "lng": -179 } }, + { "id": 0, "doggo": "jean", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 0 } }, + { "id": 1, "doggo": "intel", RESERVED_GEO_FIELD_NAME: { "lat": 88, "lng": 0 } }, + { "id": 2, "doggo": "jean bob", RESERVED_GEO_FIELD_NAME: { "lat": -89, "lng": 0 } }, + { "id": 3, "doggo": "jean michel", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": 178 } }, + { "id": 4, "doggo": "bob marley", RESERVED_GEO_FIELD_NAME: { "lat": 0, "lng": -179 } }, ])) .unwrap(); diff --git 
a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 79668b34b..4a6cc9b90 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -5,17 +5,20 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use crate::progress::Progress; use crate::update::new::indexer; -use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use crate::update::{IndexerConfig, Settings}; use crate::vector::EmbeddingConfigs; -use crate::{db_snap, Criterion, Index}; +use crate::{db_snap, Criterion, FilterableAttributesRule, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); +use crate::constants::RESERVED_GEO_FIELD_NAME; pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); @@ -23,14 +26,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - S("_geo"), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("tag")), + FilterableAttributesRule::Field(S("asc_desc_rank")), + FilterableAttributesRule::Field(S(RESERVED_GEO_FIELD_NAME)), + FilterableAttributesRule::Field(S("opt1")), + FilterableAttributesRule::Field(S("opt1.opt2")), + FilterableAttributesRule::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! 
{ S("tag"), S("asc_desc_rank"), @@ -53,7 +56,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); file.write_all(CONTENT.as_bytes()).unwrap(); @@ -61,7 +64,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let payload = unsafe { memmap2::Mmap::map(&file).unwrap() }; // index documents - indexer.add_documents(&payload).unwrap(); + indexer.replace_documents(&payload).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, operation_stats, primary_key) = indexer @@ -72,7 +75,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -83,6 +86,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -90,7 +94,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/src/search/new/tests/proximity.rs b/crates/milli/src/search/new/tests/proximity.rs index 2d181a537..97c85a53d 100644 --- a/crates/milli/src/search/new/tests/proximity.rs +++ b/crates/milli/src/search/new/tests/proximity.rs @@ -4,15 +4,14 @@ This module tests the Proximity ranking rule: 1. A proximity of >7 always has the same cost. 2. Phrase terms can be in sprximity to other terms via their start and end words, -but we need to make sure that the phrase exists in the document that meets this -proximity condition. This is especially relevant with split words and synonyms. + but we need to make sure that the phrase exists in the document that meets this + proximity condition. This is especially relevant with split words and synonyms. 3. An ngram has the same sprximity cost as its component words being consecutive. -e.g. `sunflower` equivalent to `sun flower`. + e.g. `sunflower` equivalent to `sun flower`. 4. The prefix databases can be used to find the sprximity between two words, but -they store fewer sprximities than the regular word sprximity DB. - + they store fewer sprximities than the regular word sprximity DB. 
*/ use std::collections::BTreeMap; diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_different_fields.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_ngrams.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap index 73dec5f8b..5ae6fafc5 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_repeated.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap index 2626ee7d4..e55e221a2 100644 --- a/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap +++ b/crates/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_position__attribute_position_simple-2.snap @@ -1,5 +1,5 @@ --- -source: milli/src/search/new/tests/attribute_position.rs +source: crates/milli/src/search/new/tests/attribute_position.rs expression: "format!(\"{document_ids_scores:#?}\")" --- [ diff --git a/crates/milli/src/search/new/tests/typo.rs b/crates/milli/src/search/new/tests/typo.rs index 61d4c4387..1bbe08977 100644 --- a/crates/milli/src/search/new/tests/typo.rs +++ b/crates/milli/src/search/new/tests/typo.rs @@ -11,7 +11,7 @@ This module tests the following properties: 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` 9. 3grams are not typo tolerant (but they can be split into two words) 10. 
The `typo` ranking rule assumes the role of the `words` ranking rule implicitly -if `words` doesn't exist before it. + if `words` doesn't exist before it. 11. The `typo` ranking rule places documents with the same number of typos in the same bucket 12. Prefix tolerance costs nothing according to the typo ranking rule 13. Split words cost 1 typo according to the typo ranking rule diff --git a/crates/milli/src/search/new/tests/words_tms.rs b/crates/milli/src/search/new/tests/words_tms.rs index ee8cfc51b..e058d81ae 100644 --- a/crates/milli/src/search/new/tests/words_tms.rs +++ b/crates/milli/src/search/new/tests/words_tms.rs @@ -2,11 +2,11 @@ This module tests the following properties: 1. The `last` term matching strategy starts removing terms from the query -starting from the end if no more results match it. + starting from the end if no more results match it. 2. Phrases are never deleted by the `last` term matching strategy 3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule 4. The proximity of the first and last word of a phrase to its adjacent terms is taken into -account by the proximity ranking rule. + account by the proximity ranking rule. 5. Unclosed double quotes still make a phrase 6. The `all` term matching strategy does not remove any term from the query 7. The search is capable of returning no results if no documents match the query diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index 90377c09c..a25605cfc 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -32,7 +32,7 @@ impl VectorSort { .index .embedder_category_id .get(ctx.txn, embedder_name)? - .ok_or_else(|| crate::UserError::InvalidEmbedder(embedder_name.to_owned()))?; + .ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?; Ok(Self { query: None, diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 5547d800e..759940f9c 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -65,10 +65,9 @@ impl<'a> Similar<'a> { let universe = universe; let embedder_index = - self.index - .embedder_category_id - .get(self.rtxn, &self.embedder_name)? 
- .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?; + self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else( + || crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()), + )?; let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); let results = reader.nns_by_item( diff --git a/crates/milli/src/snapshot_tests.rs b/crates/milli/src/snapshot_tests.rs index 6635ab2f4..3e58c44d9 100644 --- a/crates/milli/src/snapshot_tests.rs +++ b/crates/milli/src/snapshot_tests.rs @@ -386,7 +386,7 @@ pub fn snap_settings(index: &Index) -> String { write_setting_to_snap!(criteria); write_setting_to_snap!(displayed_fields); write_setting_to_snap!(distinct_field); - write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(filterable_attributes_rules); write_setting_to_snap!(sortable_fields); write_setting_to_snap!(synonyms); write_setting_to_snap!(authorize_typos); diff --git a/crates/milli/src/thread_pool_no_abort.rs b/crates/milli/src/thread_pool_no_abort.rs index 14e5b0491..b57050a63 100644 --- a/crates/milli/src/thread_pool_no_abort.rs +++ b/crates/milli/src/thread_pool_no_abort.rs @@ -1,4 +1,4 @@ -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; use rayon::{ThreadPool, ThreadPoolBuilder}; @@ -9,6 +9,8 @@ use thiserror::Error; #[derive(Debug)] pub struct ThreadPoolNoAbort { thread_pool: ThreadPool, + /// The number of active operations. + active_operations: AtomicUsize, /// Set to true if the thread pool catched a panic. pool_catched_panic: Arc, } @@ -19,7 +21,9 @@ impl ThreadPoolNoAbort { OP: FnOnce() -> R + Send, R: Send, { + self.active_operations.fetch_add(1, Ordering::Relaxed); let output = self.thread_pool.install(op); + self.active_operations.fetch_sub(1, Ordering::Relaxed); // While reseting the pool panic catcher we return an error if we catched one. if self.pool_catched_panic.swap(false, Ordering::SeqCst) { Err(PanicCatched) @@ -31,6 +35,11 @@ impl ThreadPoolNoAbort { pub fn current_num_threads(&self) -> usize { self.thread_pool.current_num_threads() } + + /// The number of active operations. 
+ pub fn active_operations(&self) -> usize { + self.active_operations.load(Ordering::Relaxed) + } } #[derive(Error, Debug)] @@ -64,6 +73,10 @@ impl ThreadPoolNoAbortBuilder { let catched_panic = pool_catched_panic.clone(); move |_result| catched_panic.store(true, Ordering::SeqCst) }); - Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic }) + Ok(ThreadPoolNoAbort { + thread_pool: self.0.build()?, + active_operations: AtomicUsize::new(0), + pool_catched_panic, + }) } } diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index 6c4efb859..b0ae070de 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -103,6 +103,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { #[cfg(test)] mod tests { use super::*; + use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::index::tests::TempIndex; #[test] @@ -114,7 +115,7 @@ mod tests { .add_documents_using_wtxn(&mut wtxn, documents!([ { "id": 0, "name": "kevin", "age": 20 }, { "id": 1, "name": "kevina" }, - { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } + { "id": 2, "name": "benoit", "country": "France", RESERVED_GEO_FIELD_NAME: { "lng": 42, "lat": 35 } } ])) .unwrap(); diff --git a/crates/milli/src/update/del_add.rs b/crates/milli/src/update/del_add.rs index 97ff86f2a..6825e2bd3 100644 --- a/crates/milli/src/update/del_add.rs +++ b/crates/milli/src/update/del_add.rs @@ -81,6 +81,17 @@ pub enum DelAddOperation { DeletionAndAddition, } +impl DelAddOperation { + /// Merge two DelAddOperation enum variants. + pub fn merge(self, other: Self) -> Self { + match (self, other) { + (Self::Deletion, Self::Deletion) => Self::Deletion, + (Self::Addition, Self::Addition) => Self::Addition, + _ => Self::DeletionAndAddition, + } + } +} + /// Creates a Kv> from two Kv /// /// putting each deletion obkv's keys under an DelAdd::Deletion diff --git a/crates/milli/src/update/facet/bulk.rs b/crates/milli/src/update/facet/bulk.rs index 1ab8740ed..5de0ff4ed 100644 --- a/crates/milli/src/update/facet/bulk.rs +++ b/crates/milli/src/update/facet/bulk.rs @@ -6,7 +6,7 @@ use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use super::{clear_facet_levels, FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, @@ -97,9 +97,7 @@ pub(crate) struct FacetsUpdateBulkInner { impl FacetsUpdateBulkInner { pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> { self.update_level0(wtxn)?; - for &field_id in field_ids.iter() { - self.clear_levels(wtxn, field_id)?; - } + clear_facet_levels(wtxn, &self.db.remap_data_type(), field_ids)?; for &field_id in field_ids.iter() { let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; @@ -114,14 +112,6 @@ impl FacetsUpdateBulkInner { Ok(()) } - fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> { - let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; - let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; - let range = left..=right; - self.db.delete_range(wtxn, &range).map(drop)?; - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> { let delta_data = match self.delta_data.take() { Some(x) => x, @@ -365,8 +355,6 @@ 
impl FacetsUpdateBulkInner { mod tests { use std::iter::once; - use big_s::S; - use maplit::hashset; use roaring::RoaringBitmap; use crate::documents::mmap_from_objects; @@ -374,7 +362,7 @@ mod tests { use crate::heed_codec::StrRefCodec; use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; - use crate::{db_snap, milli_snap}; + use crate::{db_snap, milli_snap, FilterableAttributesRule}; #[test] fn insert() { @@ -474,7 +462,8 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id") }); + settings + .set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_string())]); }) .unwrap(); diff --git a/crates/milli/src/update/facet/incremental.rs b/crates/milli/src/update/facet/incremental.rs index a1fa07fe3..5e91daf5a 100644 --- a/crates/milli/src/update/facet/incremental.rs +++ b/crates/milli/src/update/facet/incremental.rs @@ -21,29 +21,30 @@ use crate::{CboRoaringBitmapCodec, Index, Result}; /// Enum used as a return value for the facet incremental indexing. /// /// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have -/// an effect on the number of keys in that level. Therefore, it did not increase the number of children -/// of the parent node. +/// an effect on the number of keys in that level. Therefore, it did not increase the number of children +/// of the parent node. /// /// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted -/// in the addition of a new key in that level, and that therefore the number of children -/// of the parent node should be incremented. +/// in the addition of a new key in that level, and that therefore the number of children +/// of the parent node should be incremented. /// /// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the -/// number of keys in the level. For example, removing a document id from the facet value `3` could -/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted -/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must -/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. +/// number of keys in the level. For example, removing a document id from the facet value `3` could +/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted +/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must +/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. /// /// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the -/// bounds of the keys of the level. For example, removing a document id from the facet value -/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, -/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). -/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust -/// its left bound as well. +/// bounds of the keys of the level. 
For example, removing a document id from the facet value +/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, +/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). +/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust +/// its left bound as well. /// /// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`. -/// This case is reachable when a document id is removed from a sub-level node but is still present in another one. -/// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` but should remain in the group node [1..4] in `level 1`. +/// This case is reachable when a document id is removed from a sub-level node but is still present in another one. +/// For example, removing `2` from a document containing `2` and `3`, the document id will removed form the `level 0` +/// but should remain in the group node [1..4] in `level 1`. enum ModificationResult { InPlace, Expand, @@ -100,8 +101,7 @@ impl FacetsUpdateIncremental { let key = FacetGroupKeyCodec::::bytes_decode(key) .map_err(heed::Error::Encoding)?; - if facet_level_may_be_updated - && current_field_id.map_or(false, |fid| fid != key.field_id) + if facet_level_may_be_updated && current_field_id.is_some_and(|fid| fid != key.field_id) { // Only add or remove a level after making all the field modifications. self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?; @@ -529,8 +529,8 @@ impl FacetsUpdateIncrementalInner { add_docids: Option<&RoaringBitmap>, del_docids: Option<&RoaringBitmap>, ) -> Result { - if add_docids.map_or(true, RoaringBitmap::is_empty) - && del_docids.map_or(true, RoaringBitmap::is_empty) + if add_docids.is_none_or(RoaringBitmap::is_empty) + && del_docids.is_none_or(RoaringBitmap::is_empty) { return Ok(false); } @@ -669,7 +669,7 @@ impl FacetsUpdateIncrementalInner { } } -impl<'a> FacetGroupKey<&'a [u8]> { +impl FacetGroupKey<&[u8]> { pub fn into_owned(self) -> FacetGroupKey> { FacetGroupKey { field_id: self.field_id, @@ -1059,208 +1059,3 @@ mod tests { milli_snap!(format!("{index}"), "after_delete"); } } - -// fuzz tests -#[cfg(all(test, fuzzing))] -/** -Fuzz test for the incremental indxer. - -The fuzz test uses fuzzcheck, a coverage-guided fuzzer. -See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org -for more information. - -It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with: -```sh -cargo install cargo-fuzzcheck -``` -To start the fuzz test, run (from the base folder or from milli/): -```sh -cargo fuzzcheck update::facet::incremental::fuzz::fuzz -``` -and wait a couple minutes to make sure the code was thoroughly tested, then -hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz. - -To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file: -```toml -[build] -rustflags = ["--cfg", "fuzzing"] -``` - -The fuzz test generates sequences of additions and deletions to the facet database and -ensures that: -1. its structure is still internally valid -2. 
its content is the same as a trivially correct implementation of the same database -*/ -mod fuzz { - use std::collections::{BTreeMap, HashMap}; - use std::iter::FromIterator; - use std::rc::Rc; - - use fuzzcheck::mutators::integer::U8Mutator; - use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; - use fuzzcheck::mutators::vector::VecMutator; - use fuzzcheck::DefaultMutator; - use roaring::RoaringBitmap; - use tempfile::TempDir; - - use super::*; - use crate::update::facet::test_helpers::FacetIndex; - #[derive(Default)] - pub struct TrivialDatabase { - pub elements: BTreeMap>, - } - impl TrivialDatabase - where - T: Ord + Clone + Eq + std::fmt::Debug, - { - #[no_coverage] - pub fn insert(&mut self, field_id: u16, new_key: &T, new_values: &RoaringBitmap) { - if new_values.is_empty() { - return; - } - let values_field_id = self.elements.entry(field_id).or_default(); - let values = values_field_id.entry(new_key.clone()).or_default(); - *values |= new_values; - } - #[no_coverage] - pub fn delete(&mut self, field_id: u16, key: &T, values_to_remove: &RoaringBitmap) { - if let Some(values_field_id) = self.elements.get_mut(&field_id) { - if let Some(values) = values_field_id.get_mut(&key) { - *values -= values_to_remove; - if values.is_empty() { - values_field_id.remove(&key); - } - } - if values_field_id.is_empty() { - self.elements.remove(&field_id); - } - } - } - } - #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] - struct Operation { - #[field_mutator(VecMutator = { VecMutator::new(u8::default_mutator(), 0 ..= 5) })] - key: Vec, - #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] - group_size: u8, - #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] - max_group_size: u8, - #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] - min_level_size: u8, - #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] - field_id: u16, - kind: OperationKind, - } - #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] - enum OperationKind { - Insert( - #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] - Vec, - ), - Delete( - #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] - Vec, - ), - } - - #[no_coverage] - fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { - let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten - let mut txn = index.env.write_txn().unwrap(); - - let mut trivial_db = TrivialDatabase::>::default(); - let mut value_to_keys = HashMap::>>::new(); - for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in - operations - { - index.set_group_size(*group_size); - index.set_max_group_size(*max_group_size); - index.set_min_level_size(*min_level_size); - match kind { - OperationKind::Insert(values) => { - let mut bitmap = RoaringBitmap::new(); - for value in values { - bitmap.insert(*value as u32); - value_to_keys.entry(*value).or_default().push(key.clone()); - } - index.insert(&mut txn, *field_id, &key.as_slice(), &bitmap); - trivial_db.insert(*field_id, &key, &bitmap); - } - OperationKind::Delete(values) => { - let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); - let mut values_per_key = HashMap::new(); - - for value in values { - if let Some(keys) = value_to_keys.get(&(value as u8)) { - for key in keys { - let values: &mut 
RoaringBitmap = - values_per_key.entry(key).or_default(); - values.insert(value); - } - } - } - for (key, values) in values_per_key { - index.delete(&mut txn, *field_id, &key.as_slice(), &values); - trivial_db.delete(*field_id, &key, &values); - } - } - } - } - - for (field_id, values_field_id) in trivial_db.elements.iter() { - let level0iter = index - .content - .as_polymorph() - .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&mut txn, &field_id.to_be_bytes()) - .unwrap(); - - for ((key, values), group) in values_field_id.iter().zip(level0iter) { - let (group_key, group_values) = group.unwrap(); - let group_key = - FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); - assert_eq!(key, &group_key.left_bound); - assert_eq!(values, &group_values.bitmap); - } - } - - for (field_id, values_field_id) in trivial_db.elements.iter() { - let level0iter = index - .content - .as_polymorph() - .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) - .unwrap(); - - for ((key, values), group) in values_field_id.iter().zip(level0iter) { - let (group_key, group_values) = group.unwrap(); - let group_key = - FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); - assert_eq!(key, &group_key.left_bound); - assert_eq!(values, &group_values.bitmap); - } - index.verify_structure_validity(&txn, *field_id); - } - txn.abort().unwrap(); - } - - #[test] - #[no_coverage] - fn fuzz() { - let tempdir = Rc::new(TempDir::new().unwrap()); - let tempdir_cloned = tempdir.clone(); - let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { - compare_with_trivial_database(tempdir_cloned.clone(), operations) - }) - .default_mutator() - .serde_serializer() - .default_sensor_and_pool_with_custom_filter(|file, function| { - file == std::path::Path::new("milli/src/update/facet/incremental.rs") - && !function.contains("serde") - && !function.contains("tests::") - && !function.contains("fuzz::") - && !function.contains("display_bitmap") - }) - .arguments_from_cargo_fuzzcheck() - .launch(); - assert!(!result.found_test_failure); - } -} diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index 2e592519b..c40916670 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -79,22 +79,30 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use std::collections::BTreeSet; use std::fs::File; use std::io::BufReader; +use std::ops::Bound; use grenad::Merger; use heed::types::{Bytes, DecodeIgnore}; +use heed::BytesDecode as _; +use roaring::RoaringBitmap; use time::OffsetDateTime; use tracing::debug; use self::incremental::FacetsUpdateIncremental; +use super::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, +}; use crate::heed_codec::BytesRefCodec; +use crate::search::facet::get_highest_level; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::{try_split_array_at, FieldId, Index, Result}; pub mod bulk; pub mod incremental; +pub mod new_incremental; /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. 
/// @@ -140,7 +148,11 @@ impl<'i> FacetsUpdate<'i> { } } - pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> { + pub fn execute( + self, + wtxn: &mut heed::RwTxn<'_>, + new_settings: &InnerIndexSettings, + ) -> Result<()> { if self.data_size == 0 { return Ok(()); } @@ -149,8 +161,7 @@ impl<'i> FacetsUpdate<'i> { // See self::comparison_bench::benchmark_facet_indexing if self.data_size >= (self.database.len(wtxn)? / 500) { - let field_ids = - self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + let field_ids = facet_levels_field_ids(new_settings); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, @@ -172,6 +183,14 @@ impl<'i> FacetsUpdate<'i> { incremental_update.execute(wtxn)?; } + if !self.index.facet_search(wtxn)? { + // If facet search is disabled, we don't need to compute facet search databases. + // We clear the facet search databases. + self.index.facet_id_string_fst.clear(wtxn)?; + self.index.facet_id_normalized_string_strings.clear(wtxn)?; + return Ok(()); + } + match self.normalized_delta_data { Some(data) => index_facet_search(wtxn, data, self.index), None => Ok(()), @@ -276,6 +295,53 @@ fn index_facet_search( Ok(()) } +/// Clear all the levels greater than 0 for given field ids. +pub fn clear_facet_levels<'a, I>( + wtxn: &mut heed::RwTxn<'_>, + db: &heed::Database, DecodeIgnore>, + field_ids: I, +) -> Result<()> +where + I: IntoIterator, +{ + for field_id in field_ids { + let field_id = *field_id; + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + db.delete_range(wtxn, &range).map(drop)?; + } + Ok(()) +} + +pub fn clear_facet_levels_based_on_settings_diff( + wtxn: &mut heed::RwTxn<'_>, + index: &Index, + settings_diff: &InnerIndexSettingsDiff, +) -> Result<()> { + let new_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.new); + let old_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.old); + + let field_ids_to_clear: Vec<_> = old_field_ids.difference(&new_field_ids).copied().collect(); + clear_facet_levels(wtxn, &index.facet_id_string_docids.remap_types(), &field_ids_to_clear)?; + clear_facet_levels(wtxn, &index.facet_id_f64_docids.remap_types(), &field_ids_to_clear)?; + Ok(()) +} + +fn facet_levels_field_ids(settings: &InnerIndexSettings) -> B +where + B: FromIterator, +{ + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| { + metadata.require_facet_level_database(&settings.filterable_attributes_rules) + }) + .map(|(id, _)| id) + .collect() +} + #[cfg(test)] pub(crate) mod test_helpers { use std::cell::Cell; @@ -286,7 +352,7 @@ pub(crate) mod test_helpers { use grenad::MergerBuilder; use heed::types::Bytes; - use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn, WithoutTls}; use roaring::RoaringBitmap; use super::bulk::FacetsUpdateBulkInner; @@ -324,7 +390,7 @@ pub(crate) mod test_helpers { for<'a> BoundCodec: BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { - pub env: Env, + pub env: Env, pub content: heed::Database, FacetGroupValueCodec>, pub group_size: Cell, pub min_level_size: Cell, @@ -338,35 +404,6 @@ pub(crate) mod test_helpers { for<'a> BoundCodec: BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { - #[cfg(all(test, fuzzing))] - pub fn open_from_tempdir( - tempdir: Rc, - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex { - let group_size = 
std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16 - let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16 - let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 - - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 1000); - unsafe { - options.flag(heed::flags::Flags::MdbAlwaysFreePages); - } - let env = options.open(tempdir.path()).unwrap(); - let content = env.open_database(None).unwrap().unwrap(); - - FacetIndex { - content, - group_size: Cell::new(group_size), - max_group_size: Cell::new(max_group_size), - min_level_size: Cell::new(min_level_size), - _tempdir: tempdir, - env, - _phantom: PhantomData, - } - } pub fn new( group_size: u8, max_group_size: u8, @@ -375,7 +412,8 @@ pub(crate) mod test_helpers { let group_size = group_size.clamp(2, 127); let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - let mut options = heed::EnvOpenOptions::new(); + let options = heed::EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); let env = unsafe { options.open(tempdir.path()) }.unwrap(); @@ -394,26 +432,6 @@ pub(crate) mod test_helpers { } } - #[cfg(all(test, fuzzing))] - pub fn set_group_size(&self, group_size: u8) { - // 2 <= x <= 64 - self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); - } - #[cfg(all(test, fuzzing))] - pub fn set_max_group_size(&self, max_group_size: u8) { - // 2*group_size <= x <= 128 - let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); - self.max_group_size.set(max_group_size); - if self.group_size.get() < max_group_size / 2 { - self.group_size.set(max_group_size / 2); - } - } - #[cfg(all(test, fuzzing))] - pub fn set_min_level_size(&self, min_level_size: u8) { - // 1 <= x <= inf - self.min_level_size.set(std::cmp::max(1, min_level_size)); - } - pub fn insert<'a>( &self, wtxn: &'a mut RwTxn<'_>, @@ -638,3 +656,194 @@ mod comparison_bench { } } } + +/// Run sanity checks on the specified fid tree +/// +/// 1. No "orphan" child value, any child value has a parent +/// 2. Any docid in the child appears in the parent +/// 3. No docid in the parent is missing from all its children +/// 4. no group is bigger than max_group_size +/// 5. Less than 50% of groups are bigger than group_size +/// 6. group size matches the number of children +/// 7. 
max_level is < 255 +pub(crate) fn sanity_checks( + index: &Index, + rtxn: &heed::RoTxn, + field_id: FieldId, + facet_type: FacetType, + group_size: usize, + _min_level_size: usize, // might add a check on level size later + max_group_size: usize, +) -> Result<()> { + tracing::info!(%field_id, ?facet_type, "performing sanity checks"); + let database = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + + let leaf_prefix: FacetGroupKey<&[u8]> = FacetGroupKey { field_id, level: 0, left_bound: &[] }; + + let leaf_it = database.prefix_iter(rtxn, &leaf_prefix)?; + + let max_level = get_highest_level(rtxn, database, field_id)?; + if max_level == u8::MAX { + panic!("max_level == 255"); + } + + for leaf in leaf_it { + let (leaf_facet_value, leaf_docids) = leaf?; + let mut current_level = 0; + + let mut current_parent_facet_value: Option> = None; + let mut current_parent_docids: Option = None; + loop { + current_level += 1; + if current_level >= max_level { + break; + } + let parent_key_right_bound = FacetGroupKey { + field_id, + level: current_level, + left_bound: leaf_facet_value.left_bound, + }; + let (parent_facet_value, parent_docids) = database + .get_lower_than_or_equal_to(rtxn, &parent_key_right_bound)? + .expect("no parent found"); + if parent_facet_value.level != current_level { + panic!( + "wrong parent level, found_level={}, expected_level={}", + parent_facet_value.level, current_level + ); + } + if parent_facet_value.field_id != field_id { + panic!("wrong parent fid"); + } + if parent_facet_value.left_bound > leaf_facet_value.left_bound { + panic!("wrong parent left bound"); + } + + if !leaf_docids.bitmap.is_subset(&parent_docids.bitmap) { + panic!( + "missing docids from leaf in parent, current_level={}, parent={}, child={}, missing={missing:?}, child_len={}, child={:?}", + current_level, + facet_to_string(parent_facet_value.left_bound, facet_type), + facet_to_string(leaf_facet_value.left_bound, facet_type), + leaf_docids.bitmap.len(), + leaf_docids.bitmap.clone(), + missing=leaf_docids.bitmap - parent_docids.bitmap, + ) + } + + if let Some(current_parent_facet_value) = current_parent_facet_value { + if current_parent_facet_value.field_id != parent_facet_value.field_id { + panic!("wrong parent parent fid"); + } + if current_parent_facet_value.level + 1 != parent_facet_value.level { + panic!("wrong parent parent level"); + } + if current_parent_facet_value.left_bound < parent_facet_value.left_bound { + panic!("wrong parent parent left bound"); + } + } + + if let Some(current_parent_docids) = current_parent_docids { + if !current_parent_docids.bitmap.is_subset(&parent_docids.bitmap) { + panic!("missing docids from intermediate node in parent, parent_level={}, parent={}, intermediate={}, missing={missing:?}, intermediate={:?}", + parent_facet_value.level, + facet_to_string(parent_facet_value.left_bound, facet_type), + facet_to_string(current_parent_facet_value.unwrap().left_bound, facet_type), + current_parent_docids.bitmap.clone(), + missing=current_parent_docids.bitmap - parent_docids.bitmap, + ); + } + } + + current_parent_facet_value = Some(parent_facet_value); + current_parent_docids = Some(parent_docids); + } + } + tracing::info!(%field_id, ?facet_type, "checked all leaves"); + + let mut current_level = max_level; + let mut greater_than_group = 0usize; + let mut total = 0usize; + loop { + if current_level == 0 { + break; + } + let child_level = 
current_level - 1; + tracing::info!(%field_id, ?facet_type, %current_level, "checked groups for level"); + let level_groups_prefix: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id, level: current_level, left_bound: &[] }; + let mut level_groups_it = database.prefix_iter(rtxn, &level_groups_prefix)?.peekable(); + + 'group_it: loop { + let Some(group) = level_groups_it.next() else { break 'group_it }; + + let (group_facet_value, group_docids) = group?; + let child_left_bound = group_facet_value.left_bound.to_owned(); + let mut expected_docids = RoaringBitmap::new(); + let mut expected_size = 0usize; + let right_bound = level_groups_it + .peek() + .and_then(|res| res.as_ref().ok()) + .map(|(key, _)| key.left_bound); + let child_left_bound = FacetGroupKey { + field_id, + level: child_level, + left_bound: child_left_bound.as_slice(), + }; + let child_left_bound = Bound::Included(&child_left_bound); + let child_right_bound; + let child_right_bound = if let Some(right_bound) = right_bound { + child_right_bound = + FacetGroupKey { field_id, level: child_level, left_bound: right_bound }; + Bound::Excluded(&child_right_bound) + } else { + Bound::Unbounded + }; + let children = database.range(rtxn, &(child_left_bound, child_right_bound))?; + for child in children { + let (child_facet_value, child_docids) = child?; + if child_facet_value.field_id != field_id { + break; + } + if child_facet_value.level != child_level { + break; + } + expected_size += 1; + expected_docids |= &child_docids.bitmap; + } + assert_eq!(expected_size, group_docids.size as usize); + assert!(expected_size <= max_group_size); + assert_eq!(expected_docids, group_docids.bitmap); + total += 1; + if expected_size > group_size { + greater_than_group += 1; + } + } + + current_level -= 1; + } + if greater_than_group * 2 > total { + panic!("too many groups have a size > group_size"); + } + + tracing::info!("sanity checks OK"); + + Ok(()) +} + +fn facet_to_string(facet_value: &[u8], facet_type: FacetType) -> String { + match facet_type { + FacetType::String => bstr::BStr::new(facet_value).to_string(), + FacetType::Number => match OrderedF64Codec::bytes_decode(facet_value) { + Ok(value) => value.to_string(), + Err(e) => format!("error: {e} (bytes: {facet_value:?}"), + }, + } +} diff --git a/crates/milli/src/update/facet/new_incremental.rs b/crates/milli/src/update/facet/new_incremental.rs new file mode 100644 index 000000000..0890f8593 --- /dev/null +++ b/crates/milli/src/update/facet/new_incremental.rs @@ -0,0 +1,498 @@ +use std::ops::Bound; + +use heed::types::{Bytes, DecodeIgnore}; +use heed::{BytesDecode as _, Database, RwTxn}; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::BytesRefCodec; +use crate::search::facet::get_highest_level; +use crate::update::valid_facet_value; +use crate::{FieldId, Index, Result}; + +pub struct FacetsUpdateIncremental { + inner: FacetsUpdateIncrementalInner, + delta_data: Vec, +} + +struct FacetsUpdateIncrementalInner { + db: Database, FacetGroupValueCodec>, + field_id: FieldId, + group_size: u8, + min_level_size: u8, + max_group_size: u8, +} + +impl FacetsUpdateIncremental { + pub fn new( + index: &Index, + facet_type: FacetType, + field_id: FieldId, + delta_data: Vec, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { + FacetsUpdateIncremental { + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String 
=> index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + field_id, + group_size, + min_level_size, + max_group_size, + }, + + delta_data, + } + } + + #[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")] + pub fn execute(mut self, wtxn: &mut RwTxn) -> Result<()> { + if self.delta_data.is_empty() { + return Ok(()); + } + self.delta_data.sort_unstable_by( + |FacetFieldIdChange { facet_value: left, .. }, + FacetFieldIdChange { facet_value: right, .. }| { + left.cmp(right) + // sort in **reverse** lexicographic order + .reverse() + }, + ); + + self.inner.find_changed_parents(wtxn, self.delta_data)?; + + self.inner.add_or_delete_level(wtxn) + } +} + +impl FacetsUpdateIncrementalInner { + /// WARNING: `changed_children` must be sorted in **reverse** lexicographic order. + fn find_changed_parents( + &self, + wtxn: &mut RwTxn, + mut changed_children: Vec, + ) -> Result<()> { + let mut changed_parents = vec![]; + for child_level in 0u8..u8::MAX { + // child_level < u8::MAX by construction + let parent_level = child_level + 1; + let parent_level_left_bound: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: self.field_id, level: parent_level, left_bound: &[] }; + + let mut last_parent: Option> = None; + let mut child_it = changed_children + // drain all changed children + .drain(..) + // keep only children whose value is valid in the LMDB sense + .filter(|child| valid_facet_value(&child.facet_value)); + // `while let` rather than `for` because we advance `child_it` inside of the loop + 'current_level: while let Some(child) = child_it.next() { + if let Some(last_parent) = &last_parent { + if &child.facet_value >= last_parent { + self.compute_parent_group(wtxn, child_level, child.facet_value)?; + continue 'current_level; + } + } + + // need to find a new parent + let parent_key_prefix = FacetGroupKey { + field_id: self.field_id, + level: parent_level, + left_bound: &*child.facet_value, + }; + + let parent = self + .db + .remap_data_type::() + .rev_range( + wtxn, + &( + Bound::Excluded(&parent_level_left_bound), + Bound::Included(&parent_key_prefix), + ), + )? + .next(); + + match parent { + Some(Ok((parent_key, _parent_value))) => { + // found parent, cache it for next keys + last_parent = Some(parent_key.left_bound.to_owned().into_boxed_slice()); + + // add to modified list for parent level + changed_parents.push(FacetFieldIdChange { + facet_value: parent_key.left_bound.to_owned().into_boxed_slice(), + }); + self.compute_parent_group(wtxn, child_level, child.facet_value)?; + } + Some(Err(err)) => return Err(err.into()), + None => { + // no parent for that key + let mut parent_it = self + .db + .remap_data_type::() + .prefix_iter_mut(wtxn, &parent_level_left_bound)?; + match parent_it.next() { + // 1. left of the current left bound, or + Some(Ok((first_key, _first_value))) => { + // make sure we don't spill on the neighboring fid (level also included defensively) + if first_key.field_id != self.field_id + || first_key.level != parent_level + { + // max level reached, exit + drop(parent_it); + self.compute_parent_group( + wtxn, + child_level, + child.facet_value, + )?; + for child in child_it.by_ref() { + self.compute_parent_group( + wtxn, + child_level, + child.facet_value, + )?; + } + return Ok(()); + } + // remove old left bound + unsafe { parent_it.del_current()? 
}; + drop(parent_it); + changed_parents.push(FacetFieldIdChange { + facet_value: child.facet_value.clone(), + }); + self.compute_parent_group(wtxn, child_level, child.facet_value)?; + // pop all elements in order to visit the new left bound + let new_left_bound = + &mut changed_parents.last_mut().unwrap().facet_value; + for child in child_it.by_ref() { + new_left_bound.clone_from(&child.facet_value); + + self.compute_parent_group( + wtxn, + child_level, + child.facet_value, + )?; + } + } + Some(Err(err)) => return Err(err.into()), + // 2. max level reached, exit + None => { + drop(parent_it); + self.compute_parent_group(wtxn, child_level, child.facet_value)?; + for child in child_it.by_ref() { + self.compute_parent_group( + wtxn, + child_level, + child.facet_value, + )?; + } + return Ok(()); + } + } + } + } + } + if changed_parents.is_empty() { + return Ok(()); + } + drop(child_it); + std::mem::swap(&mut changed_children, &mut changed_parents); + // changed_parents is now empty because changed_children was emptied by the drain + } + Ok(()) + } + + fn compute_parent_group( + &self, + wtxn: &mut RwTxn<'_>, + parent_level: u8, + parent_left_bound: Box<[u8]>, + ) -> Result<()> { + let mut range_left_bound: Vec = parent_left_bound.into(); + if parent_level == 0 { + return Ok(()); + } + let child_level = parent_level - 1; + + let parent_key = FacetGroupKey { + field_id: self.field_id, + level: parent_level, + left_bound: &*range_left_bound, + }; + let child_right_bound = self + .db + .remap_data_type::() + .get_greater_than(wtxn, &parent_key)? + .and_then( + |( + FacetGroupKey { + level: right_level, + field_id: right_fid, + left_bound: right_bound, + }, + _, + )| { + if parent_level != right_level || self.field_id != right_fid { + // there was a greater key, but with a greater level or fid, so not a sibling to the parent: ignore + return None; + } + Some(right_bound.to_owned()) + }, + ); + let child_right_bound = match &child_right_bound { + Some(right_bound) => Bound::Excluded(FacetGroupKey { + left_bound: right_bound.as_slice(), + field_id: self.field_id, + level: child_level, + }), + None => Bound::Unbounded, + }; + + let child_left_key = FacetGroupKey { + field_id: self.field_id, + level: child_level, + left_bound: &*range_left_bound, + }; + let mut child_left_bound = Bound::Included(child_left_key); + + loop { + // do a first pass on the range to find the number of children + let child_count = self + .db + .remap_data_type::() + .range(wtxn, &(child_left_bound, child_right_bound))? 
+ .take(self.max_group_size as usize * 2) + .count(); + let mut child_it = self.db.range(wtxn, &(child_left_bound, child_right_bound))?; + + // pick the right group_size depending on the number of children + let group_size = if child_count >= self.max_group_size as usize * 2 { + // more than twice the max_group_size => there will be space for at least 2 groups of max_group_size + self.max_group_size as usize + } else if child_count >= self.group_size as usize { + // size in [group_size, max_group_size * 2[ + // divided by 2 it is between [group_size / 2, max_group_size[ + // this ensures that the tree is balanced + child_count / 2 + } else { + // take everything + child_count + }; + + let res: Result<_> = child_it + .by_ref() + .take(group_size) + // stop if we go to the next level or field id + .take_while(|res| match res { + Ok((child_key, _)) => { + child_key.field_id == self.field_id && child_key.level == child_level + } + Err(_) => true, + }) + .try_fold( + (None, FacetGroupValue { size: 0, bitmap: Default::default() }), + |(bounds, mut group_value), child_res| { + let (child_key, child_value) = child_res?; + let bounds = match bounds { + Some((left_bound, _)) => Some((left_bound, child_key.left_bound)), + None => Some((child_key.left_bound, child_key.left_bound)), + }; + // max_group_size <= u8::MAX + group_value.size += 1; + group_value.bitmap |= &child_value.bitmap; + Ok((bounds, group_value)) + }, + ); + + let (bounds, group_value) = res?; + + let Some((group_left_bound, right_bound)) = bounds else { + let update_key = FacetGroupKey { + field_id: self.field_id, + level: parent_level, + left_bound: &*range_left_bound, + }; + drop(child_it); + if let Bound::Included(_) = child_left_bound { + self.db.delete(wtxn, &update_key)?; + } + + break; + }; + + drop(child_it); + let current_left_bound = group_left_bound.to_owned(); + + let delete_old_bound = match child_left_bound { + Bound::Included(bound) => { + if bound.left_bound != current_left_bound { + Some(range_left_bound.clone()) + } else { + None + } + } + _ => None, + }; + + range_left_bound.clear(); + range_left_bound.extend_from_slice(right_bound); + let child_left_key = FacetGroupKey { + field_id: self.field_id, + level: child_level, + left_bound: range_left_bound.as_slice(), + }; + child_left_bound = Bound::Excluded(child_left_key); + + if let Some(old_bound) = delete_old_bound { + let update_key = FacetGroupKey { + field_id: self.field_id, + level: parent_level, + left_bound: old_bound.as_slice(), + }; + self.db.delete(wtxn, &update_key)?; + } + + let update_key = FacetGroupKey { + field_id: self.field_id, + level: parent_level, + left_bound: current_left_bound.as_slice(), + }; + if group_value.bitmap.is_empty() { + self.db.delete(wtxn, &update_key)?; + } else { + self.db.put(wtxn, &update_key, &group_value)?; + } + } + + Ok(()) + } + + /// Check whether the highest level has exceeded `min_level_size` * `self.group_size`. + /// If it has, we must build an addition level above it. + /// Then check whether the highest level is under `min_level_size`. + /// If it has, we must remove the complete level. 
+ pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn<'_>) -> Result<()> { + let highest_level = get_highest_level(txn, self.db, self.field_id)?; + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + let size_highest_level = + self.db.remap_types::().prefix_iter(txn, &highest_level_prefix)?.count(); + + if size_highest_level >= self.group_size as usize * self.min_level_size as usize { + self.add_level(txn, highest_level, &highest_level_prefix, size_highest_level) + } else if size_highest_level < self.min_level_size as usize && highest_level != 0 { + self.delete_level(txn, &highest_level_prefix) + } else { + Ok(()) + } + } + + /// Delete a level. + fn delete_level(&self, txn: &mut RwTxn<'_>, highest_level_prefix: &[u8]) -> Result<()> { + let mut to_delete = vec![]; + let mut iter = + self.db.remap_types::().prefix_iter(txn, highest_level_prefix)?; + for el in iter.by_ref() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .map_err(heed::Error::Encoding)? + .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } + + /// Build an additional level for the field id. + fn add_level( + &self, + txn: &mut RwTxn<'_>, + highest_level: u8, + highest_level_prefix: &[u8], + size_highest_level: usize, + ) -> Result<()> { + let mut groups_iter = self + .db + .remap_types::() + .prefix_iter(txn, highest_level_prefix)?; + + let nbr_new_groups = size_highest_level / self.group_size as usize; + let nbr_leftover_elements = size_highest_level % self.group_size as usize; + + let mut to_add = vec![]; + for _ in 0..nbr_new_groups { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..self.group_size { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .map_err(heed::Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id: self.field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: self.group_size, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + // now we add the rest of the level, in case its size is > group_size * min_level_size + // this can indeed happen if the min_level_size parameter changes between two calls to `insert` + if nbr_leftover_elements > 0 { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..nbr_leftover_elements { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .map_err(heed::Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id: self.field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + // Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size` + // when it is created above. 
+ let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + + drop(groups_iter); + for (key, value) in to_add { + self.db.put(txn, &key.as_ref(), &value)?; + } + Ok(()) + } +} + +#[derive(Debug)] +pub struct FacetFieldIdChange { + pub facet_value: Box<[u8]>, +} diff --git a/crates/milli/src/update/index_documents/enrich.rs b/crates/milli/src/update/index_documents/enrich.rs index 85f871830..0aaab70e8 100644 --- a/crates/milli/src/update/index_documents/enrich.rs +++ b/crates/milli/src/update/index_documents/enrich.rs @@ -5,6 +5,7 @@ use std::result::Result as StdResult; use serde::{Deserialize, Serialize}; use serde_json::Value; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::{ DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, @@ -93,13 +94,8 @@ pub fn enrich_documents_batch( // If the settings specifies that a _geo field must be used therefore we must check the // validity of it in all the documents of this batch and this is when we return `Some`. - let geo_field_id = match documents_batch_index.id("_geo") { - Some(geo_field_id) - if index.sortable_fields(rtxn)?.contains("_geo") - || index.filterable_fields(rtxn)?.contains("_geo") => - { - Some(geo_field_id) - } + let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) { + Some(geo_field_id) if index.is_geo_enabled(rtxn)? => Some(geo_field_id), _otherwise => None, }; @@ -119,7 +115,7 @@ pub fn enrich_documents_batch( if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { - return Ok(Err(UserError::from(user_error))); + return Ok(Err(UserError::from(Box::new(user_error)))); } } diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index b1e6f24be..d502e69cc 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -58,9 +58,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let old_dictionary: Option> = settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let del_builder = + let mut del_builder = tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); - let del_tokenizer = del_builder.into_tokenizer(); + let del_tokenizer = del_builder.build(); let new_stop_words = settings_diff.new.stop_words.as_ref(); let new_separators: Option> = settings_diff @@ -70,9 +70,9 @@ pub fn extract_docid_word_positions( .map(|s| s.iter().map(String::as_str).collect()); let new_dictionary: Option> = settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let add_builder = + let mut add_builder = tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); - let add_tokenizer = add_builder.into_tokenizer(); + let add_tokenizer = add_builder.build(); // iterate over documents. 
let mut cursor = obkv_documents.into_cursor()?; @@ -150,9 +150,14 @@ fn searchable_fields_changed( obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, ) -> bool { - let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.contains(&field_id) { + let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; + if metadata.is_searchable() { let del_add = KvReaderDelAdd::from_slice(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -200,8 +205,14 @@ fn tokens_from_document<'a>( buffers.obkv_buffer.clear(); let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; // if field is searchable. - if settings.searchable_fields_ids.contains(&field_id) { + if metadata.is_searchable() { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) { // parse json. @@ -216,7 +227,7 @@ fn tokens_from_document<'a>( buffers.field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { // create an iterator of token with their positions. - let locales = settings.localized_searchable_fields_ids.locales(field_id); + let locales = metadata.locales(&settings.localized_attributes_rules); let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index e0d7e1386..5b7639e59 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,12 +12,11 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; -use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, }; -use crate::update::settings::InnerIndexSettingsDiff; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. 
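The `extract_docid_word_positions` changes above now resolve each field id through the fields-ids map's metadata before tokenizing it, and skip ids that were never registered (such as `_vectors.manual` sub-fields). A minimal, self-contained sketch of that filtering pattern; `FieldMetadata` and `searchable_fields` are hypothetical stand-ins, not the milli API:

```rust
use std::collections::HashMap;

/// Hypothetical, heavily simplified per-field metadata.
struct FieldMetadata {
    searchable: bool,
}

/// Keep only the fields whose metadata marks them as searchable; field ids with no
/// metadata at all (e.g. unregistered `_vectors.*` sub-fields) are silently skipped.
fn searchable_fields<'a>(
    fields: &[(u16, &'a str)],
    metadata: &HashMap<u16, FieldMetadata>,
) -> Vec<(u16, &'a str)> {
    fields
        .iter()
        .filter(|(id, _)| metadata.get(id).map(|m| m.searchable).unwrap_or(false))
        .copied()
        .collect()
}

fn main() {
    let metadata = HashMap::from([
        (0, FieldMetadata { searchable: true }),
        (1, FieldMetadata { searchable: false }),
    ]);
    // field id 2 has no metadata and is skipped rather than treated as an error
    let fields = [(0, "title"), (1, "price"), (2, "[1.0, 2.0, 3.0]")];
    assert_eq!(searchable_fields(&fields, &metadata), vec![(0, "title")]);
}
```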
@@ -33,11 +32,10 @@ pub fn extract_facet_string_docids( if settings_diff.settings_update_only() { extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { - let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, - localized_field_ids, + &settings_diff.new, ) } } @@ -50,7 +48,7 @@ pub fn extract_facet_string_docids( fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, - localized_field_ids: &LocalizedFieldIds, + settings: &InnerIndexSettings, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -89,6 +87,14 @@ fn extract_facet_string_docids_document_update( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + unreachable!("metadata not found for field_id: {}", field_id) + }; + + if !metadata.is_faceted(&settings.filterable_attributes_rules) { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -96,8 +102,10 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { - let locales = localized_field_ids.locales(field_id); + let features = + metadata.filterable_attributes_features(&settings.filterable_attributes_rules); + if features.is_facet_searchable() && settings.facet_search { + let locales = metadata.locales(&settings.localized_attributes_rules); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); let set = BTreeSet::from_iter(std::iter::once(normalized_value)); @@ -175,12 +183,21 @@ fn extract_facet_string_docids_settings( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else { + unreachable!("old metadata not found for field_id: {}", field_id) + }; + let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + unreachable!("new metadata not found for field_id: {}", field_id) + }; + + let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules); + let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules); let are_same_locales = old_locales == new_locales; + let reindex_facet_search = + settings_diff.new.facet_search && !settings_diff.old.facet_search; - if is_same_value && are_same_locales { + if is_same_value && are_same_locales && !reindex_facet_search { continue; } @@ -191,18 +208,33 @@ fn extract_facet_string_docids_settings( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - { - let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); - let new_hyper_normalized_value = if are_same_locales { - &old_hyper_normalized_value + if settings_diff.new.facet_search { + let new_filterable_features = new_metadata + .filterable_attributes_features(&settings_diff.new.filterable_attributes_rules); + let 
new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); + let old_hyper_normalized_value; + let old_filterable_features = old_metadata + .filterable_attributes_features(&settings_diff.old.filterable_attributes_rules); + let old_hyper_normalized_value = if !settings_diff.old.facet_search + || deladd_reader.get(DelAdd::Deletion).is_none() + || !old_filterable_features.is_facet_searchable() + { + // if the facet search is disabled in the old settings or if no facet string is deleted, + // we don't need to normalize the facet string. + None + } else if are_same_locales { + Some(&new_hyper_normalized_value) } else { - &normalize_facet_string(normalized_value, new_locales) + old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); + Some(&old_hyper_normalized_value) }; let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { + if old_hyper_normalized_value == Some(&new_hyper_normalized_value) + && new_filterable_features.is_facet_searchable() + { // nothing to do if we delete and re-add the value. if is_same_value { continue; @@ -222,7 +254,7 @@ fn extract_facet_string_docids_settings( } else { // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different. // deletion - if deladd_reader.get(DelAdd::Deletion).is_some() { + if let Some(old_hyper_normalized_value) = old_hyper_normalized_value { // insert old value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); @@ -236,7 +268,9 @@ fn extract_facet_string_docids_settings( } // addition - if deladd_reader.get(DelAdd::Addition).is_some() { + if new_filterable_features.is_facet_searchable() + && deladd_reader.get(DelAdd::Addition).is_some() + { // insert new value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 047669521..d259ce34f 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -76,11 +76,11 @@ pub fn extract_fid_docid_facet_values( let mut strings_key_buffer = Vec::new(); let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion); let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition); - if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { + if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::from_slice(value); @@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values( (field_id, None, add_value) } EitherOrBoth::Both(&field_id, _) => { - // during settings update, recompute the changing settings only. - if settings_diff.settings_update_only { + // during settings update, recompute the changing settings only unless a global change is detected. 
+ if settings_diff.settings_update_only + && !settings_diff.global_facet_settings_changed() + { continue; } @@ -158,11 +160,11 @@ pub fn extract_fid_docid_facet_values( let del_geo_support = settings_diff .old .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + .is_some_and(|(lat, lng)| field_id == lat || field_id == lng); let add_geo_support = settings_diff .new .geo_fields_ids - .map_or(false, |(lat, lng)| field_id == lat || field_id == lng); + .is_some_and(|(lat, lng)| field_id == lat || field_id == lng); let del_filterable_values = del_value.map(|value| extract_facet_values(&value, del_geo_support)); let add_filterable_values = diff --git a/crates/milli/src/update/index_documents/extract/extract_geo_points.rs b/crates/milli/src/update/index_documents/extract/extract_geo_points.rs index 84f5e556b..fb2ea9d77 100644 --- a/crates/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -80,22 +80,28 @@ fn extract_lat_lng( let (lat, lng) = match (lat, lng) { (Some(lat), Some(lng)) => (lat, lng), (Some(_), None) => { - return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) + return Err( + Box::new(GeoError::MissingLatitude { document_id: document_id() }).into() + ) } (None, Some(_)) => { - return Err(GeoError::MissingLongitude { document_id: document_id() }.into()) + return Err( + Box::new(GeoError::MissingLongitude { document_id: document_id() }).into() + ) } (None, None) => return Ok(None), }; let lat = extract_finite_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) - .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat }) + .map_err(Box::new)?; let lng = extract_finite_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) - .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng }) + .map_err(Box::new)?; Ok(Some([lat, lng])) } None => Ok(None), diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 7b5bf3f40..cb8c121ce 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -13,13 +13,15 @@ use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::error::FaultSource; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; -use crate::prompt::{FieldsIdsMapWithMetadata, Prompt}; +use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; -use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState}; use crate::vector::settings::ReindexAction; use crate::vector::{Embedder, Embedding}; use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort}; @@ -189,12 +191,8 @@ pub fn 
extract_vector_points( let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; - let old_fields_ids_map = - FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids); let new_fields_ids_map = &settings_diff.new.fields_ids_map; - let new_fields_ids_map = - FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids); // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); @@ -382,7 +380,7 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, &new_fields_ids_map)? + regenerate_prompt(obkv, prompt, new_fields_ids_map)? } }, // prompt regeneration is only triggered for existing embedders @@ -399,7 +397,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), )? } else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -415,7 +413,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), document_id, embedder_name, embedder_is_manual, @@ -485,10 +483,7 @@ fn extract_vector_document_diff( prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, embedder_name: &str, embedder_is_manual: bool, @@ -610,10 +605,7 @@ fn extract_vector_document_diff( fn regenerate_if_prompt_changed( obkv: &obkv::KvReader, (old_prompt, new_prompt): (&Prompt, &Prompt), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), ) -> Result { let old_prompt = old_prompt .render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) @@ -629,7 +621,7 @@ fn regenerate_if_prompt_changed( fn regenerate_prompt( obkv: &obkv::KvReader, prompt: &Prompt, - new_fields_ids_map: &FieldsIdsMapWithMetadata, + new_fields_ids_map: &FieldIdMapWithMetadata, ) -> Result { let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; @@ -794,7 +786,7 @@ fn embed_chunks( unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { - match embedder.embed_chunks(text_chunks, request_threads) { + match embedder.embed_index(text_chunks, request_threads) { Ok(chunks) => Ok(chunks), Err(error) => { if let FaultSource::Bug = error.fault { diff --git a/crates/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 6194da23d..bd8444fd1 100644 --- a/crates/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -69,7 +69,7 @@ pub fn extract_word_pair_proximity_docids( let document_id = u32::from_be_bytes(document_id_bytes); // if we change document, we fill the sorter - if 
current_document_id.map_or(false, |id| id != document_id) { + if current_document_id.is_some_and(|id| id != document_id) { // FIXME: span inside of a hot loop might degrade performance and create big reports let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter"); let _entered = span.enter(); @@ -96,7 +96,7 @@ pub fn extract_word_pair_proximity_docids( if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) { for (position, word) in KvReaderU16::from_slice(deletion).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. - while del_word_positions.front().map_or(false, |(_w, p)| { + while del_word_positions.front().is_some_and(|(_w, p)| { index_proximity(*p as u32, position as u32) >= MAX_DISTANCE }) { word_positions_into_word_pair_proximity( @@ -129,7 +129,7 @@ pub fn extract_word_pair_proximity_docids( if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) { for (position, word) in KvReaderU16::from_slice(addition).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. - while add_word_positions.front().map_or(false, |(_w, p)| { + while add_word_positions.front().is_some_and(|(_w, p)| { index_proximity(*p as u32, position as u32) >= MAX_DISTANCE }) { word_positions_into_word_pair_proximity( diff --git a/crates/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/crates/milli/src/update/index_documents/extract/extract_word_position_docids.rs index f870fbe1b..87cced2c5 100644 --- a/crates/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -46,7 +46,7 @@ pub fn extract_word_position_docids( .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); - if current_document_id.map_or(false, |id| document_id != id) { + if current_document_id.is_some_and(|id| document_id != id) { words_position_into_sorter( current_document_id.unwrap(), &mut key_buffer, diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index cab84400c..8cd664a2f 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -281,7 +281,7 @@ fn send_original_documents_data( }; if !(remove_vectors.is_empty() && manual_vectors.is_empty() - && embeddings.as_ref().map_or(true, |e| e.is_empty())) + && embeddings.as_ref().is_none_or(|e| e.is_empty())) { let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints { remove_vectors, diff --git a/crates/milli/src/update/index_documents/helpers/mod.rs b/crates/milli/src/update/index_documents/helpers/mod.rs index c188e324d..5dec54ffc 100644 --- a/crates/milli/src/update/index_documents/helpers/mod.rs +++ b/crates/milli/src/update/index_documents/helpers/mod.rs @@ -10,10 +10,14 @@ use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::*; pub use merge_functions::*; -use crate::MAX_WORD_LENGTH; +use crate::MAX_LMDB_KEY_LENGTH; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { - key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() + key.as_ref().len() <= MAX_LMDB_KEY_LENGTH - 3 && !key.as_ref().is_empty() +} + +pub fn valid_facet_value(facet_value: impl AsRef<[u8]>) -> bool { + facet_value.as_ref().len() <= MAX_LMDB_KEY_LENGTH - 3 && 
!facet_value.as_ref().is_empty() } /// Divides one slice into two at an index, returns `None` if mid is out of bounds. diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index baecbdcf0..2bdc94f05 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -26,9 +26,12 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; +use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; +use crate::index::{PrefixSearch, PrefixSettings}; +use crate::progress::Progress; use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ @@ -82,8 +85,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub words_prefix_threshold: Option, - pub max_prefix_length: Option, pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, @@ -216,9 +217,8 @@ where flattened_documents, } = output; - // update the internal facet and searchable list, + // update the searchable list, // because they might have changed due to the nested documents flattening. - settings_diff.new.recompute_facets(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); @@ -308,6 +308,7 @@ where let current_span = tracing::Span::current(); // Run extraction pipeline in parallel. + let mut modified_docids = RoaringBitmap::new(); pool.install(|| { let settings_diff_cloned = settings_diff.clone(); rayon::spawn(move || { @@ -368,7 +369,7 @@ where Err(status) => { if let Some(typed_chunks) = chunk_accumulator.pop_longest() { let (docids, is_merged_database) = - write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?; + write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); @@ -465,9 +466,18 @@ where } } + // If the settings are only being updated, we may have to clear some of the facet levels. + if settings_diff.settings_update_only() { + clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?; + } + Ok(()) }).map_err(InternalError::from)??; + if !settings_diff.settings_update_only { + // Update the stats of the documents database when there is a document update. 
+ self.index.update_documents_stats(self.wtxn, modified_docids)?; + } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -502,16 +512,22 @@ where InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); - let was_quantized = settings_diff - .old - .embedding_configs - .get(&embedder_name) - .map_or(false, |conf| conf.2); - let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized); + let was_quantized = + settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); - writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?; + writer.build_and_quantize( + wtxn, + // In the settings we don't have any progress to share + &Progress::default(), + &mut rng, + dimension, + is_quantizing, + self.indexer_config.max_memory, + cancel, + )?; Result::Ok(()) }) .map_err(InternalError::from)??; @@ -565,14 +581,32 @@ where self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; // Run the words prefixes update operation. - let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); - if let Some(value) = self.config.words_prefix_threshold { - builder.threshold(value); + let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = + self.index.prefix_settings(self.wtxn)?; + + // If the prefix search is enabled at indexing time, we compute the prefixes. + if compute_prefixes == PrefixSearch::IndexingTime { + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); + builder.threshold(prefix_count_threshold); + builder.max_prefix_length(max_prefix_length); + builder.execute()?; + } else { + // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst. + // And all the associated docids databases. 
+ self.index.delete_words_prefixes_fst(self.wtxn)?; + self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.exact_word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_position_docids.clear(self.wtxn)?; + self.index.word_prefix_fid_docids.clear(self.wtxn)?; + + databases_seen += 3; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + return Ok(()); } - if let Some(value) = self.config.max_prefix_length { - builder.max_prefix_length(value); - } - builder.execute()?; if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); @@ -746,13 +780,15 @@ mod tests { use maplit::hashset; use super::*; + use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; use crate::index::IndexEmbeddingConfig; + use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; - use crate::{db_snap, Filter, Search, UserError}; + use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] fn simple_document_replacement() { @@ -926,12 +962,12 @@ mod tests { index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index.add_documents(documents!([ - { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-ExupĆ©ry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, - { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } } ])).unwrap(); db_snap!(index, word_docids, "initial"); @@ -971,18 +1007,20 @@ mod tests { // We send 6 documents and mix the ones that have _geo and those that don't have it. 
index .add_documents(documents!([ - { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 2, "price": 3.5, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 42 } }, { "id": 456 }, { "id": 1 }, { "id": 1344 }, { "id": 4 }, - { "id": 42, "_geo": { "lat": 35, "lng": 23 } } + { "id": 42, RESERVED_GEO_FIELD_NAME: { "lat": 35, "lng": 23 } } ])) .unwrap(); index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); } @@ -994,13 +1032,15 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); let error = index .add_documents(documents!([ - { "id": 0, "_geo": { "lng": 42 } } + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lng": 42 } } ])) .unwrap_err(); assert_eq!( @@ -1010,7 +1050,7 @@ mod tests { let error = index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": 42 } } + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 42 } } ])) .unwrap_err(); assert_eq!( @@ -1020,7 +1060,7 @@ mod tests { let error = index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": "lol", "lng": 42 } } ])) .unwrap_err(); assert_eq!( @@ -1030,7 +1070,7 @@ mod tests { let error = index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": [12, 13], "lng": 42 } } ])) .unwrap_err(); assert_eq!( @@ -1040,7 +1080,7 @@ mod tests { let error = index .add_documents(documents!([ - { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } + { "id": 0, RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": "hello" } } ])) .unwrap_err(); assert_eq!( @@ -1058,7 +1098,7 @@ mod tests { { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + { "objectId": 30, "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } } ])) .unwrap(); @@ -1073,7 +1113,7 @@ mod tests { index .add_documents(documents!([ - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + { "objectId": 30, "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } } ])) .unwrap(); @@ -1084,7 +1124,7 @@ mod tests { index .add_documents(documents!([ - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + { "objectId": 30, "title": "Hamlet", RESERVED_GEO_FIELD_NAME: { "lat": 12, "lng": 89 } } ])) .unwrap(); } @@ -1210,16 +1250,17 @@ mod tests { let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; settings.set_searchable_fields(searchable_fields); - let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + let faceted_fields = vec![ + FilterableAttributesRule::Field("title".to_string()), + FilterableAttributesRule::Field("nested.object".to_string()), + FilterableAttributesRule::Field("nested.machin".to_string()), + ]; settings.set_filterable_fields(faceted_fields); }) .unwrap(); let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("title"), S("nested.object"), 
S("nested.machin"))); - // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); search.query("document"); @@ -1414,7 +1455,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(String::from("dog"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "dog".to_string(), + )]); }) .unwrap(); @@ -1433,10 +1476,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let hidden = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); - for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { let mut search = crate::Search::new(&rtxn, &index); let filter = format!(r#""dog.race.bernese mountain" = {s}"#); @@ -1454,12 +1493,6 @@ mod tests { db_snap!(index, facet_id_string_docids, @""); db_snap!(index, field_id_docid_facet_strings, @""); - let rtxn = index.read_txn().unwrap(); - - let facets = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(facets, hashset!()); - // update the settings to test the sortable index .update_settings(|settings| { @@ -1482,10 +1515,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); - let mut search = crate::Search::new(&rtxn, &index); search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( "dog.race.bernese mountain", @@ -1693,8 +1722,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1714,7 +1741,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![7]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1799,8 +1826,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1820,7 +1845,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1863,8 +1888,6 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); @@ -1883,7 +1906,7 @@ mod tests { assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); }; - let faceted_fields = hashset!(S("tags")); + let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())]; let 
index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1932,11 +1955,11 @@ mod tests { let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); - indexer.add_documents(&doc1).unwrap(); - indexer.add_documents(&doc2).unwrap(); - indexer.add_documents(&doc3).unwrap(); - indexer.add_documents(&doc4).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.replace_documents(&doc1).unwrap(); + indexer.replace_documents(&doc2).unwrap(); + indexer.replace_documents(&doc3).unwrap(); + indexer.replace_documents(&doc4).unwrap(); let indexer_alloc = Bump::new(); let (_document_changes, operation_stats, _primary_key) = indexer @@ -1947,7 +1970,7 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -1955,6 +1978,174 @@ mod tests { assert_eq!(operation_stats.iter().filter(|ps| ps.error.is_some()).count(), 3); } + #[test] + fn mixing_documents_replace_with_updates() { + let index = TempIndex::new_with_map_size(4096 * 100); + + let doc1 = documents! {[{ + "id": 1, + "title": "asdsad", + "description": "Wat wat wat, wat" + }]}; + + let doc2 = documents! {[{ + "id": 1, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": 1, + "title": "another something", + }]}; + + let doc4 = documents! {[{ + "id": 1, + "description": "This is it!", + }]}; + + let rtxn = index.inner.read_txn().unwrap(); + let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut indexer = indexer::DocumentOperation::new(); + indexer.replace_documents(&doc1).unwrap(); + indexer.update_documents(&doc2).unwrap(); + indexer.update_documents(&doc3).unwrap(); + indexer.update_documents(&doc4).unwrap(); + + let indexer_alloc = Bump::new(); + let (document_changes, operation_stats, primary_key) = indexer + .into_changes( + &indexer_alloc, + &index.inner, + &rtxn, + None, + &mut new_fields_ids_map, + &|| false, + Progress::default(), + ) + .unwrap(); + + assert_eq!(operation_stats.iter().filter(|ps| ps.error.is_none()).count(), 4); + + let mut wtxn = index.write_txn().unwrap(); + indexer::index( + &mut wtxn, + &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), + index.indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + EmbeddingConfigs::default(), + &|| false, + &Progress::default(), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let obkv = index.document(&rtxn, 0).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let json_document = all_obkv_to_json(obkv, &fields_ids_map).unwrap(); + let expected = serde_json::json!({ + "id": 1, + "title": "another something", + "description": "This is it!", + }); + let expected = expected.as_object().unwrap(); + assert_eq!(&json_document, expected); + } + + #[test] + fn mixing_documents_replace_with_updates_even_more() { + let index = TempIndex::new_with_map_size(4096 * 100); + + let doc1 = documents! {[{ + "id": 1, + "title": "asdsad", + "description": "Wat wat wat, wat" + }]}; + + let doc2 = documents! {[{ + "id": 1, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": 1, + "title": "another something", + }]}; + + let doc4 = documents! 
{[{ + "id": 1, + "title": "Woooof", + }]}; + + let doc5 = documents! {[{ + "id": 1, + "description": "This is it!", + }]}; + + let rtxn = index.inner.read_txn().unwrap(); + let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut indexer = indexer::DocumentOperation::new(); + indexer.replace_documents(&doc1).unwrap(); + indexer.update_documents(&doc2).unwrap(); + indexer.update_documents(&doc3).unwrap(); + indexer.replace_documents(&doc4).unwrap(); + indexer.update_documents(&doc5).unwrap(); + + let indexer_alloc = Bump::new(); + let (document_changes, operation_stats, primary_key) = indexer + .into_changes( + &indexer_alloc, + &index.inner, + &rtxn, + None, + &mut new_fields_ids_map, + &|| false, + Progress::default(), + ) + .unwrap(); + + assert_eq!(operation_stats.iter().filter(|ps| ps.error.is_none()).count(), 5); + + let mut wtxn = index.write_txn().unwrap(); + indexer::index( + &mut wtxn, + &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), + index.indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + EmbeddingConfigs::default(), + &|| false, + &Progress::default(), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let obkv = index.document(&rtxn, 0).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let json_document = all_obkv_to_json(obkv, &fields_ids_map).unwrap(); + let expected = serde_json::json!({ + "id": 1, + "title": "Woooof", + "description": "This is it!", + }); + let expected = expected.as_object().unwrap(); + assert_eq!(&json_document, expected); + } + #[test] fn primary_key_must_not_contain_whitespace() { let index = TempIndex::new(); @@ -2067,40 +2258,15 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "title".to_string(), + )]); }) .unwrap(); index.add_documents(doc1).unwrap(); } - #[cfg(feature = "default")] - #[test] - fn store_detected_script_and_language_per_document_during_indexing() { - use charabia::{Language, Script}; - let index = TempIndex::new(); - index - .add_documents(documents!([ - { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, - { "id": 2, "title": "äŗŗäŗŗē”Ÿč€Œč‡Ŗē”±ļ¹åœØå°Šåš“å’Œę¬Šåˆ©äøŠäø€å¾‹å¹³ē­‰ć€‚ä»–å€‘č³¦ęœ‰ē†ę€§å’Œč‰Æåæƒļ¹äø¦ę‡‰ä»„å…„å¼Ÿé—œäæ‚ēš„ē²¾ē„žäŗ’ē›øå°å¾…ć€‚" }, - { "id": 3, "title": "×”Ö·×©Ö¼××•Ö¼×¢Öø×œ ×”Ö·×žÖ¼Öø×”Ö“×™×Ø (דהַחוּםד) לֹא ×™Öø×›×•Ö¹×œ ל֓קְפֹּׄ 9.94 ×žÖ¶×˜Ö°×ØÖ“×™×, × Öø×›×•Ö¹×Ÿ? ברר, 1.5°C- בַּחוּׄ!" 
}, - { "id": 4, "title": "é–¢č„æå›½éš›ē©ŗęøÆé™å®šćƒˆćƒ¼ćƒˆćƒćƒƒć‚° 恙悂悂悂悂悂悂悂悂恮恆恔" }, - { "id": 5, "title": "ąø ąø²ąø©ąø²ą¹„ąø—ąø¢ąø‡ą¹ˆąø²ąø¢ąø™ąø“ąø”ą¹€ąø”ąøµąø¢ąø§" }, - { "id": 6, "title": "The quick åœØå°Šåš“å’Œę¬Šåˆ©äøŠäø€å¾‹å¹³ē­‰ć€‚" }, - ])) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - let key_jpn = (Script::Cj, Language::Jpn); - let key_cmn = (Script::Cj, Language::Cmn); - let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap(); - let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); - let expected_cj_jpn_docids = [3].iter().collect(); - assert_eq!(cj_jpn_docs, expected_cj_jpn_docids); - let expected_cj_cmn_docids = [1, 5].iter().collect(); - assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); - } - #[test] fn add_and_delete_documents_in_single_transform() { let mut index = TempIndex::new(); @@ -2120,8 +2286,8 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.replace_documents(&documents).unwrap(); indexer.delete_documents(&["2"]); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2131,13 +2297,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2145,7 +2312,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2172,14 +2339,14 @@ mod tests { { "id": 2, "doggo": { "name": "bob", "age": 20 } }, { "id": 3, "name": "jean", "age": 25 }, ]); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.update_documents(&documents).unwrap(); let documents = documents!([ { "id": 2, "catto": "jorts" }, { "id": 3, "legs": 4 }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2"]); let indexer_alloc = Bump::new(); @@ -2192,13 +2359,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2206,7 +2374,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2233,8 +2401,8 @@ mod tests { ]); let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.update_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2244,13 +2412,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + 
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2258,7 +2427,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2283,8 +2452,8 @@ mod tests { ]); let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); - indexer.add_documents(&documents).unwrap(); + let mut indexer = indexer::DocumentOperation::new(); + indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2"]); let (document_changes, _operation_stats, primary_key) = indexer @@ -2295,13 +2464,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2309,7 +2479,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2331,14 +2501,14 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); + let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2"]); let documents = documents!([ { "id": 2, "doggo": { "name": "jean", "age": 20 } }, { "id": 3, "name": "bob", "age": 25 }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.update_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2348,13 +2518,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2362,7 +2533,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2385,7 +2556,7 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); + let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2", "1", "2"]); @@ -2394,7 +2565,7 @@ mod tests { { "id": 2, "doggo": { "name": "jean", "age": 20 } }, { "id": 3, "name": "bob", "age": 25 }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2", "1", "2"]); @@ -2406,13 +2577,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2420,7 +2592,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2442,12 +2614,12 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments); + let mut indexer = indexer::DocumentOperation::new(); 
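// A minimal sketch (not part of the diff): the tests around this hunk exercise the new
// `DocumentOperation` builder, where the indexing method moves from the old
// `new(IndexDocumentsMethod)` constructor to per-batch `replace_documents` / `update_documents`
// calls, so a single batch can mix replacements, updates, and deletions. The fixtures used here
// (`full_docs`, `partial_docs`, `index`, `rtxn`, `new_fields_ids_map`, `indexer_alloc`) are
// hypothetical stand-ins for the `documents!` batches and setup of the surrounding tests.
let mut indexer = indexer::DocumentOperation::new();
indexer.replace_documents(&full_docs).unwrap();   // full replacement of the matching documents
indexer.update_documents(&partial_docs).unwrap(); // partial update merged over the previous state
indexer.delete_documents(&["42"]);                // deletions can be interleaved in the same batch
let (_document_changes, _operation_stats, _primary_key) = indexer
    .into_changes(
        &indexer_alloc,
        &index.inner,
        &rtxn,
        None,
        &mut new_fields_ids_map,
        &|| false,
        Progress::default(),
    )
    .unwrap();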
let documents = documents!([ { "id": 1, "doggo": "kevin" }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.update_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2457,13 +2629,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2471,7 +2644,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2490,7 +2663,7 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2498,7 +2671,7 @@ mod tests { { "id": 1, "catto": "jorts" }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2508,13 +2681,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2522,7 +2696,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2595,6 +2769,7 @@ mod tests { source: Setting::Set(crate::vector::settings::EmbedderSource::UserProvided), model: Setting::NotSet, revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: Setting::NotSet, dimensions: Setting::Set(3), document_template: Setting::NotSet, @@ -2604,6 +2779,8 @@ mod tests { response: Setting::NotSet, distribution: Setting::NotSet, headers: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, binary_quantized: Setting::NotSet, }), ); @@ -2629,8 +2806,9 @@ mod tests { embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); - let embedder = - std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); + let embedder = std::sync::Arc::new( + crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(), + ); let res = index .search(&rtxn) .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec())) @@ -2683,14 +2861,14 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); // OP let documents = documents!([ { "id": 1, "doggo": "bernese" }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); // FINISHING let (document_changes, _operation_stats, primary_key) = indexer @@ -2701,13 +2879,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2715,7 
+2894,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2742,14 +2921,14 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); let documents = documents!([ { "id": 0, "catto": "jorts" }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2759,13 +2938,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2773,7 +2953,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2799,12 +2979,12 @@ mod tests { let indexer_alloc = Bump::new(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ { "id": 1, "catto": "jorts" }, ]); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( @@ -2814,13 +2994,14 @@ mod tests { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2828,7 +3009,7 @@ mod tests { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); wtxn.commit().unwrap(); @@ -2938,7 +3119,10 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("label".to_string()), + FilterableAttributesRule::Field("label2".to_string()), + ]); }) .unwrap(); wtxn.commit().unwrap(); @@ -3117,34 +3301,36 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S("_geo"))); - settings.set_sortable_fields(hashset!(S("_geo"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); + settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); }) .unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, - { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, - { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, - { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, - { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, - { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, - { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, - { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, - { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, - { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, - { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, - { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, - { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, - { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, - { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, - { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, - { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, - { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, - { "id": "19", "city": "CompiĆØgne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, - { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } + { "id": "1", "city": "Lille", RESERVED_GEO_FIELD_NAME: { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", RESERVED_GEO_FIELD_NAME: { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", RESERVED_GEO_FIELD_NAME: { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", RESERVED_GEO_FIELD_NAME: { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", RESERVED_GEO_FIELD_NAME: { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", RESERVED_GEO_FIELD_NAME: { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", RESERVED_GEO_FIELD_NAME: { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", RESERVED_GEO_FIELD_NAME: { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", RESERVED_GEO_FIELD_NAME: { "lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", RESERVED_GEO_FIELD_NAME: { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", RESERVED_GEO_FIELD_NAME: { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", RESERVED_GEO_FIELD_NAME: { "lat": 50.4095, "lng": 
4.4347 } }, + { "id": "13", "city": "Mons", RESERVED_GEO_FIELD_NAME: { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", RESERVED_GEO_FIELD_NAME: { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", RESERVED_GEO_FIELD_NAME: { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", RESERVED_GEO_FIELD_NAME: { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", RESERVED_GEO_FIELD_NAME: { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", RESERVED_GEO_FIELD_NAME: { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "CompiĆØgne", RESERVED_GEO_FIELD_NAME: { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", RESERVED_GEO_FIELD_NAME: { "lat": 48.9021, "lng": 2.3708 } } ])).unwrap(); wtxn.commit().unwrap(); @@ -3305,6 +3491,44 @@ mod tests { rtxn.commit().unwrap(); } + #[test] + fn incremental_update_without_changing_facet_distribution() { + let index = TempIndex::new(); + index + .add_documents(documents!([ + {"id": 0, "some_field": "aaa", "other_field": "aaa" }, + {"id": 1, "some_field": "bbb", "other_field": "bbb" }, + ])) + .unwrap(); + { + let rtxn = index.read_txn().unwrap(); + // count field distribution + let results = index.field_distribution(&rtxn).unwrap(); + assert_eq!(Some(&2), results.get("id")); + assert_eq!(Some(&2), results.get("some_field")); + assert_eq!(Some(&2), results.get("other_field")); + } + + let mut index = index; + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .add_documents(documents!([ + {"id": 0, "other_field": "bbb" }, + {"id": 1, "some_field": "ccc" }, + ])) + .unwrap(); + + { + let rtxn = index.read_txn().unwrap(); + // count field distribution + let results = index.field_distribution(&rtxn).unwrap(); + assert_eq!(Some(&2), results.get("id")); + assert_eq!(Some(&2), results.get("some_field")); + assert_eq!(Some(&2), results.get("other_field")); + } + } + #[test] fn delete_words_exact_attributes() { let index = TempIndex::new(); diff --git a/crates/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/crates/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap index c45c350e7..7ab60b90d 100644 --- a/crates/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap +++ b/crates/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap @@ -1,5 +1,5 @@ --- -source: milli/src/update/index_documents/mod.rs +source: crates/milli/src/update/index_documents/mod.rs --- 3 0 48.9021 1 [19, ] 3 0 49.9314 1 [17, ] @@ -15,6 +15,11 @@ source: milli/src/update/index_documents/mod.rs 3 0 50.7453 1 [7, ] 3 0 50.8466 1 [10, ] 3 0 51.0537 1 [9, ] +3 1 48.9021 2 [17, 19, ] +3 1 50.1793 3 [13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 2 [1, 2, ] +3 1 50.7453 3 [7, 9, 10, ] 4 0 2.271 1 [17, ] 4 0 2.3708 1 [19, ] 4 0 2.7637 1 [14, ] @@ -28,4 +33,3 @@ source: milli/src/update/index_documents/mod.rs 4 0 3.6957 1 [9, ] 4 0 3.9623 1 [12, ] 4 0 4.337 1 [10, ] - diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index 38bf90435..e17625ad4 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ 
b/crates/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::fs::File; use std::io::{Read, Seek}; @@ -18,8 +18,10 @@ use super::helpers::{ ObkvsMergeAdditionsAndDeletions, }; use super::{create_writer, IndexDocumentsMethod, IndexerConfig, KeepFirst}; +use crate::attribute_patterns::PatternMatch; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::{db_name, main_key}; use crate::update::del_add::{ into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, @@ -31,9 +33,7 @@ use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::WriteBackToDocuments; use crate::vector::ArroyWrapper; -use crate::{ - is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, -}; +use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; pub struct TransformOutput { pub primary_key: String, @@ -52,7 +52,7 @@ pub struct TransformOutput { /// containing all those documents. pub struct Transform<'a, 'i> { pub index: &'i Index, - fields_ids_map: FieldsIdsMap, + fields_ids_map: FieldIdMapWithMetadata, indexer_settings: &'a IndexerConfig, pub index_documents_method: IndexDocumentsMethod, @@ -84,7 +84,7 @@ pub enum Operation { /// /// If new fields are present in the addition, they are added to the index field ids map. 
fn create_fields_mapping( - index_field_map: &mut FieldsIdsMap, + index_field_map: &mut FieldIdMapWithMetadata, batch_field_map: &DocumentsBatchIndex, ) -> Result> { batch_field_map @@ -141,10 +141,13 @@ impl<'a, 'i> Transform<'a, 'i> { true, ); let documents_ids = index.documents_ids(wtxn)?; + let fields_ids_map = index.fields_ids_map(wtxn)?; + let builder = MetadataBuilder::from_index(index, wtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Transform { index, - fields_ids_map: index.fields_ids_map(wtxn)?, + fields_ids_map, indexer_settings, available_documents_ids: AvailableIds::new(&documents_ids), original_sorter, @@ -194,7 +197,7 @@ impl<'a, 'i> Transform<'a, 'i> { // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); - if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { + if self.indexer_settings.log_every_n.is_some_and(|len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); @@ -354,7 +357,7 @@ impl<'a, 'i> Transform<'a, 'i> { documents_seen: documents_count, }); - self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; + self.index.put_fields_ids_map(wtxn, self.fields_ids_map.as_fields_ids_map())?; self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we @@ -371,7 +374,7 @@ impl<'a, 'i> Transform<'a, 'i> { )] fn flatten_from_fields_ids_map( obkv: &KvReader, - fields_ids_map: &mut FieldsIdsMap, + fields_ids_map: &mut FieldIdMapWithMetadata, ) -> Result>> { if obkv .iter() @@ -657,7 +660,6 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, - modified_faceted_fields: &HashSet, mut injected_vectors: serde_json::Map, old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, @@ -667,13 +669,25 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. - let must_reindex_facets = settings_diff.reindex_facets(); - let necessary_faceted_field = |id: FieldId| -> bool { - let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - must_reindex_facets - && modified_faceted_fields - .iter() - .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) + let facet_fids_changed = settings_diff.facet_fids_changed(); + + let necessary_faceted_field = |id: FieldId| -> Option { + if facet_fids_changed { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + // if the faceted fields changed, we need to keep all the field that are + // faceted in the old or new settings. 
+ match ( + settings_diff.old.match_faceted_field(field_name), + settings_diff.new.match_faceted_field(field_name), + ) { + (PatternMatch::NoMatch, PatternMatch::NoMatch) => None, + (PatternMatch::NoMatch, _) => Some(DelAddOperation::Addition), + (_, PatternMatch::NoMatch) => Some(DelAddOperation::Deletion), + (_, _) => Some(DelAddOperation::DeletionAndAddition), + } + } else { + None + } }; // Alway provide all fields when vectors are involved because @@ -725,12 +739,24 @@ impl<'a, 'i> Transform<'a, 'i> { } } - if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + if is_primary_key(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; - } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { - operations.insert(id, operation); - obkv_writer.insert(id, val)?; + } else { + let facet_operation = necessary_faceted_field(id); + let searchable_operation = settings_diff.reindex_searchable_id(id); + let operation = match (facet_operation, searchable_operation) { + (Some(facet_operation), Some(searchable_operation)) => { + Some(facet_operation.merge(searchable_operation)) + } + (Some(operation), None) | (None, Some(operation)) => Some(operation), + (None, None) => None, + }; + + if let Some(operation) = operation { + operations.insert(id, operation); + obkv_writer.insert(id, val)?; + } } } if !injected_vectors.is_empty() { @@ -827,10 +853,8 @@ impl<'a, 'i> Transform<'a, 'i> { }) .collect(); - let old_vectors_fid = settings_diff - .old - .fields_ids_map - .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let old_vectors_fid = + settings_diff.old.fields_ids_map.id(crate::constants::RESERVED_VECTORS_FIELD_NAME); // We initialize the sorter with the user indexing settings. 
let mut flattened_sorter = @@ -849,7 +873,6 @@ impl<'a, 'i> Transform<'a, 'i> { }; if original_sorter.is_some() || flattened_sorter.is_some() { - let modified_faceted_fields = settings_diff.modified_faceted_fields(); let mut original_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -890,7 +913,6 @@ impl<'a, 'i> Transform<'a, 'i> { Self::rebind_existing_document( old_obkv, &settings_diff, - &modified_faceted_fields, injected_vectors, old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index a97569800..87ea31942 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -55,7 +55,7 @@ impl ChunkAccumulator { match self .inner .iter() - .position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right))) + .position(|right| right.first().is_some_and(|right| chunk.mergeable_with(right))) { Some(position) => { let v = self.inner.get_mut(position).unwrap(); @@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, settings_diff: &InnerIndexSettingsDiff, typed_chunks: Vec, + modified_docids: &mut RoaringBitmap, ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunks[0] { @@ -137,8 +138,7 @@ pub(crate) fn write_typed_chunk_into_index( let _entered = span.enter(); let fields_ids_map = index.fields_ids_map(wtxn)?; - let vectors_fid = - fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + let vectors_fid = fields_ids_map.id(crate::constants::RESERVED_VECTORS_FIELD_NAME); let mut builder = MergerBuilder::new(KeepLatestObkv); for typed_chunk in typed_chunks { @@ -215,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Create, }); docids.insert(docid); + modified_docids.insert(docid); } else { db.delete(wtxn, &docid)?; operations.push(DocumentOperation { @@ -223,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( kind: DocumentOperationKind::Delete, }); docids.remove(docid); + modified_docids.insert(docid); } } let external_documents_docids = index.external_documents_ids(); @@ -363,7 +365,7 @@ pub(crate) fn write_typed_chunk_into_index( let merger = builder.build(); let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(_) => { @@ -399,7 +401,7 @@ pub(crate) fn write_typed_chunk_into_index( Some(normalized_facet_id_string_merger), data_size, ); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(_) => { @@ -662,11 +664,8 @@ pub(crate) fn write_typed_chunk_into_index( let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; - let binary_quantized = settings_diff - .old - .embedding_configs - .get(&embedder_name) - .map_or(false, |conf| conf.2); + let binary_quantized = + settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); // FIXME: allow customizing distance let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); diff --git 
a/crates/milli/src/update/mod.rs b/crates/milli/src/update/mod.rs index 5888a20db..9a783ffd2 100644 --- a/crates/milli/src/update/mod.rs +++ b/crates/milli/src/update/mod.rs @@ -5,6 +5,7 @@ pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::*; pub use self::indexer_config::IndexerConfig; +pub use self::new::ChannelCongestion; pub use self::settings::{validate_embedding_settings, Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; @@ -21,6 +22,7 @@ mod indexer_config; pub mod new; pub(crate) mod settings; mod update_step; +pub mod upgrade; mod word_prefix_docids; mod words_prefix_integer_docids; mod words_prefixes_fst; diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 3afcd3e4b..4fff31a35 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,104 +1,386 @@ +use std::cell::RefCell; +use std::io::{self, BufWriter}; +use std::iter::Cycle; use std::marker::PhantomData; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::mem; +use std::num::NonZeroU16; +use std::ops::Range; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::Arc; +use std::time::Duration; -use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; +use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bbqueue::BBBuffer; +use bytemuck::{checked, CheckedBitPattern, NoUninit}; +use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; -use heed::BytesDecode; -use memmap2::Mmap; +use heed::{BytesDecode, MdbError}; +use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::ref_cell_ext::RefCellExt; +use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; -use crate::index::IndexEmbeddingConfig; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError}; -/// The capacity of the channel is currently in number of messages. -pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { - let (sender, receiver) = crossbeam_channel::bounded(cap); - ( - ExtractorSender { - sender, - send_count: Default::default(), - writer_contentious_count: Default::default(), - extractor_contentious_count: Default::default(), - }, - WriterReceiver(receiver), - ) +/// Note that the FrameProducer requires up to 9 bytes to +/// encode the length, the max grant has been computed accordingly. +/// +/// +const MAX_FRAME_HEADER_SIZE: usize = 9; + +/// Creates a tuple of senders/receiver to be used by +/// the extractors and the writer loop. +/// +/// The `total_bbbuffer_capacity` represents the number of bytes +/// allocated to all BBQueue buffers. It will be split by the +/// number of threads. +/// +/// The `channel_capacity` parameter defines the number of +/// too-large-to-fit-in-BBQueue entries that can be sent through +/// a flume channel. This parameter must stay low to make +/// sure we do not use too much memory. 
+/// +/// Note that the channel is also used to wake up the receiver +/// when new stuff is available in any BBQueue buffer but we send +/// a message in this queue only if it is empty to avoid filling +/// the channel *and* the BBQueue. +pub fn extractor_writer_bbqueue( + bbbuffers: &mut Vec<BBBuffer>, + total_bbbuffer_capacity: usize, + channel_capacity: usize, +) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { + let current_num_threads = rayon::current_num_threads(); + let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap(); + bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity)); + + let capacity = bbbuffers.first().unwrap().capacity(); + // 1. Due to fragmentation in the bbbuffer, we can only accept up to half the capacity in a single message. + // 2. Read the documentation for `MAX_FRAME_HEADER_SIZE` for more information about why it is here. + let max_grant = capacity.saturating_div(2).checked_sub(MAX_FRAME_HEADER_SIZE).unwrap(); + + let producers = ThreadLocal::with_capacity(bbbuffers.len()); + let consumers = rayon::broadcast(|bi| { + let bbqueue = &bbbuffers[bi.index()]; + let (producer, consumer) = bbqueue.try_split_framed().unwrap(); + producers.get_or(|| FullySend(RefCell::new(producer))); + consumer + }); + + let sent_messages_attempts = Arc::new(AtomicUsize::new(0)); + let blocking_sent_messages_attempts = Arc::new(AtomicUsize::new(0)); + + let (sender, receiver) = flume::bounded(channel_capacity); + let sender = ExtractorBbqueueSender { + sender, + producers, + max_grant, + sent_messages_attempts: sent_messages_attempts.clone(), + blocking_sent_messages_attempts: blocking_sent_messages_attempts.clone(), + }; + let receiver = WriterBbqueueReceiver { + receiver, + look_at_consumer: (0..consumers.len()).cycle(), + consumers, + sent_messages_attempts, + blocking_sent_messages_attempts, + }; + (sender, receiver) } -pub enum KeyValueEntry { - Small { key_length: usize, data: Box<[u8]> }, - Large { key_entry: KeyEntry, data: Mmap }, +pub struct ExtractorBbqueueSender<'a> { + /// This channel is used to wake up the receiver and + /// send large entries that cannot fit in the BBQueue. + sender: flume::Sender<ReceiverAction>, + /// A memory buffer, one per thread, used to serialize + /// the entries directly into this shared, lock-free space. + producers: ThreadLocal<FullySend<RefCell<FrameProducer<'a>>>>, + /// The maximum frame grant that a producer can reserve. + /// It will never be able to store more than that as the + /// buffer cannot split data into two parts. + max_grant: usize, + /// The total number of attempts to send messages + /// over the bbqueue channel. + sent_messages_attempts: Arc<AtomicUsize>, + /// The number of times an attempt to send a + /// message failed and we had to pause for a bit. 
+ blocking_sent_messages_attempts: Arc<AtomicUsize>, } -impl KeyValueEntry { - pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self { - let mut data = Vec::with_capacity(key.len() + value.len()); - data.extend_from_slice(key); - data.extend_from_slice(value); - KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() } - } - - fn from_large_key_value(key: &[u8], value: Mmap) -> Self { - KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value } - } - - pub fn key(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[..*key_length], - KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(), - } - } - - pub fn value(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[*key_length..], - KeyValueEntry::Large { key_entry: _, data } => &data[..], - } - } -} - -pub struct KeyEntry { - data: Box<[u8]>, -} - -impl KeyEntry { - pub fn from_key(key: &[u8]) -> Self { - KeyEntry { data: key.to_vec().into_boxed_slice() } - } - - pub fn entry(&self) -> &[u8] { - self.data.as_ref() - } -} - -pub enum EntryOperation { - Delete(KeyEntry), - Write(KeyValueEntry), -} - -pub enum WriterOperation { - DbOperation(DbOperation), - ArroyOperation(ArroyOperation), -} - -pub enum ArroyOperation { - DeleteVectors { docid: DocumentId }, - SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec<Embedding> }, - SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding }, - Finish { configs: Vec<IndexEmbeddingConfig> }, -} - -pub struct DbOperation { - database: Database, - entry: EntryOperation, +pub struct WriterBbqueueReceiver<'a> { + /// Used to wake up when new entries are available either in + /// any BBQueue buffer or directly sent through this channel + /// (still written to disk). + receiver: flume::Receiver<ReceiverAction>, + /// Indicates the consumer to observe. This cycling range + /// ensures fair distribution of work among consumers. + look_at_consumer: Cycle<Range<usize>>, + /// The BBQueue frames to read when waking up. + consumers: Vec<FrameConsumer<'a>>, + /// The total number of attempts to send messages + /// over the bbqueue channel. + sent_messages_attempts: Arc<AtomicUsize>, + /// The number of times an attempt to send a + /// message failed and we had to pause for a bit. + blocking_sent_messages_attempts: Arc<AtomicUsize>, } +/// The action to perform on the receiver/writer side. #[derive(Debug)] +pub enum ReceiverAction { + /// Wake up, you have frames to read for the BBQueue buffers. + WakeUp, + LargeEntry(LargeEntry), + LargeVectors(LargeVectors), +} + +/// An entry that cannot fit in the BBQueue buffers has been +/// written to disk, memory-mapped and must be written in the +/// database. +#[derive(Debug)] +pub struct LargeEntry { + /// The database where the entry must be written. + pub database: Database, + /// The key of the entry that must be written in the database. + pub key: Box<[u8]>, + /// The large value that must be written. + /// + /// Note: We can probably use a `File` here and + /// use `Database::put_reserved` instead of memory-mapping. + pub value: Mmap, +} + +/// When embeddings are larger than the available +/// BBQueue space they arrive here. +#[derive(Debug)] +pub struct LargeVectors { + /// The document id associated with the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The large embedding that must be written. 
+ pub embeddings: Mmap, +} + +impl LargeVectors { + pub fn read_embeddings(&self, dimensions: usize) -> impl Iterator { + self.embeddings.chunks_exact(dimensions).map(bytemuck::cast_slice) + } +} + +impl<'a> WriterBbqueueReceiver<'a> { + /// Tries to receive an action to do until the timeout occurs + /// and if it does, consider it as a spurious wake up. + pub fn recv_action(&mut self) -> Option { + match self.receiver.recv_timeout(Duration::from_millis(100)) { + Ok(action) => Some(action), + Err(RecvTimeoutError::Timeout) => Some(ReceiverAction::WakeUp), + Err(RecvTimeoutError::Disconnected) => None, + } + } + + /// Reads all the BBQueue buffers and selects the first available frame. + pub fn recv_frame(&mut self) -> Option> { + for index in self.look_at_consumer.by_ref().take(self.consumers.len()) { + if let Some(frame) = self.consumers[index].read() { + return Some(FrameWithHeader::from(frame)); + } + } + None + } + + /// Returns the total count of attempts to send messages through the BBQueue channel. + pub fn sent_messages_attempts(&self) -> usize { + self.sent_messages_attempts.load(atomic::Ordering::Relaxed) + } + + /// Returns the count of attempts to send messages that had to be paused due to BBQueue being full. + pub fn blocking_sent_messages_attempts(&self) -> usize { + self.blocking_sent_messages_attempts.load(atomic::Ordering::Relaxed) + } +} + +pub struct FrameWithHeader<'a> { + header: EntryHeader, + frame: FrameGrantR<'a>, +} + +impl FrameWithHeader<'_> { + pub fn header(&self) -> EntryHeader { + self.header + } + + pub fn frame(&self) -> &FrameGrantR<'_> { + &self.frame + } +} + +impl<'a> From> for FrameWithHeader<'a> { + fn from(mut frame: FrameGrantR<'a>) -> Self { + frame.auto_release(true); + FrameWithHeader { header: EntryHeader::from_slice(&frame[..]), frame } + } +} + +/// A header that is written at the beginning of a bbqueue frame. +/// +/// Note that the different variants cannot be changed without taking +/// care of their size in the implementation, like, everywhere. +#[derive(Debug, Clone, Copy)] +#[repr(u8)] +pub enum EntryHeader { + DbOperation(DbOperation), + ArroyDeleteVector(ArroyDeleteVector), + ArroySetVectors(ArroySetVectors), +} + +impl EntryHeader { + const fn variant_size() -> usize { + mem::size_of::() + } + + const fn variant_id(&self) -> u8 { + match self { + EntryHeader::DbOperation(_) => 0, + EntryHeader::ArroyDeleteVector(_) => 1, + EntryHeader::ArroySetVectors(_) => 2, + } + } + + const fn total_key_value_size(key_length: NonZeroU16, value_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + key_length.get() as usize + + value_length + } + + const fn total_key_size(key_length: NonZeroU16) -> usize { + Self::total_key_value_size(key_length, 0) + } + + const fn total_delete_vector_size() -> usize { + Self::variant_size() + mem::size_of::() + } + + /// The `dimensions` corresponds to the number of `f32` in the embedding. 
+ fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size * count + } + + fn header_size(&self) -> usize { + let payload_size = match self { + EntryHeader::DbOperation(op) => mem::size_of_val(op), + EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), + EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), + }; + Self::variant_size() + payload_size + } + + fn from_slice(slice: &[u8]) -> EntryHeader { + let (variant_id, remaining) = slice.split_first().unwrap(); + match variant_id { + 0 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::DbOperation(header) + } + 1 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroyDeleteVector(header) + } + 2 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVectors(header) + } + id => panic!("invalid variant id: {id}"), + } + } + + fn serialize_into(&self, header_bytes: &mut [u8]) { + let (first, remaining) = header_bytes.split_first_mut().unwrap(); + let payload_bytes = match self { + EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), + EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), + EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), + }; + *first = self.variant_id(); + remaining.copy_from_slice(payload_bytes); + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// Wether a put of the key/value pair or a delete of the given key. +pub struct DbOperation { + /// The database on which to perform the operation. + pub database: Database, + /// The key length in the buffer. + /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. + pub key_length: Option, +} + +impl DbOperation { + pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { + let skip = EntryHeader::variant_size() + mem::size_of::(); + match self.key_length { + Some(key_length) => { + let (key, value) = frame[skip..].split_at(key_length.get() as usize); + (key, Some(value)) + } + None => (&frame[skip..], None), + } + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(transparent)] +pub struct ArroyDeleteVector { + pub docid: DocumentId, +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVectors { + pub docid: DocumentId, + pub embedder_id: u8, + _padding: [u8; 3], +} + +impl ArroySetVectors { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read all the embeddings and write them into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
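Every frame written into a BBQueue follows the same layout: a one-byte variant id, the plain-old-data header for that variant, then the raw payload (key and value for a put, key only for a deletion, packed `f32`s for embeddings). A hypothetical helper computing the payload offsets of a key/value frame, mirroring `total_key_value_size`; the 4-byte header size is an assumption for the example (a `u16` database tag plus an `Option<NonZeroU16>` key length), and the names are illustrative, not part of the actual API:

```rust
use std::mem;
use std::num::NonZeroU16;

/// Illustrative layout of a `DbOperation` put frame:
/// [ variant id: 1 byte | DbOperation header | key bytes | value bytes ]
const VARIANT_ID_SIZE: usize = mem::size_of::<u8>();
const HEADER_PAYLOAD_SIZE: usize = 4; // stands in for mem::size_of::<DbOperation>()

fn key_value_offsets(key_length: NonZeroU16, value_length: usize) -> (usize, usize, usize) {
    let key_start = VARIANT_ID_SIZE + HEADER_PAYLOAD_SIZE;
    let value_start = key_start + key_length.get() as usize;
    let total = value_start + value_length;
    (key_start, value_start, total)
}

fn main() {
    let (key_start, value_start, total) = key_value_offsets(NonZeroU16::new(12).unwrap(), 256);
    assert_eq!((key_start, value_start, total), (5, 17, 273));
}
```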
+ } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u16)] pub enum Database { Main, Documents, @@ -112,7 +394,7 @@ pub enum Database { FacetIdIsNullDocids, FacetIdIsEmptyDocids, FacetIdExistsDocids, - FacetIdF64NumberDocids, + FacetIdF64Docids, FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, @@ -133,18 +415,39 @@ impl Database { Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(), Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(), Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(), - Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(), + Database::FacetIdF64Docids => index.facet_id_f64_docids.remap_types(), Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), } } + + pub fn database_name(&self) -> &'static str { + match self { + Database::Main => db_name::MAIN, + Database::Documents => db_name::DOCUMENTS, + Database::ExternalDocumentsIds => db_name::EXTERNAL_DOCUMENTS_IDS, + Database::ExactWordDocids => db_name::EXACT_WORD_DOCIDS, + Database::WordDocids => db_name::WORD_DOCIDS, + Database::WordFidDocids => db_name::WORD_FIELD_ID_DOCIDS, + Database::WordPositionDocids => db_name::WORD_POSITION_DOCIDS, + Database::FidWordCountDocids => db_name::FIELD_ID_WORD_COUNT_DOCIDS, + Database::WordPairProximityDocids => db_name::WORD_PAIR_PROXIMITY_DOCIDS, + Database::FacetIdIsNullDocids => db_name::FACET_ID_IS_NULL_DOCIDS, + Database::FacetIdIsEmptyDocids => db_name::FACET_ID_IS_EMPTY_DOCIDS, + Database::FacetIdExistsDocids => db_name::FACET_ID_EXISTS_DOCIDS, + Database::FacetIdF64Docids => db_name::FACET_ID_F64_DOCIDS, + Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, + Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, + Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + } + } } impl From for Database { fn from(value: FacetKind) -> Self { match value { - FacetKind::Number => Database::FacetIdF64NumberDocids, + FacetKind::Number => Database::FacetIdF64Docids, FacetKind::String => Database::FacetIdStringDocids, FacetKind::Null => Database::FacetIdIsNullDocids, FacetKind::Empty => Database::FacetIdIsEmptyDocids, @@ -153,100 +456,294 @@ impl From for Database { } } -impl DbOperation { - pub fn database(&self, index: &Index) -> heed::Database { - self.database.database(index) - } - - pub fn entry(self) -> EntryOperation { - self.entry - } -} - -pub struct WriterReceiver(Receiver); - -impl IntoIterator for WriterReceiver { - type Item = WriterOperation; - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -pub struct ExtractorSender { - sender: Sender, - /// The number of message we sent in total in the channel. - send_count: AtomicUsize, - /// The number of times we sent something in a channel that was full. - writer_contentious_count: AtomicUsize, - /// The number of times we sent something in a channel that was empty. 
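With the data model in place, the writing thread's job is to wait on `recv_action` (timeouts are turned into spurious `WakeUp`s) and, on every action, drain whatever frames the per-thread BBQueues currently hold. A sketch of what that loop might look like; the `handle_*` functions are hypothetical placeholders for the database-writing code and are not part of this diff:

```rust
// Sketch only: the overall drain loop on the writing side.
// `handle_db_operation`, `handle_delete_vector`, `handle_set_vectors`,
// `handle_large_entry` and `handle_large_vectors` are assumed helpers.
fn writer_loop(mut receiver: WriterBbqueueReceiver<'_>) -> crate::Result<()> {
    while let Some(action) = receiver.recv_action() {
        // Whatever the action was, the BBQueues may contain frames: drain them first.
        while let Some(frame) = receiver.recv_frame() {
            match frame.header() {
                EntryHeader::DbOperation(op) => handle_db_operation(op, frame.frame())?,
                EntryHeader::ArroyDeleteVector(adv) => handle_delete_vector(adv)?,
                EntryHeader::ArroySetVectors(asvs) => handle_set_vectors(asvs, frame.frame())?,
            }
        }
        match action {
            ReceiverAction::WakeUp => (),
            ReceiverAction::LargeEntry(entry) => handle_large_entry(entry)?,
            ReceiverAction::LargeVectors(vectors) => handle_large_vectors(vectors)?,
        }
    }
    Ok(()) // `None` means every sender was dropped: indexing is over.
}
```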
- extractor_contentious_count: AtomicUsize, -} - -impl Drop for ExtractorSender { - fn drop(&mut self) { - let send_count = *self.send_count.get_mut(); - let writer_contentious_count = *self.writer_contentious_count.get_mut(); - let extractor_contentious_count = *self.extractor_contentious_count.get_mut(); - tracing::debug!( - "Extractor channel stats: {send_count} sends, \ - {writer_contentious_count} writer contentions ({}%), \ - {extractor_contentious_count} extractor contentions ({}%)", - (writer_contentious_count as f32 / send_count as f32) * 100.0, - (extractor_contentious_count as f32 / send_count as f32) * 100.0 - ) - } -} - -impl ExtractorSender { - pub fn docids(&self) -> WordDocidsSender<'_, D> { +impl<'b> ExtractorBbqueueSender<'b> { + pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } } - pub fn facet_docids(&self) -> FacetDocidsSender<'_> { + pub fn facet_docids<'a>(&'a self) -> FacetDocidsSender<'a, 'b> { FacetDocidsSender { sender: self } } - pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> { + pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { FieldIdDocidFacetSender(self) } - pub fn documents(&self) -> DocumentsSender<'_> { + pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { DocumentsSender(self) } - pub fn embeddings(&self) -> EmbeddingSender<'_> { - EmbeddingSender(&self.sender) + pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { + EmbeddingSender(self) } - pub fn geo(&self) -> GeoSender<'_> { - GeoSender(&self.sender) + pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { + GeoSender(self) } - fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { - match self - .sender - .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); + let total_length = EntryHeader::total_delete_vector_size(); + if total_length > max_grant { + panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)"); } + + // Spin loop to have a frame the size we requested. 
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + payload_header.serialize_into(grant); + Ok(()) + }, + )?; + + Ok(()) } - fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> { - if self.sender.is_full() { - self.writer_contentious_count.fetch_add(1, Ordering::SeqCst); - } - if self.sender.is_empty() { - self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst); + fn set_vectors( + &self, + docid: u32, + embedder_id: u8, + embeddings: &[Vec], + ) -> crate::Result<()> { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // If there are no vectors we specify the dimensions + // to zero to allocate no extra space at all + let dimensions = embeddings.first().map_or(0, |emb| emb.len()); + + let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; + let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); + let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); + if total_length > max_grant { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + for embedding in embeddings { + let mut embedding_bytes = bytemuck::cast_slice(embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + } + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + let embeddings = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVectors { docid, embedder_id, embeddings }; + self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); + + return Ok(()); } - self.send_count.fetch_add(1, Ordering::SeqCst); - match self.sender.send(WriterOperation::DbOperation(op)) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + // Spin loop to have a frame the size we requested. 
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + if dimensions != 0 { + let output_iter = + remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + } + + Ok(()) + }, + )?; + + Ok(()) + } + + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length: value.len(), + error: MdbError::BadValSize.into(), + } + })?; + self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + value_buffer.copy_from_slice(value); + Ok(()) + }) + } + + fn write_key_value_with( + &self, + database: Database, + key_length: NonZeroU16, + value_length: usize, + key_value_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8], &mut [u8]) -> crate::Result<()>, + { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let operation = DbOperation { database, key_length: Some(key_length) }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_value_size(key_length, value_length); + if total_length > max_grant { + let mut key_buffer = vec![0; key_length.get() as usize].into_boxed_slice(); + let value_file = tempfile::tempfile()?; + value_file.set_len(value_length.try_into().unwrap())?; + let mut mmap_mut = unsafe { MmapMut::map_mut(&value_file)? }; + + key_value_writer(&mut key_buffer, &mut mmap_mut)?; + + self.sender + .send(ReceiverAction::LargeEntry(LargeEntry { + database, + key: key_buffer, + value: mmap_mut.make_read_only()?, + })) + .unwrap(); + + return Ok(()); } + + // Spin loop to have a frame the size we requested. 
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer) + }, + )?; + + Ok(()) + } + + fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StoreDeletion { + database_name: database.database_name(), + key: key.into(), + error: MdbError::BadValSize.into(), + } + })?; + self.delete_entry_with(database, key_length, |buffer| { + buffer.copy_from_slice(key); + Ok(()) + }) + } + + fn delete_entry_with( + &self, + database: Database, + key_length: NonZeroU16, + key_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // For deletion we do not specify the key length, + // it's in the remaining bytes. + let operation = DbOperation { database, key_length: None }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_size(key_length); + if total_length > max_grant { + panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)"); + } + + // Spin loop to have a frame the size we requested. + reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining) + }, + )?; + + Ok(()) + } +} + +/// Try to reserve a frame grant of `total_length` by spin +/// looping on the BBQueue buffer, panics if the receiver +/// has been disconnected or send a WakeUp message if necessary. +fn reserve_and_write_grant( + producer: &mut FrameProducer, + total_length: usize, + sender: &flume::Sender, + sent_messages_attempts: &AtomicUsize, + blocking_sent_messages_attempts: &AtomicUsize, + f: F, +) -> crate::Result<()> +where + F: FnOnce(&mut [u8]) -> crate::Result<()>, +{ + loop { + // An attempt means trying multiple times + // whether is succeeded or not. + sent_messages_attempts.fetch_add(1, atomic::Ordering::Relaxed); + + for _ in 0..10_000 { + match producer.grant(total_length) { + Ok(mut grant) => { + // We could commit only the used memory. + f(&mut grant)?; + grant.commit(total_length); + + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if sender.is_empty() { + sender.send(ReceiverAction::WakeUp).unwrap(); + } + + return Ok(()); + } + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + } + if sender.is_disconnected() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + // We made an attempt to send a message in the + // bbqueue channel but it didn't succeed. 
+ blocking_sent_messages_attempts.fetch_add(1, atomic::Ordering::Relaxed); + + // We prefer to yield and allow the writing thread + // to do its job, especially beneficial when there + // is only one CPU core available. + std::thread::yield_now(); } } @@ -285,166 +782,160 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } -pub trait DocidsSender { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; -} - -pub struct WordDocidsSender<'a, D> { - sender: &'a ExtractorSender, +#[derive(Clone, Copy)] +pub struct WordDocidsSender<'a, 'b, D> { + sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, } -impl DocidsSender for WordDocidsSender<'_, D> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } - - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -pub struct FacetDocidsSender<'a> { - sender: &'a ExtractorSender, -} - -impl DocidsSender for FacetDocidsSender<'_> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = match facet_kind { - // skip level group size - FacetKind::String | FacetKind::Number => { - // add facet group size - let value = [&[1], value].concat(); - EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value)) +impl WordDocidsSender<'_, '_, D> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: D::DATABASE.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), } - _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), - }; - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + })?; + self.sender.write_key_value_with( + D::DATABASE, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }, + ) } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + self.sender.delete_entry(D::DATABASE, key) + } +} + +#[derive(Clone, Copy)] +pub struct FacetDocidsSender<'a, 'b> { + sender: &'a ExtractorBbqueueSender<'b>, +} + +impl FacetDocidsSender<'_, '_> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { let (facet_kind, key) = FacetKind::extract_from_key(key); let database = Database::from(facet_kind); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - 
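The two counters threaded through the sender and receiver replace the contention percentages that the old `ExtractorSender` logged on drop; the same ratio can now be computed from the receiver once indexing finishes. A small sketch of that reporting, using the getters exposed by `WriterBbqueueReceiver`:

```rust
// Sketch only: reporting the BBQueue back-pressure ratio at the end of indexing.
fn log_bbqueue_stats(receiver: &WriterBbqueueReceiver<'_>) {
    let attempts = receiver.sent_messages_attempts();
    let blocking = receiver.blocking_sent_messages_attempts();
    let ratio = if attempts == 0 { 0.0 } else { blocking as f32 / attempts as f32 * 100.0 };
    tracing::debug!("BBQueue stats: {attempts} send attempts, {blocking} blocking ({ratio:.1}%)");
}
```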
} + + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let value_length = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. + FacetKind::Number | FacetKind::String => value_length + 1, + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, + }; + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; + + self.sender.write_key_value_with( + database, + key_length, + value_length, + |key_out, value_out| { + key_out.copy_from_slice(key); + + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. + FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; + + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + Ok(()) + }, + ) + } + + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let (facet_kind, key) = FacetKind::extract_from_key(key); + let database = Database::from(facet_kind); + self.sender.delete_entry(database, key) } } -pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); +#[derive(Clone, Copy)] +pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl FieldIdDocidFacetSender<'_> { - pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { +impl FieldIdDocidFacetSender<'_, '_> { + pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetStrings, key, value) } - pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn write_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[])); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetF64s, key, &[]) } - pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_string(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetStrings, key) } - pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetF64s, key) } } -pub struct DocumentsSender<'a>(&'a ExtractorSender); +#[derive(Clone, Copy)] 
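For number and string facets the serialized value is prefixed with a one-byte facet group size (always `1` at this leaf level), followed by the CBO-encoded roaring bitmap; null, empty and exists facets store the bitmap alone. A small sketch of the resulting value layout; `serialize_facet_value` is a hypothetical helper and plain roaring serialization stands in for `CboRoaringBitmapCodec` to keep it self-contained:

```rust
// Sketch only: the value layout written for Number/String facet entries.
// [ facet group size: 1 byte (always 1 here) | roaring bitmap bytes ]
use roaring::RoaringBitmap;

fn serialize_facet_value(bitmap: &RoaringBitmap, prefix_group_size: bool) -> Vec<u8> {
    let mut out = Vec::new();
    if prefix_group_size {
        out.push(1); // leaf level of the facet tree
    }
    bitmap.serialize_into(&mut out).unwrap();
    out
}

fn main() {
    let bitmap: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let with_prefix = serialize_facet_value(&bitmap, true);
    let without = serialize_facet_value(&bitmap, false);
    assert_eq!(with_prefix.len(), without.len() + 1);
    assert_eq!(with_prefix[0], 1);
}
```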
+pub struct DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl DocumentsSender<'_> { +impl DocumentsSender<'_, '_> { /// TODO do that efficiently pub fn uncompressed( &self, docid: DocumentId, external_id: String, document: &KvReaderFieldId, - ) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( - &docid.to_be_bytes(), - document.as_bytes(), - )); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + ) -> crate::Result<()> { + self.0.write_key_value(Database::Documents, &docid.to_be_bytes(), document.as_bytes())?; + self.0.write_key_value( + Database::ExternalDocumentsIds, external_id.as_bytes(), &docid.to_be_bytes(), - )); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + ) } - pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - self.0.send_delete_vector(docid)?; - - let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { + self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; + self.0.delete_vector(docid)?; + self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } -pub struct EmbeddingSender<'a>(&'a Sender); +#[derive(Clone, Copy)] +pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl EmbeddingSender<'_> { +impl EmbeddingSender<'_, '_> { pub fn set_vectors( &self, docid: DocumentId, embedder_id: u8, embeddings: Vec, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings, - })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vectors(docid, embedder_id, &embeddings[..]) } pub fn set_vector( @@ -452,51 +943,48 @@ impl EmbeddingSender<'_> { docid: DocumentId, embedder_id: u8, embedding: Embedding, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVector { - docid, - embedder_id, - embedding, - })) - .map_err(|_| SendError(())) - } - - /// Marks all embedders as "to be built" - pub fn finish(self, configs: Vec) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vectors(docid, embedder_id, &[embedding]) } } -pub struct GeoSender<'a>(&'a Sender); +#[derive(Clone, Copy)] +pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl GeoSender<'_> { +impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { self.0 - .send(WriterOperation::DbOperation(DbOperation { + .sender + .send(ReceiverAction::LargeEntry(LargeEntry { database: Database::Main, - entry: 
EntryOperation::Write(KeyValueEntry::from_large_key_value( - GEO_RTREE_KEY.as_bytes(), - value, - )), + key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), + value, })) .map_err(|_| SendError(())) } - pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - let mut buffer = Vec::new(); - bitmap.serialize_into(&mut buffer).unwrap(); + pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { + let database = Database::Main; + let value_length = bitmap.serialized_size(); + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - &buffer, - )), - })) - .map_err(|_| SendError(())) + self.0.write_key_value_with( + database, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + bitmap.serialize_into(value_buffer)?; + Ok(()) + }, + ) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b1a2218f2..1ef44fc8d 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -1,13 +1,14 @@ use std::collections::{BTreeMap, BTreeSet}; +use bumparaw_collections::RawMap; use heed::RoTxn; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; -use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError}; /// A view into a document that can represent either the current version from the DB, @@ -55,13 +56,13 @@ where content: &'t KvReaderFieldId, } -impl<'t, Mapper: FieldIdMapper> Clone for DocumentFromDb<'t, Mapper> { +impl Clone for DocumentFromDb<'_, Mapper> { #[inline] fn clone(&self) -> Self { *self } } -impl<'t, Mapper: FieldIdMapper> Copy for DocumentFromDb<'t, Mapper> {} +impl Copy for DocumentFromDb<'_, Mapper> {} impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { fn iter_top_level_fields(&self) -> impl Iterator> { @@ -79,7 +80,7 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { Err(error) => return Some(Err(error.into())), }; - if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" { + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { continue; } @@ -99,7 +100,7 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { } fn geo_field(&self) -> Result> { - self.field("_geo") + self.field(RESERVED_GEO_FIELD_NAME) } fn top_level_fields_count(&self) -> usize { @@ -114,7 +115,7 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> { } fn top_level_field(&self, k: &str) -> Result> { - if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { + if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME { return Ok(None); } self.field(k) @@ -153,7 +154,7 @@ impl<'a, 'doc> DocumentFromVersions<'a, 'doc> { } } -impl<'a, 
'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> { +impl<'doc> Document<'doc> for DocumentFromVersions<'_, 'doc> { fn iter_top_level_fields(&self) -> impl Iterator> { self.versions.iter_top_level_fields().map(Ok) } @@ -366,7 +367,9 @@ where } if let Some(geo_value) = document.geo_field()? { - let fid = fields_ids_map.id_or_insert("_geo").ok_or(UserError::AttributeLimitReached)?; + let fid = fields_ids_map + .id_or_insert(RESERVED_GEO_FIELD_NAME) + .ok_or(UserError::AttributeLimitReached)?; fields_ids_map.id_or_insert("_geo.lat").ok_or(UserError::AttributeLimitReached)?; fields_ids_map.id_or_insert("_geo.lng").ok_or(UserError::AttributeLimitReached)?; unordered_field_buffer.push((fid, geo_value)); @@ -385,12 +388,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue); #[derive(Debug)] pub struct Versions<'doc> { - data: RawMap<'doc>, + data: RawMap<'doc, FxBuildHasher>, } impl<'doc> Versions<'doc> { pub fn multiple( - mut versions: impl Iterator>>, + mut versions: impl Iterator>>, ) -> Result> { let Some(data) = versions.next() else { return Ok(None) }; let mut data = data?; @@ -403,12 +406,14 @@ impl<'doc> Versions<'doc> { Ok(Some(Self::single(data))) } - pub fn single(version: RawMap<'doc>) -> Self { + pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self { Self { data: version } } pub fn iter_top_level_fields(&self) -> impl Iterator + '_ { - self.data.iter().filter(|(k, _)| *k != RESERVED_VECTORS_FIELD_NAME && *k != "_geo") + self.data + .iter() + .filter(|(k, _)| *k != RESERVED_VECTORS_FIELD_NAME && *k != RESERVED_GEO_FIELD_NAME) } pub fn vectors_field(&self) -> Option<&'doc RawValue> { @@ -416,7 +421,7 @@ impl<'doc> Versions<'doc> { } pub fn geo_field(&self) -> Option<&'doc RawValue> { - self.data.get("_geo") + self.data.get(RESERVED_GEO_FIELD_NAME) } pub fn len(&self) -> usize { @@ -428,7 +433,7 @@ impl<'doc> Versions<'doc> { } pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> { - if k == RESERVED_VECTORS_FIELD_NAME || k == "_geo" { + if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME { return None; } self.data.get(k) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 899655db1..38369a4d7 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,10 +1,13 @@ use bumpalo::Bump; use heed::RoTxn; -use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; +use super::document::{ + Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, +}; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; +use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; use crate::{DocumentId, Index, Result}; @@ -24,7 +27,7 @@ pub struct Update<'doc> { docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>, - has_deletion: bool, + from_scratch: bool, } pub struct Insertion<'doc> { @@ -106,9 +109,9 @@ impl<'doc> Update<'doc> { docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>, - has_deletion: bool, + from_scratch: bool, ) -> Self { - Update { docid, new, external_document_id, has_deletion } + Update { docid, new, external_document_id, from_scratch } } pub fn docid(&self) -> DocumentId { @@ -141,7 +144,7 @@ impl<'doc> Update<'doc> { )?) 
} - pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> { + pub fn only_changed_fields(&self) -> DocumentFromVersions<'_, 'doc> { DocumentFromVersions::new(&self.new) } @@ -151,7 +154,7 @@ impl<'doc> Update<'doc> { index: &'t Index, mapper: &'t Mapper, ) -> Result> { - if self.has_deletion { + if self.from_scratch { Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new))) } else { MergedDocument::with_db( @@ -164,7 +167,83 @@ impl<'doc> Update<'doc> { } } - pub fn updated_vectors( + /// Returns whether the updated version of the document is different from the current version for the subset of fields selected by `selector`. + /// + /// `true` if at least one top-level-field that is exactly a selected field or a parent of a selected field changed. + /// Otherwise `false`. + /// + /// - Note: `_geo` and `_vectors` are not taken into account by this function. + pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( + &self, + selector: &mut impl FnMut(&str) -> PatternMatch, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let mut changed = false; + let mut cached_current = None; + let mut updated_selected_field_count = 0; + + for entry in self.only_changed_fields().iter_top_level_fields() { + let (key, updated_value) = entry?; + + if selector(key) == PatternMatch::NoMatch { + continue; + } + + updated_selected_field_count += 1; + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + let current_value = current.top_level_field(key)?; + let Some(current_value) = current_value else { + changed = true; + break; + }; + + if current_value.get() != updated_value.get() { + changed = true; + break; + } + cached_current = Some(current); + } + + if !self.from_scratch { + // no field deletion or update, so fields that don't appear in `updated` cannot have changed + return Ok(changed); + } + + if changed { + return Ok(true); + } + + // we saw all updated fields, and set `changed` if any field wasn't in `current`. + // so if there are as many fields in `current` as in `updated`, then nothing changed. + // If there is any more fields in `current`, then they are missing in `updated`. 
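The counting argument spelled out in the comments above, and completed just after, can be illustrated on plain maps: if every selected field in the update matches the current version, a change can still hide in a *deleted* selected field, which shows up as a count mismatch when the update rewrites the document from scratch. A self-contained sketch of that logic for the from-scratch case, with string maps standing in for document versions:

```rust
use std::collections::BTreeMap;

// Sketch only: the field-counting argument, on plain string maps.
// `selector` decides which top-level fields matter (facets, sortable fields, ...).
fn has_changed_for_fields(
    current: &BTreeMap<&str, &str>,
    updated: &BTreeMap<&str, &str>,
    selector: impl Fn(&str) -> bool,
) -> bool {
    let mut updated_selected = 0;
    for (key, new_value) in updated {
        if !selector(key) {
            continue;
        }
        updated_selected += 1;
        // A selected field that is new or whose value changed means a change.
        match current.get(key) {
            Some(old_value) if old_value == new_value => {}
            _ => return true,
        }
    }
    // All selected updated fields are identical to the current ones: the only
    // remaining possibility is a deleted selected field, i.e. a count mismatch.
    let current_selected = current.keys().filter(|k| selector(**k)).count();
    current_selected != updated_selected
}

fn main() {
    let current = BTreeMap::from([("color", "red"), ("size", "42")]);
    let updated = BTreeMap::from([("color", "red")]); // `size` was removed
    assert!(has_changed_for_fields(&current, &updated, |k| k == "size" || k == "color"));
}
```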
+ let has_deleted_fields = { + let current = match cached_current { + Some(current) => current, + None => self.current(rtxn, index, mapper)?, + }; + + let mut current_selected_field_count = 0; + for entry in current.iter_top_level_fields() { + let (key, _) = entry?; + + if selector(key) == PatternMatch::NoMatch { + continue; + } + current_selected_field_count += 1; + } + + current_selected_field_count != updated_selected_field_count + }; + + Ok(has_deleted_fields) + } + + pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, @@ -180,7 +259,7 @@ impl<'doc> Update<'doc> { doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result>> { - if self.has_deletion { + if self.from_scratch { MergedVectorDocument::without_db( self.external_document_id, &self.new, diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 26ed0eb44..c76ef3999 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -69,12 +69,12 @@ use std::io::BufReader; use std::{io, iter, mem}; use bumpalo::Bump; +use bumparaw_collections::bbbul::{BitPacker, BitPacker4x}; +use bumparaw_collections::map::FrozenMap; +use bumparaw_collections::{Bbbul, FrozenBbbul}; use grenad::ReaderCursor; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use raw_collections::bbbul::{BitPacker, BitPacker4x}; -use raw_collections::map::FrozenMap; -use raw_collections::{Bbbul, FrozenBbbul}; use roaring::RoaringBitmap; use rustc_hash::FxBuildHasher; @@ -121,7 +121,7 @@ impl<'extractor> BalancedCaches<'extractor> { } pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> Result<()> { - if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) { + if self.max_memory.is_some_and(|mm| self.alloc.allocated_bytes() >= mm) { self.start_spilling()?; } @@ -138,7 +138,7 @@ impl<'extractor> BalancedCaches<'extractor> { } pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> Result<()> { - if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) { + if self.max_memory.is_some_and(|mm| self.alloc.allocated_bytes() >= mm) { self.start_spilling()?; } @@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> { Ok(()) } - pub fn freeze(&mut self) -> Result>> { + pub fn freeze(&mut self, source_id: usize) -> Result>> { match &mut self.caches { InnerCaches::Normal(NormalCaches { caches }) => caches .iter_mut() .enumerate() - .map(|(bucket, map)| { + .map(|(bucket_id, map)| { // safety: we are transmuting the Bbbul into a FrozenBbbul // that are the same size. let map = unsafe { @@ -201,14 +201,19 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) + Ok(FrozenCache { + source_id, + bucket_id, + cache: FrozenMap::new(map), + spilled: Vec::new(), + }) }) .collect(), InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches .iter_mut() .zip(mem::take(spilled_entries)) .enumerate() - .map(|(bucket, (map, sorter))| { + .map(|(bucket_id, (map, sorter))| { let spilled = sorter .into_reader_cursors()? 
.into_iter() @@ -234,7 +239,7 @@ impl<'extractor> BalancedCaches<'extractor> { >, >(map) }; - Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) + Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled }) }) .collect(), } @@ -415,21 +420,21 @@ fn spill_entry_to_sorter( match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), @@ -440,7 +445,8 @@ fn spill_entry_to_sorter( } pub struct FrozenCache<'a, 'extractor> { - bucket: usize, + bucket_id: usize, + source_id: usize, cache: FrozenMap< 'a, 'extractor, @@ -457,40 +463,36 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>( let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0); let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect(); - for thread_cache in caches { - for frozen in thread_cache.freeze()? { - bucket_caches[frozen.bucket].push(frozen); + for (thread_index, thread_cache) in caches.iter_mut().enumerate() { + for frozen in thread_cache.freeze(thread_index)? { + bucket_caches[frozen.bucket_id].push(frozen); } } Ok(bucket_caches) } -/// Merges the caches that must be all associated to the same bucket. +/// Merges the caches that must be all associated to the same bucket +/// but make sure to sort the different buckets before performing the merges. /// /// # Panics /// /// - If the bucket IDs in these frozen caches are not exactly the same. -pub fn merge_caches(frozen: Vec, mut f: F) -> Result<()> +pub fn merge_caches_sorted(frozen: Vec, mut f: F) -> Result<()> where F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, { let mut maps = Vec::new(); - let mut readers = Vec::new(); - let mut current_bucket = None; - for FrozenCache { bucket, cache, ref mut spilled } in frozen { - assert_eq!(*current_bucket.get_or_insert(bucket), bucket); - maps.push(cache); - readers.append(spilled); - } - - // First manage the spilled entries by looking into the HashMaps, - // merge them and mark them as dummy. 
let mut heap = BinaryHeap::new(); - for (source_index, source) in readers.into_iter().enumerate() { - let mut cursor = source.into_cursor()?; - if cursor.move_on_next()?.is_some() { - heap.push(Entry { cursor, source_index }); + let mut current_bucket = None; + for FrozenCache { source_id, bucket_id, cache, spilled } in frozen { + assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id); + maps.push((source_id, cache)); + for reader in spilled { + let mut cursor = reader.into_cursor()?; + if cursor.move_on_next()?.is_some() { + heap.push(Entry { cursor, source_id }); + } } } @@ -507,25 +509,29 @@ where let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; while let Some(mut entry) = heap.peek_mut() { - if let Some((key, _value)) = entry.cursor.current() { - if first_key == key { - let new = DelAddRoaringBitmap::from_bytes(first_value)?; - output = output.merge(new); - // When we are done we the current value of this entry move make - // it move forward and let the heap reorganize itself (on drop) - if entry.cursor.move_on_next()?.is_none() { - PeekMut::pop(entry); - } - } else { + if let Some((key, value)) = entry.cursor.current() { + if first_key != key { break; } + + let new = DelAddRoaringBitmap::from_bytes(value)?; + output = output.merge(new); + // When we are done we the current value of this entry move make + // it move forward and let the heap reorganize itself (on drop) + if entry.cursor.move_on_next()?.is_none() { + PeekMut::pop(entry); + } } } // Once we merged all of the spilled bitmaps we must also // fetch the entries from the non-spilled entries (the HashMaps). - for (map_index, map) in maps.iter_mut().enumerate() { - if first_entry.source_index != map_index { + for (source_id, map) in maps.iter_mut() { + debug_assert!( + !(map.get(first_key).is_some() && first_entry.source_id == *source_id), + "A thread should not have spiled a key that has been inserted in the cache" + ); + if first_entry.source_id != *source_id { if let Some(new) = map.get_mut(first_key) { output.union_and_clear_bbbul(new); } @@ -537,22 +543,22 @@ where // Don't forget to put the first entry back into the heap. if first_entry.cursor.move_on_next()?.is_some() { - heap.push(first_entry) + heap.push(first_entry); } } // Then manage the content on the HashMap entries that weren't taken (mem::take). 
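The merge walks all spilled grenad readers through a heap keyed on the current entry, so duplicate keys coming from different threads are folded together in sorted order before the in-memory maps are consulted. A generic, self-contained sketch of the same k-way pattern over plain sorted vectors, with integer counts standing in for the del/add bitmaps:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Sketch only: k-way merge of several sorted (key, value) streams, folding
// together the values of identical keys, as done over the spilled readers above.
fn kway_merge(sources: Vec<Vec<(&str, u32)>>) -> Vec<(String, u32)> {
    let mut iters: Vec<_> = sources.into_iter().map(|s| s.into_iter()).collect();
    let mut heap = BinaryHeap::new();
    for (source_id, iter) in iters.iter_mut().enumerate() {
        if let Some((key, value)) = iter.next() {
            heap.push(Reverse((key, value, source_id)));
        }
    }

    let mut out: Vec<(String, u32)> = Vec::new();
    while let Some(Reverse((key, value, source_id))) = heap.pop() {
        // Fold into the previous entry when the key repeats across sources.
        match out.last_mut() {
            Some((last_key, acc)) if last_key.as_str() == key => *acc += value,
            _ => out.push((key.to_string(), value)),
        }
        if let Some((next_key, next_value)) = iters[source_id].next() {
            heap.push(Reverse((next_key, next_value, source_id)));
        }
    }
    out
}

fn main() {
    let merged = kway_merge(vec![vec![("a", 1), ("c", 3)], vec![("a", 2), ("b", 5)]]);
    assert_eq!(merged, vec![("a".to_string(), 3), ("b".to_string(), 5), ("c".to_string(), 3)]);
}
```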
- while let Some(mut map) = maps.pop() { - for (key, bbbul) in map.iter_mut() { - // Make sure we don't try to work with entries already managed by the spilled - if bbbul.is_empty() { - continue; - } + while let Some((_, mut map)) = maps.pop() { + // Make sure we don't try to work with entries already managed by the spilled + let mut ordered_entries: Vec<_> = + map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect(); + ordered_entries.sort_unstable_by_key(|(key, _)| *key); + for (key, bbbul) in ordered_entries { let mut output = DelAddRoaringBitmap::empty(); output.union_and_clear_bbbul(bbbul); - for rhs in maps.iter_mut() { + for (_, rhs) in maps.iter_mut() { if let Some(new) = rhs.get_mut(key) { output.union_and_clear_bbbul(new); } @@ -568,14 +574,14 @@ where struct Entry { cursor: ReaderCursor, - source_index: usize, + source_id: usize, } impl Ord for Entry { fn cmp(&self, other: &Entry) -> Ordering { let skey = self.cursor.current().map(|(k, _)| k); let okey = other.cursor.current().map(|(k, _)| k); - skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse() + skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse() } } @@ -617,7 +623,7 @@ pub struct FrozenDelAddBbbul<'bump, B> { pub add: Option>, } -impl<'bump, B> FrozenDelAddBbbul<'bump, B> { +impl FrozenDelAddBbbul<'_, B> { fn is_empty(&self) -> bool { self.del.is_none() && self.add.is_none() } @@ -673,9 +679,7 @@ impl DelAddRoaringBitmap { let del = self.del.get_or_insert_with(RoaringBitmap::new); let mut iter = bbbul.iter_and_clear(); while let Some(block) = iter.next_block() { - let iter = block.iter().copied(); - let block = RoaringBitmap::from_sorted_iter(iter).unwrap(); - *del |= block; + del.extend(block); } } @@ -683,9 +687,7 @@ impl DelAddRoaringBitmap { let add = self.add.get_or_insert_with(RoaringBitmap::new); let mut iter = bbbul.iter_and_clear(); while let Some(block) = iter.next_block() { - let iter = block.iter().copied(); - let block = RoaringBitmap::from_sorted_iter(iter).unwrap(); - *add |= block; + add.extend(block); } } } @@ -709,15 +711,17 @@ impl DelAddRoaringBitmap { DelAddRoaringBitmap { del, add } } - pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) { + pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) { let DelAddRoaringBitmap { del, add } = self; if let Some(del) = del { *documents_ids -= del; + *modified_docids |= del; } if let Some(add) = add { *documents_ids |= add; + *modified_docids |= add; } } } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index aeb1d5694..d1c92919b 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -4,6 +4,7 @@ use bumpalo::Bump; use hashbrown::HashMap; use super::DelAddRoaringBitmap; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::DocumentsSender; use crate::update::new::document::{write_to_obkv, Document as _}; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; @@ -12,13 +13,14 @@ use crate::update::new::thread_local::FullySend; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; -pub struct DocumentsExtractor<'a> { - document_sender: &'a DocumentsSender<'a>, + +pub struct DocumentsExtractor<'a, 'b> { + document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs, } -impl<'a> DocumentsExtractor<'a> { - pub fn new(document_sender: &'a 
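`apply_to` now reports which documents were touched in addition to maintaining the documents bitmap, so later stages know exactly what changed. A tiny sketch of the del/add semantics on plain roaring bitmaps, outside the `DelAddRoaringBitmap` wrapper:

```rust
use roaring::RoaringBitmap;

// Sketch only: the del/add semantics of `apply_to`, on plain bitmaps.
fn apply_del_add(
    documents_ids: &mut RoaringBitmap,
    modified_docids: &mut RoaringBitmap,
    del: Option<&RoaringBitmap>,
    add: Option<&RoaringBitmap>,
) {
    if let Some(del) = del {
        *documents_ids -= del; // removed documents leave the index...
        *modified_docids |= del; // ...but still count as modified
    }
    if let Some(add) = add {
        *documents_ids |= add;
        *modified_docids |= add;
    }
}

fn main() {
    let mut documents_ids: RoaringBitmap = (0u32..4).collect();
    let mut modified = RoaringBitmap::new();
    let del: RoaringBitmap = [1u32].into_iter().collect();
    let add: RoaringBitmap = [4u32].into_iter().collect();
    apply_del_add(&mut documents_ids, &mut modified, Some(&del), Some(&add));
    assert_eq!(documents_ids, [0u32, 2, 3, 4].into_iter().collect());
    assert_eq!(modified, [1u32, 4].into_iter().collect());
}
```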
DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { +impl<'a, 'b> DocumentsExtractor<'a, 'b> { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { Self { document_sender, embedders } } } @@ -29,7 +31,7 @@ pub struct DocumentExtractorData { pub field_distribution_delta: HashMap, } -impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { +impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> { type Data = FullySend>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { @@ -61,8 +63,10 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { context.index, &context.db_fields_ids_map, )?; - let geo_iter = - content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = content + .geo_field() + .transpose() + .map(|res| res.map(|rv| (RESERVED_GEO_FIELD_NAME, rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -78,8 +82,10 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { let docid = update.docid(); let content = update.current(&context.rtxn, context.index, &context.db_fields_ids_map)?; - let geo_iter = - content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = content + .geo_field() + .transpose() + .map(|res| res.map(|rv| (RESERVED_GEO_FIELD_NAME, rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -88,9 +94,12 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { .or_default(); *entry -= 1; } - let content = update.updated(); - let geo_iter = - content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let content = + update.merged(&context.rtxn, context.index, &context.db_fields_ids_map)?; + let geo_iter = content + .geo_field() + .transpose() + .map(|res| res.map(|rv| (RESERVED_GEO_FIELD_NAME, rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data @@ -120,8 +129,10 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { DocumentChange::Insertion(insertion) => { let docid = insertion.docid(); let content = insertion.inserted(); - let geo_iter = - content.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + let geo_iter = content + .geo_field() + .transpose() + .map(|res| res.map(|rv| (RESERVED_GEO_FIELD_NAME, rv))); for res in content.iter_top_level_fields().chain(geo_iter) { let (f, _) = res?; let entry = document_extractor_data diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 9ad37d52c..e2f24b26b 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -5,34 +5,39 @@ use std::ops::DerefMut as _; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::HashMap; -use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; +use crate::fields_ids_map::metadata::Metadata; +use crate::filterable_attributes_rules::match_faceted_field; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; use crate::update::new::extract::perm_json_p; use 
crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{DocumentId, FieldId, FilterableAttributesRule, Result, MAX_FACET_VALUE_LENGTH}; -pub struct FacetedExtractorData<'a> { - attributes_to_extract: &'a [&'a str], - sender: &'a FieldIdDocidFacetSender<'a>, - grenad_parameters: GrenadParameters, +pub struct FacetedExtractorData<'a, 'b> { + sender: &'a FieldIdDocidFacetSender<'a, 'b>, + grenad_parameters: &'a GrenadParameters, buckets: usize, + filterable_attributes: &'a [FilterableAttributesRule], + sortable_fields: &'a HashSet, + asc_desc_fields: &'a HashSet, + distinct_field: &'a Option, + is_geo_enabled: bool, } -impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { +impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'_, '_> { type Data = RefCell>; fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { @@ -52,7 +57,11 @@ impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { let change = change?; FacetedDocidsExtractor::extract_document_change( context, - self.attributes_to_extract, + self.filterable_attributes, + self.sortable_fields, + self.asc_desc_fields, + self.distinct_field, + self.is_geo_enabled, change, self.sender, )? @@ -64,13 +73,18 @@ impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { + #[allow(clippy::too_many_arguments)] fn extract_document_change( context: &DocumentChangeContext>, - attributes_to_extract: &[&str], + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, document_change: DocumentChange, sender: &FieldIdDocidFacetSender, ) -> Result<()> { - let index = &context.index; + let index = context.index; let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); @@ -78,11 +92,15 @@ impl FacetedDocidsExtractor { let docid = document_change.docid(); let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -91,18 +109,41 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) }, ), DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + &mut |field_name| { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }, + rtxn, + index, + context.db_fields_ids_map, + )? 
{ + return Ok(()); + } + extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -111,6 +152,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -118,11 +161,15 @@ impl FacetedDocidsExtractor { )?; extract_document_facets( - attributes_to_extract, inner.merged(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -131,6 +178,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -138,11 +187,15 @@ impl FacetedDocidsExtractor { ) } DocumentChange::Insertion(inner) => extract_document_facets( - attributes_to_extract, inner.inserted(), inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -151,6 +204,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -171,9 +226,18 @@ impl FacetedDocidsExtractor { facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind), docid: DocumentId, fid: FieldId, + meta: Metadata, + filterable_attributes: &[FilterableAttributesRule], depth: perm_json_p::Depth, value: &Value, ) -> Result<()> { + // if the field is not faceted, do nothing + if !meta.is_faceted(filterable_attributes) { + return Ok(()); + } + + let features = meta.filterable_attributes_features(filterable_attributes); + let mut buffer = BVec::new_in(doc_alloc); // Exists // key: fid @@ -237,7 +301,9 @@ impl FacetedDocidsExtractor { } // Null // key: fid - Value::Null if depth == perm_json_p::Depth::OnBaseKey => { + Value::Null + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_null() => + { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -245,19 +311,29 @@ impl FacetedDocidsExtractor { } // Empty // key: fid - Value::Array(a) if a.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Array(a) + if a.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::String(_) if depth == perm_json_p::Depth::OnBaseKey => { + Value::String(_) + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::Object(o) if o.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Object(o) + if o.is_empty() + && depth == 
perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -267,49 +343,63 @@ impl FacetedDocidsExtractor { _ => Ok(()), } } - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } } struct DelAddFacetValue<'doc> { - strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + strings: HashMap< + (FieldId, &'doc str), + Option>, + hashbrown::DefaultHashBuilder, + &'doc Bump, + >, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, + doc_alloc: &'doc Bump, } impl<'doc> DelAddFacetValue<'doc> { fn new(doc_alloc: &'doc Bump) -> Self { - Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc) } + Self { strings: HashMap::new_in(doc_alloc), f64s: HashMap::new_in(doc_alloc), doc_alloc } } fn insert_add(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Deletion) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Addition); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Deletion) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Addition); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), Some(value)); + } + } + _ => (), } } fn insert_del(&mut self, fid: FieldId, value: BVec<'doc, u8>, kind: FacetKind) { - let cache = match kind { - FacetKind::String => &mut self.strings, - FacetKind::Number => &mut self.f64s, - _ => return, - }; - - let key = (fid, value); - if let Some(DelAdd::Addition) = cache.get(&key) { - cache.remove(&key); - } else { - cache.insert(key, DelAdd::Deletion); + match kind { + FacetKind::Number => { + let key = (fid, value); + if let Some(DelAdd::Addition) = self.f64s.get(&key) { + self.f64s.remove(&key); + } else { + self.f64s.insert(key, DelAdd::Deletion); + } + } + FacetKind::String => { + if let Ok(s) = std::str::from_utf8(&value) { + let normalized = crate::normalize_facet(s); + let truncated = self.doc_alloc.alloc_str(truncate_str(&normalized)); + self.strings.insert((fid, truncated), None); + } + } + _ => (), } } @@ -318,20 +408,16 @@ impl<'doc> DelAddFacetValue<'doc> { docid: DocumentId, sender: &FieldIdDocidFacetSender, doc_alloc: &Bump, - ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { + ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); - for ((fid, value), deladd) in self.strings { - if let Ok(s) = std::str::from_utf8(&value) { - buffer.clear(); - buffer.extend_from_slice(&fid.to_be_bytes()); - buffer.extend_from_slice(&docid.to_be_bytes()); - let normalized = crate::normalize_facet(s); - let truncated = truncate_str(&normalized); - buffer.extend_from_slice(truncated.as_bytes()); - match deladd { - DelAdd::Deletion => sender.delete_facet_string(&buffer)?, - DelAdd::Addition => sender.write_facet_string(&buffer, &value)?, - } + for ((fid, truncated), value) in self.strings { + buffer.clear(); + buffer.extend_from_slice(&fid.to_be_bytes()); + 
buffer.extend_from_slice(&docid.to_be_bytes()); + buffer.extend_from_slice(truncated.as_bytes()); + match &value { + Some(value) => sender.write_facet_string(&buffer, value)?, + None => sender.delete_facet_string(&buffer)?, } } @@ -364,32 +450,23 @@ fn truncate_str(s: &str) -> &str { impl FacetedDocidsExtractor { #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( - grenad_parameters: GrenadParameters, + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, sender: &FieldIdDocidFacetSender, - step: Step, + step: IndexingStep, ) -> Result>> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { let index = indexing_context.index; let rtxn = index.read_txn()?; - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_extract: Vec<_> = - attributes_to_extract.iter().map(|s| s.as_ref()).collect(); + let filterable_attributes = index.filterable_attributes_rules(&rtxn)?; + let sortable_fields = index.sortable_fields(&rtxn)?; + let asc_desc_fields = index.asc_desc_fields(&rtxn)?; + let distinct_field = index.distinct_field(&rtxn)?.map(|s| s.to_string()); + let is_geo_enabled = index.is_geo_enabled(&rtxn)?; let datastore = ThreadLocal::new(); { @@ -398,10 +475,14 @@ impl FacetedDocidsExtractor { let _entered = span.enter(); let extractor = FacetedExtractorData { - attributes_to_extract: &attributes_to_extract, - grenad_parameters, + grenad_parameters: indexing_context.grenad_parameters, buckets: rayon::current_num_threads(), sender, + filterable_attributes: &filterable_attributes, + sortable_fields: &sortable_fields, + asc_desc_fields: &asc_desc_fields, + distinct_field: &distinct_field, + is_geo_enabled, }; extract( document_changes, diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index eff529120..e74131402 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,45 +1,80 @@ +use std::collections::HashSet; + use serde_json::Value; +use crate::attribute_patterns::PatternMatch; +use crate::fields_ids_map::metadata::Metadata; use crate::update::new::document::Document; use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; -use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; +use crate::{ + FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError, +}; +use crate::filterable_attributes_rules::match_faceted_field; + +#[allow(clippy::too_many_arguments)] pub fn extract_document_facets<'doc>( - attributes_to_extract: &[&str], document: impl Document<'doc>, external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, perm_json_p::Depth, &Value) -> Result<()>, + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, + facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>, ) -> Result<()> { + // 
return the match result for the given field name. + let match_field = |field_name: &str| -> PatternMatch { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }; + + // extract the field if it is faceted (facet searchable, filterable, sortable) + let mut extract_field = |name: &str, depth: perm_json_p::Depth, value: &Value| -> Result<()> { + match field_id_map.id_with_metadata_or_insert(name) { + Some((field_id, meta)) => { + facet_fn(field_id, meta, depth, value)?; + + Ok(()) + } + None => Err(UserError::AttributeLimitReached.into()), + } + }; + for res in document.iter_top_level_fields() { let (field_name, value) = res?; + let selection = match_field(field_name); - let mut tokenize_field = - |name: &str, depth: perm_json_p::Depth, value: &Value| match field_id_map - .id_or_insert(name) - { - Some(field_id) => facet_fn(field_id, depth, value), - None => Err(UserError::AttributeLimitReached.into()), - }; + // extract the field if it matches a pattern and if it is faceted (facet searchable, filterable, sortable) + let mut match_and_extract = |name: &str, depth: perm_json_p::Depth, value: &Value| { + let selection = match_field(name); + if selection == PatternMatch::Match { + extract_field(name, depth, value)?; + } - // if the current field is searchable or contains a searchable attribute - let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); - if selection != perm_json_p::Selection::Skip { + Ok(selection) + }; + + if selection != PatternMatch::NoMatch { // parse json. match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => { perm_json_p::seek_leaf_values_in_object( &object, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Object(object), @@ -49,36 +84,34 @@ pub fn extract_document_facets<'doc>( Value::Array(array) => { perm_json_p::seek_leaf_values_in_array( &array, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Array(array), )?; } } - value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, + value => extract_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, } } } - if attributes_to_extract.contains(&"_geo") { + if is_geo_enabled { if let Some(geo_value) = document.geo_field()? { if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? 
{ - let (lat_fid, lng_fid) = field_id_map - .id_or_insert("_geo.lat") - .zip(field_id_map.id_or_insert("_geo.lng")) + let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map + .id_with_metadata_or_insert("_geo.lat") + .zip(field_id_map.id_with_metadata_or_insert("_geo.lng")) .ok_or(UserError::AttributeLimitReached)?; - facet_fn(lat_fid, perm_json_p::Depth::OnBaseKey, &lat.into())?; - facet_fn(lng_fid, perm_json_p::Depth::OnBaseKey, &lng.into())?; + facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?; + facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?; } } } diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 09d2ce0f8..b2ccc1b2b 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -1,6 +1,6 @@ use std::cell::RefCell; use std::fs::File; -use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; +use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; use std::{iter, mem, result}; use bumpalo::Bump; @@ -28,9 +28,7 @@ impl GeoExtractor { index: &Index, grenad_parameters: GrenadParameters, ) -> Result> { - let is_sortable = index.sortable_fields(rtxn)?.contains("_geo"); - let is_filterable = index.filterable_fields(rtxn)?.contains("_geo"); - if is_sortable || is_filterable { + if index.is_geo_enabled(rtxn)? { Ok(Some(GeoExtractor { grenad_parameters })) } else { Ok(None) @@ -94,33 +92,37 @@ pub struct FrozenGeoExtractorData<'extractor> { pub spilled_inserted: Option>, } -impl<'extractor> FrozenGeoExtractorData<'extractor> { +impl FrozenGeoExtractorData<'_> { pub fn iter_and_clear_removed( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.removed) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.removed) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)?)) } pub fn iter_and_clear_inserted( &mut self, - ) -> impl IntoIterator> + '_ { - mem::take(&mut self.inserted) + ) -> io::Result> + '_> { + Ok(mem::take(&mut self.inserted) .iter() .copied() .map(Ok) - .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)) + .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)?)) } } fn iterator_over_spilled_geopoints( spilled: &mut Option>, -) -> impl IntoIterator> + '_ { +) -> io::Result> + '_> { let mut spilled = spilled.take(); - iter::from_fn(move || match &mut spilled { + if let Some(spilled) = &mut spilled { + spilled.rewind()?; + } + + Ok(iter::from_fn(move || match &mut spilled { Some(file) => { let geopoint_bytes = &mut [0u8; mem::size_of::()]; match file.read_exact(geopoint_bytes) { @@ -130,7 +132,7 @@ fn iterator_over_spilled_geopoints( } } None => None, - }) + })) } impl<'extractor> Extractor<'extractor> for GeoExtractor { @@ -157,7 +159,9 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { let mut data_ref = context.data.borrow_mut_or_yield(); for change in changes { - if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) { + if data_ref.spilled_removed.is_none() + && max_memory.is_some_and(|mm| context.extractor_alloc.allocated_bytes() >= mm) + { // We must spill as we allocated too much memory data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?; data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?; @@ -192,7 +196,7 @@ impl<'extractor> 
Extractor<'extractor> for GeoExtractor { .transpose()?; let updated_geo = update - .updated() + .merged(rtxn, index, db_fields_ids_map)? .geo_field()? .map(|geo| extract_geo_coordinates(external_id, geo)) .transpose()?; @@ -254,9 +258,11 @@ pub fn extract_geo_coordinates( Value::Null => return Ok(None), Value::Object(map) => map, value => { - return Err( - GeoError::NotAnObject { document_id: Value::from(external_id), value }.into() - ) + return Err(Box::new(GeoError::NotAnObject { + document_id: Value::from(external_id), + value, + }) + .into()) } }; @@ -265,23 +271,29 @@ pub fn extract_geo_coordinates( if geo.is_empty() { [lat, lng] } else { - return Err(GeoError::UnexpectedExtraFields { + return Err(Box::new(GeoError::UnexpectedExtraFields { document_id: Value::from(external_id), value: Value::from(geo), - } + }) .into()); } } (Some(_), None) => { - return Err(GeoError::MissingLongitude { document_id: Value::from(external_id) }.into()) + return Err(Box::new(GeoError::MissingLongitude { + document_id: Value::from(external_id), + }) + .into()) } (None, Some(_)) => { - return Err(GeoError::MissingLatitude { document_id: Value::from(external_id) }.into()) + return Err(Box::new(GeoError::MissingLatitude { + document_id: Value::from(external_id), + }) + .into()) } (None, None) => { - return Err(GeoError::MissingLatitudeAndLongitude { + return Err(Box::new(GeoError::MissingLatitudeAndLongitude { document_id: Value::from(external_id), - } + }) .into()) } }; @@ -289,16 +301,18 @@ pub fn extract_geo_coordinates( match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { (Ok(lat), Ok(lng)) => Ok(Some([lat, lng])), (Ok(_), Err(value)) => { - Err(GeoError::BadLongitude { document_id: Value::from(external_id), value }.into()) + Err(Box::new(GeoError::BadLongitude { document_id: Value::from(external_id), value }) + .into()) } (Err(value), Ok(_)) => { - Err(GeoError::BadLatitude { document_id: Value::from(external_id), value }.into()) + Err(Box::new(GeoError::BadLatitude { document_id: Value::from(external_id), value }) + .into()) } - (Err(lat), Err(lng)) => Err(GeoError::BadLatitudeAndLongitude { + (Err(lat), Err(lng)) => Err(Box::new(GeoError::BadLatitudeAndLongitude { document_id: Value::from(external_id), lat, lng, - } + }) .into()), } } diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index e67f70db1..a8264ba4a 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -5,38 +5,20 @@ mod geo; mod searchable; mod vectors; -use bumpalo::Bump; -pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; +pub use cache::{ + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, +}; pub use documents::*; pub use faceted::*; pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; -use super::steps::Step; -use super::thread_local::{FullySend, ThreadLocal}; -use crate::update::GrenadParameters; -use crate::Result; - -pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( - grenad_parameters: GrenadParameters, - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, - ) -> Result>> - where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + 
Sync; -} - /// TODO move in permissive json pointer pub mod perm_json_p { use serde_json::{Map, Value}; - use crate::Result; + use crate::{attribute_patterns::PatternMatch, Result}; const SPLIT_SYMBOL: char = '.'; /// Returns `true` if the `selector` match the `key`. @@ -69,11 +51,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_object( value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if value.is_empty() { seeker(base_key, base_depth, &Value::Object(Map::with_capacity(0)))?; @@ -86,40 +66,16 @@ pub mod perm_json_p { format!("{}{}{}", base_key, SPLIT_SYMBOL, key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let selection = select_field(&base_key, selectors, skip_selectors); - if selection != Selection::Skip { + let selection = seeker(&base_key, Depth::OnBaseKey, value)?; + if selection != PatternMatch::NoMatch { match value { Value::Object(object) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_object(object, &base_key, Depth::OnBaseKey, seeker) } Value::Array(array) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_array(array, &base_key, Depth::OnBaseKey, seeker) } - value => seeker(&base_key, Depth::OnBaseKey, value), + _ => Ok(()), }?; } } @@ -129,11 +85,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_array( values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if values.is_empty() { seeker(base_key, base_depth, &Value::Array(vec![]))?; @@ -141,61 +95,16 @@ pub mod perm_json_p { for value in values { match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - value => seeker(base_key, Depth::InsideArray, value), + Value::Object(object) => { + seek_leaf_values_in_object(object, base_key, Depth::InsideArray, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, base_key, Depth::InsideArray, seeker) + } + value => seeker(base_key, Depth::InsideArray, value).map(|_| ()), }?; } Ok(()) } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> Selection { - if skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) - }) { - Selection::Skip - } else if let Some(selectors) = selectors { - let mut selection = Selection::Skip; - for selector in selectors { - if contained_in(field_name, selector) { - selection = Selection::Select; - break; - } else if contained_in(selector, field_name) { - selection = Selection::Parent; - } - } - selection - } else { - 
Selection::Select - } - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub enum Selection { - /// The field is a parent of the of a nested field that must be selected - Parent, - /// The field must be selected - Select, - /// The field must be skipped - Skip, - } } diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index 05e2374dc..a085a89ae 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -5,20 +5,19 @@ use std::ops::DerefMut as _; use bumpalo::collections::vec::Vec as BumpVec; use bumpalo::Bump; -use heed::RoTxn; +use super::match_searchable_field; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::update::new::extract::cache::BalancedCaches; use crate::update::new::extract::perm_json_p::contained_in; use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, }; use crate::update::new::ref_cell_ext::RefCellExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; -use crate::update::GrenadParameters; -use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; const MAX_COUNTED_WORDS: usize = 30; @@ -28,11 +27,11 @@ pub struct WordDocidsBalancedCaches<'extractor> { exact_word_docids: BalancedCaches<'extractor>, word_position_docids: BalancedCaches<'extractor>, fid_word_count_docids: BalancedCaches<'extractor>, - fid_word_count: HashMap, + fid_word_count: HashMap, Option)>, current_docid: Option, } -unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {} +unsafe impl MostlySend for WordDocidsBalancedCaches<'_> {} impl<'extractor> WordDocidsBalancedCaches<'extractor> { pub fn new_in(buckets: usize, max_memory: Option, alloc: &'extractor Bump) -> Self { @@ -79,14 +78,14 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { buffer.extend_from_slice(&position.to_be_bytes()); self.word_position_docids.insert_add_u32(&buffer, docid)?; - if self.current_docid.map_or(false, |id| docid != id) { + if self.current_docid.is_some_and(|id| docid != id) { self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count .entry(field_id) - .and_modify(|(_current_count, new_count)| *new_count += 1) - .or_insert((0, 1)); + .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1) + .or_insert((None, Some(1))); self.current_docid = Some(docid); Ok(()) @@ -124,14 +123,14 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { buffer.extend_from_slice(&position.to_be_bytes()); self.word_position_docids.insert_del_u32(&buffer, docid)?; - if self.current_docid.map_or(false, |id| docid != id) { + if self.current_docid.is_some_and(|id| docid != id) { self.flush_fid_word_count(&mut buffer)?; } self.fid_word_count .entry(field_id) - .and_modify(|(current_count, _new_count)| *current_count += 1) - .or_insert((1, 0)); + .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1) + .or_insert((Some(1), None)); self.current_docid = Some(docid); @@ -141,14 +140,18 @@ 
impl<'extractor> WordDocidsBalancedCaches<'extractor> { fn flush_fid_word_count(&mut self, buffer: &mut BumpVec) -> Result<()> { for (fid, (current_count, new_count)) in self.fid_word_count.drain() { if current_count != new_count { - if current_count <= MAX_COUNTED_WORDS { + if let Some(current_count) = + current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(current_count as u8); self.fid_word_count_docids .insert_del_u32(buffer, self.current_docid.unwrap())?; } - if new_count <= MAX_COUNTED_WORDS { + if let Some(new_count) = + new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS) + { buffer.clear(); buffer.extend_from_slice(&fid.to_be_bytes()); buffer.push(new_count as u8); @@ -203,18 +206,19 @@ impl<'extractor> WordDocidsCaches<'extractor> { } pub struct WordDocidsExtractorData<'a> { - tokenizer: &'a DocumentTokenizer<'a>, - grenad_parameters: GrenadParameters, + tokenizer: DocumentTokenizer<'a>, + max_memory_by_thread: Option, buckets: usize, + searchable_attributes: Option>, } -impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { +impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'_> { type Data = RefCell>>; fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in( self.buckets, - self.grenad_parameters.max_memory_by_thread(), + self.max_memory_by_thread, extractor_alloc, )))) } @@ -226,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { ) -> Result<()> { for change in changes { let change = change?; - WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?; + WordDocidsExtractors::extract_document_change( + context, + &self.tokenizer, + self.searchable_attributes.as_deref(), + change, + )?; } Ok(()) } @@ -235,72 +244,51 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { pub struct WordDocidsExtractors; impl WordDocidsExtractors { - pub fn run_extraction< - 'pl, - 'fid, - 'indexer, - 'index, - 'extractor, - DC: DocumentChanges<'pl>, - MSP, - SP, - >( - grenad_parameters: GrenadParameters, + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, + step: IndexingStep, ) -> Result> where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { - let index = indexing_context.index; - let rtxn = index.read_txn()?; - - let stop_words = index.stop_words(&rtxn)?; - let allowed_separators = index.allowed_separators(&rtxn)?; + // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs + let rtxn = indexing_context.index.read_txn()?; + let stop_words = indexing_context.index.stop_words(&rtxn)?; + let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; let allowed_separators: Option> = allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let dictionary = index.dictionary(&rtxn)?; + let dictionary = indexing_context.index.dictionary(&rtxn)?; let dictionary: Option> = dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let builder = tokenizer_builder( + let mut builder = tokenizer_builder( stop_words.as_ref(), allowed_separators.as_deref(), dictionary.as_deref(), ); 
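// Editor's aside (illustrative sketch, not part of the patch): the
// `flush_fid_word_count` hunk above switches the per-field word counts to
// `(Option<_>, Option<_>)` so that only the side actually observed
// (deleted words vs. added words) is flushed, and only when the two counts
// differ. Below is a minimal self-contained version of that bookkeeping,
// using a plain `HashMap` and hypothetical names (`FidWordCount`, `saw_*`,
// `flush`) instead of the bump-allocated `BalancedCaches`:
use std::collections::HashMap;

const MAX_COUNTED_WORDS: usize = 30;

#[derive(Default)]
struct FidWordCount(HashMap<u16, (Option<usize>, Option<usize>)>);

impl FidWordCount {
    /// A word of field `fid` was seen in the previous version of the document.
    fn saw_deleted_word(&mut self, fid: u16) {
        let (del, _add) = self.0.entry(fid).or_insert((None, None));
        *del.get_or_insert(0) += 1;
    }

    /// A word of field `fid` was seen in the new version of the document.
    fn saw_added_word(&mut self, fid: u16) {
        let (_del, add) = self.0.entry(fid).or_insert((None, None));
        *add.get_or_insert(0) += 1;
    }

    /// Yield `(fid, del_count, add_count)` only when the count changed,
    /// keeping a side only if it is small enough to be indexed.
    fn flush(&mut self) -> Vec<(u16, Option<usize>, Option<usize>)> {
        self.0
            .drain()
            .filter(|(_fid, (del, add))| del != add)
            .map(|(fid, (del, add))| {
                (
                    fid,
                    del.filter(|count| *count <= MAX_COUNTED_WORDS),
                    add.filter(|count| *count <= MAX_COUNTED_WORDS),
                )
            })
            .collect()
    }
}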
- let tokenizer = builder.into_tokenizer(); - - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?; + let tokenizer = builder.build(); let localized_attributes_rules = - index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - + indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tokenizer, - attribute_to_extract: attributes_to_extract.as_deref(), - attribute_to_skip: attributes_to_skip.as_slice(), localized_attributes_rules: &localized_attributes_rules, max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, }; - + let extractor_data = WordDocidsExtractorData { + tokenizer: document_tokenizer, + max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(), + buckets: rayon::current_num_threads(), + searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?, + }; let datastore = ThreadLocal::new(); - { let span = tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); let _entered = span.enter(); - - let extractor = WordDocidsExtractorData { - tokenizer: &document_tokenizer, - grenad_parameters, - buckets: rayon::current_num_threads(), - }; - extract( document_changes, - &extractor, + &extractor_data, indexing_context, extractor_allocs, &datastore, @@ -319,6 +307,7 @@ impl WordDocidsExtractors { fn extract_document_change( context: &DocumentChangeContext>>, document_tokenizer: &DocumentTokenizer, + searchable_attributes: Option<&[&str]>, document_change: DocumentChange, ) -> Result<()> { let index = &context.index; @@ -351,6 +340,17 @@ impl WordDocidsExtractors { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + &mut |field_name: &str| { + match_searchable_field(field_name, searchable_attributes) + }, + &context.rtxn, + context.index, + context.db_fields_ids_map, + )? 
{ + return Ok(()); + } + let mut token_fn = |fname: &str, fid, pos, word: &str| { cached_sorter.insert_del_u32( fid, @@ -406,15 +406,4 @@ impl WordDocidsExtractors { let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc); cached_sorter.flush_fid_word_count(&mut buffer) } - - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } - - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(Vec::new()) - } } diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..3b358800f 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -2,30 +2,114 @@ use std::cell::RefCell; use std::collections::VecDeque; use std::rc::Rc; -use heed::RoTxn; +use bumpalo::Bump; -use super::tokenize_document::DocumentTokenizer; -use super::SearchableExtractor; +use super::match_searchable_field; +use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::update::new::document::Document; use crate::update::new::extract::cache::BalancedCaches; -use crate::update::new::indexer::document_changes::DocumentChangeContext; +use crate::update::new::indexer::document_changes::{ + extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, +}; use crate::update::new::ref_cell_ext::RefCellExt as _; +use crate::update::new::steps::IndexingStep; +use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; -use crate::{FieldId, GlobalFieldsIdsMap, Index, Result}; +use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE}; + +pub struct WordPairProximityDocidsExtractorData<'a> { + tokenizer: DocumentTokenizer<'a>, + searchable_attributes: Option>, + max_memory_by_thread: Option, + buckets: usize, +} + +impl<'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'_> { + type Data = RefCell>; + + fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { + Ok(RefCell::new(BalancedCaches::new_in( + self.buckets, + self.max_memory_by_thread, + extractor_alloc, + ))) + } + + fn process<'doc>( + &self, + changes: impl Iterator>>, + context: &DocumentChangeContext, + ) -> Result<()> { + for change in changes { + let change = change?; + WordPairProximityDocidsExtractor::extract_document_change( + context, + &self.tokenizer, + self.searchable_attributes.as_deref(), + change, + )?; + } + Ok(()) + } +} pub struct WordPairProximityDocidsExtractor; -impl SearchableExtractor for WordPairProximityDocidsExtractor { - fn attributes_to_extract<'a>( - rtxn: &'a RoTxn, - index: &'a Index, - ) -> Result>> { - index.user_defined_searchable_fields(rtxn).map_err(Into::into) - } +impl WordPairProximityDocidsExtractor { + pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( + document_changes: &DC, + indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, + extractor_allocs: &'extractor mut ThreadLocal>, + step: IndexingStep, + ) -> Result>> + where + MSP: Fn() -> bool + Sync, + { + // Warning: this is duplicated code from extract_word_docids.rs + let rtxn = indexing_context.index.read_txn()?; + let 
stop_words = indexing_context.index.stop_words(&rtxn)?; + let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; + let allowed_separators: Option> = + allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let dictionary = indexing_context.index.dictionary(&rtxn)?; + let dictionary: Option> = + dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); + let mut builder = tokenizer_builder( + stop_words.as_ref(), + allowed_separators.as_deref(), + dictionary.as_deref(), + ); + let tokenizer = builder.build(); + let localized_attributes_rules = + indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); + let document_tokenizer = DocumentTokenizer { + tokenizer: &tokenizer, + localized_attributes_rules: &localized_attributes_rules, + max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, + }; + let extractor_data = WordPairProximityDocidsExtractorData { + tokenizer: document_tokenizer, + searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?, + max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(), + buckets: rayon::current_num_threads(), + }; + let datastore = ThreadLocal::new(); + { + let span = + tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); + let _entered = span.enter(); + extract( + document_changes, + &extractor_data, + indexing_context, + extractor_allocs, + &datastore, + step, + )?; + } - fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result> { - Ok(Vec::new()) + Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } // This method is reimplemented to count the number of words in the document in each field @@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { fn extract_document_change( context: &DocumentChangeContext>, document_tokenizer: &DocumentTokenizer, + searchable_attributes: Option<&[&str]>, document_change: DocumentChange, ) -> Result<()> { let doc_alloc = &context.doc_alloc; @@ -70,6 +155,17 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { )?; } DocumentChange::Update(inner) => { + if !inner.has_changed_for_fields( + &mut |field_name: &str| { + match_searchable_field(field_name, searchable_attributes) + }, + rtxn, + index, + context.db_fields_ids_map, + )? { + return Ok(()); + } + let document = inner.current(rtxn, index, context.db_fields_ids_map)?; process_document_tokens( document, @@ -174,7 +270,7 @@ fn process_document_tokens<'doc>( // drain the proximity window until the head word is considered close to the word we are inserting. 
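// Editor's aside (illustrative sketch, not part of the patch): the loop below
// keeps a sliding window of `(word, position)` pairs and evicts the head while
// it is no longer close enough to the position being inserted. Here is a
// self-contained version of that eviction rule over a plain `VecDeque`;
// `index_proximity` is a simplified, hypothetical distance, not milli's exact
// definition:
use std::collections::VecDeque;

const MAX_DISTANCE: u32 = 8;

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    // In this simplified distance, a window word placed before the new
    // position counts as slightly closer than one placed after it.
    if lhs <= rhs {
        rhs - lhs
    } else {
        (lhs - rhs) + 1
    }
}

fn drain_far_words(word_positions: &mut VecDeque<(String, u16)>, pos: u16) {
    while word_positions
        .front()
        .is_some_and(|(_word, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
    {
        // The head word is too far from the incoming word: in the real
        // extractor its proximity pairs are recorded before it is dropped.
        word_positions.pop_front();
    }
}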
while word_positions .front() - .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE) + .is_some_and(|(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE) { word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); } diff --git a/crates/milli/src/update/new/extract/searchable/mod.rs b/crates/milli/src/update/new/extract/searchable/mod.rs index 05d2406d9..79a6fae87 100644 --- a/crates/milli/src/update/new/extract/searchable/mod.rs +++ b/crates/milli/src/update/new/extract/searchable/mod.rs @@ -2,155 +2,28 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; -use std::cell::RefCell; -use std::marker::PhantomData; - -use bumpalo::Bump; pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors}; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; -use heed::RoTxn; -use tokenize_document::{tokenizer_builder, DocumentTokenizer}; -use super::cache::BalancedCaches; -use super::DocidsExtractor; -use crate::update::new::indexer::document_changes::{ - extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, -}; -use crate::update::new::steps::Step; -use crate::update::new::thread_local::{FullySend, ThreadLocal}; -use crate::update::new::DocumentChange; -use crate::update::GrenadParameters; -use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::attribute_patterns::{match_field_legacy, PatternMatch}; -pub struct SearchableExtractorData<'a, EX: SearchableExtractor> { - tokenizer: &'a DocumentTokenizer<'a>, - grenad_parameters: GrenadParameters, - buckets: usize, - _ex: PhantomData, -} +pub fn match_searchable_field( + field_name: &str, + searchable_fields: Option<&[&str]>, +) -> PatternMatch { + let Some(searchable_fields) = searchable_fields else { + // If no searchable fields are provided, consider all fields as searchable + return PatternMatch::Match; + }; -impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> - for SearchableExtractorData<'a, EX> -{ - type Data = RefCell>; - - fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { - Ok(RefCell::new(BalancedCaches::new_in( - self.buckets, - self.grenad_parameters.max_memory_by_thread(), - extractor_alloc, - ))) - } - - fn process<'doc>( - &self, - changes: impl Iterator>>, - context: &DocumentChangeContext, - ) -> Result<()> { - for change in changes { - let change = change?; - EX::extract_document_change(context, self.tokenizer, change)?; + let mut selection = PatternMatch::NoMatch; + for pattern in searchable_fields { + match match_field_legacy(pattern, field_name) { + PatternMatch::Match => return PatternMatch::Match, + PatternMatch::Parent => selection = PatternMatch::Parent, + PatternMatch::NoMatch => (), } - Ok(()) - } -} - -pub trait SearchableExtractor: Sized + Sync { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( - grenad_parameters: GrenadParameters, - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, - ) -> Result>> - where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, - { - let rtxn = indexing_context.index.read_txn()?; - let stop_words = indexing_context.index.stop_words(&rtxn)?; - let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?; - let allowed_separators: Option> = - allowed_separators.as_ref().map(|s| 
s.iter().map(String::as_str).collect()); - let dictionary = indexing_context.index.dictionary(&rtxn)?; - let dictionary: Option> = - dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); - let mut builder = tokenizer_builder( - stop_words.as_ref(), - allowed_separators.as_deref(), - dictionary.as_deref(), - ); - let tokenizer = builder.build(); - - let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?; - let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?; - let localized_attributes_rules = - indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); - - let document_tokenizer = DocumentTokenizer { - tokenizer: &tokenizer, - attribute_to_extract: attributes_to_extract.as_deref(), - attribute_to_skip: attributes_to_skip.as_slice(), - localized_attributes_rules: &localized_attributes_rules, - max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE, - }; - - let extractor_data: SearchableExtractorData = SearchableExtractorData { - tokenizer: &document_tokenizer, - grenad_parameters, - buckets: rayon::current_num_threads(), - _ex: PhantomData, - }; - - let datastore = ThreadLocal::new(); - - { - let span = - tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction"); - let _entered = span.enter(); - extract( - document_changes, - &extractor_data, - indexing_context, - extractor_allocs, - &datastore, - step, - )?; - } - - Ok(datastore.into_iter().map(RefCell::into_inner).collect()) } - fn extract_document_change( - context: &DocumentChangeContext>, - document_tokenizer: &DocumentTokenizer, - document_change: DocumentChange, - ) -> Result<()>; - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) - -> Result>>; - - fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; -} - -impl DocidsExtractor for T { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( - grenad_parameters: GrenadParameters, - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: Step, - ) -> Result>> - where - MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, - { - Self::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - extractor_allocs, - step, - ) - } + selection } diff --git a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs index ffdce5b7e..4fa456bb3 100644 --- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs +++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs @@ -3,9 +3,10 @@ use std::collections::HashMap; use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; use serde_json::Value; +use crate::attribute_patterns::PatternMatch; use crate::update::new::document::Document; use crate::update::new::extract::perm_json_p::{ - seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection, + seek_leaf_values_in_array, seek_leaf_values_in_object, Depth, }; use crate::{ FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError, @@ -17,13 +18,11 @@ const MAX_DISTANCE: u32 = 8; pub struct DocumentTokenizer<'a> { pub tokenizer: &'a Tokenizer<'a>, - pub attribute_to_extract: Option<&'a [&'a str]>, - pub attribute_to_skip: &'a [&'a str], pub localized_attributes_rules: &'a [LocalizedAttributesRule], pub 
max_positions_per_attributes: u32, } -impl<'a> DocumentTokenizer<'a> { +impl DocumentTokenizer<'_> { pub fn tokenize_document<'doc>( &self, document: impl Document<'doc>, @@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> { token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>, ) -> Result<()> { let mut field_position = HashMap::new(); + let mut tokenize_field = |field_name: &str, _depth, value: &Value| { + let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else { + return Err(UserError::AttributeLimitReached.into()); + }; + + if meta.is_searchable() { + self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?; + } + + // todo: should be a match on the field_name using `match_field_legacy` function, + // but for legacy reasons we iterate over all the fields to fill the field_id_map. + Ok(PatternMatch::Match) + }; for entry in document.iter_top_level_fields() { let (field_name, value) = entry?; - - let mut tokenize_field = |field_name: &str, _depth, value: &Value| { - let Some(field_id) = field_id_map.id_or_insert(field_name) else { - return Err(UserError::AttributeLimitReached.into()); - }; - - if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) - != Selection::Select - { - return Ok(()); - } - - let position = field_position - .entry(field_id) - .and_modify(|counter| *counter += MAX_DISTANCE) - .or_insert(0); - if *position >= self.max_positions_per_attributes { - return Ok(()); - } - - let text; - let tokens = match value { - Value::Number(n) => { - text = n.to_string(); - self.tokenizer.tokenize(text.as_str()) - } - Value::Bool(b) => { - text = b.to_string(); - self.tokenizer.tokenize(text.as_str()) - } - Value::String(text) => { - let locales = self - .localized_attributes_rules - .iter() - .find(|rule| rule.match_str(field_name)) - .map(|rule| rule.locales()); - self.tokenizer.tokenize_with_allow_list(text.as_str(), locales) - } - _ => return Ok(()), - }; - - // create an iterator of token with their positions. - let tokens = process_tokens(*position, tokens) - .take_while(|(p, _)| *p < self.max_positions_per_attributes); - - for (index, token) in tokens { - // keep a word only if it is not empty and fit in a LMDB key. - let token = token.lemma().trim(); - if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { - *position = index; - if let Ok(position) = (*position).try_into() { - token_fn(field_name, field_id, position, token)?; - } - } - } - - Ok(()) - }; - // parse json. match serde_json::to_value(value).map_err(InternalError::SerdeJson)? 
{ Value::Object(object) => seek_leaf_values_in_object( &object, - None, - &[], field_name, Depth::OnBaseKey, &mut tokenize_field, )?, Value::Array(array) => seek_leaf_values_in_array( &array, - None, - &[], field_name, Depth::OnBaseKey, &mut tokenize_field, )?, - value => tokenize_field(field_name, Depth::OnBaseKey, &value)?, + value => { + tokenize_field(field_name, Depth::OnBaseKey, &value)?; + } + } + } + + Ok(()) + } + + fn tokenize_field( + &self, + field_id: FieldId, + field_name: &str, + value: &Value, + token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>, + field_position: &mut HashMap, + ) -> Result<()> { + let position = field_position + .entry(field_id) + .and_modify(|counter| *counter += MAX_DISTANCE) + .or_insert(0); + if *position >= self.max_positions_per_attributes { + return Ok(()); + } + + let text; + let tokens = match value { + Value::Number(n) => { + text = n.to_string(); + self.tokenizer.tokenize(text.as_str()) + } + Value::Bool(b) => { + text = b.to_string(); + self.tokenizer.tokenize(text.as_str()) + } + Value::String(text) => { + let locales = self + .localized_attributes_rules + .iter() + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) + .map(|rule| rule.locales()); + self.tokenizer.tokenize_with_allow_list(text.as_str(), locales) + } + _ => return Ok(()), + }; + + // create an iterator of token with their positions. + let tokens = process_tokens(*position, tokens) + .take_while(|(p, _)| *p < self.max_positions_per_attributes); + + for (index, token) in tokens { + // keep a word only if it is not empty and fit in a LMDB key. + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + *position = index; + if let Ok(position) = (*position).try_into() { + token_fn(field_name, field_id, position, token)?; + } } } @@ -176,9 +182,10 @@ pub fn tokenizer_builder<'a>( #[cfg(test)] mod test { use bumpalo::Bump; + use bumparaw_collections::RawMap; use charabia::TokenizerBuilder; use meili_snap::snapshot; - use raw_collections::RawMap; + use rustc_hash::FxBuildHasher; use serde_json::json; use serde_json::value::RawValue; @@ -214,15 +221,20 @@ mod test { let mut tb = TokenizerBuilder::default(); let document_tokenizer = DocumentTokenizer { tokenizer: &tb.build(), - attribute_to_extract: None, - attribute_to_skip: &["not-me", "me-nether.nope"], localized_attributes_rules: &[], max_positions_per_attributes: 1000, }; let fields_ids_map = FieldIdMapWithMetadata::new( fields_ids_map, - MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None), + MetadataBuilder::new( + Default::default(), + Default::default(), + Default::default(), + None, + None, + Default::default(), + ), ); let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map); @@ -234,7 +246,7 @@ mod test { let bump = Bump::new(); let document: &RawValue = serde_json::from_str(&document).unwrap(); - let document = RawMap::from_raw_value(document, &bump).unwrap(); + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap(); let document = Versions::single(document); let document = DocumentFromVersions::new(&document); @@ -264,6 +276,10 @@ mod test { 2, 16, ]: "catto", + [ + 3, + 0, + ]: "unsearchable", [ 5, 0, @@ -276,6 +292,10 @@ mod test { 8, 0, ]: "23", + [ + 9, + 0, + ]: "unsearchable", } "###); } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 8ac73a8d7..adc022aed 100644 --- 
a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -18,17 +18,17 @@ use crate::vector::error::{ use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; -pub struct EmbeddingExtractor<'a> { +pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a> EmbeddingExtractor<'a> { +impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>( unsafe impl MostlySend for EmbeddingExtractorData<'_> {} -impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { +impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -99,7 +99,8 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.db_fields_ids_map, &context.doc_alloc, )?; - let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?; + let new_vectors = + update.only_changed_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { unused_vectors_distribution.append(new_vectors)?; @@ -130,6 +131,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { ); } else if new_vectors.regenerate { let new_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -139,6 +141,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let old_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -158,6 +161,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } else if old_vectors.regenerate { let old_rendered = prompt.render_document( + update.external_document_id(), update.current( &context.rtxn, context.index, @@ -167,6 +171,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { &context.doc_alloc, )?; let new_rendered = prompt.render_document( + update.external_document_id(), update.merged( &context.rtxn, context.index, @@ -216,6 +221,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { ); } else if new_vectors.regenerate { let rendered = prompt.render_document( + insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, @@ -229,6 +235,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { } } else { let rendered = prompt.render_document( + insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, &context.doc_alloc, @@ -259,7 +266,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { // Currently this is the case as: // 1. BVec are inside of the bumaplo // 2. All other fields are either trivial (u8) or references. 
-struct Chunks<'a, 'extractor> { +struct Chunks<'a, 'b, 'extractor> { texts: BVec<'a, &'a str>, ids: BVec<'a, DocumentId>, @@ -270,11 +277,11 @@ struct Chunks<'a, 'extractor> { possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, } -impl<'a, 'extractor> Chunks<'a, 'extractor> { +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, @@ -284,7 +291,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); @@ -368,7 +375,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: &EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { @@ -409,7 +416,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); } - let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) { + let res = match embedder.embed_index_ref(texts.as_slice(), threads) { Ok(embeddings) => { for (docid, embedding) in ids.into_iter().zip(embeddings) { sender.set_vector(*docid, embedder_id, embedding).unwrap(); diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index 1993c1d00..6e9ffa1ed 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -9,12 +9,14 @@ use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; +use crate::attribute_patterns::PatternMatch; use crate::heed_codec::facet::FacetGroupKey; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::{create_sorter, MergeDeladdBtreesetString}; use crate::{ - BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result, - MAX_FACET_VALUE_LENGTH, + BEU16StrCodec, FieldId, FieldIdMapMissingEntry, FilterableAttributesFeatures, + FilterableAttributesRule, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, + Result, MAX_FACET_VALUE_LENGTH, }; pub struct FacetSearchBuilder<'indexer> { @@ -22,6 +24,7 @@ pub struct FacetSearchBuilder<'indexer> { normalized_facet_string_docids_sorter: Sorter, global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, // Buffered data below buffer: Vec, localized_field_ids: HashMap>>, @@ -31,6 +34,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { pub fn new( global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, ) -> Self { let registered_facets = HashMap::new(); let normalized_facet_string_docids_sorter = create_sorter( @@ -49,6 +53,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { buffer: Vec::new(), global_fields_ids_map, localized_attributes_rules, + 
filterable_attributes_rules, localized_field_ids: HashMap::new(), } } @@ -60,6 +65,13 @@ impl<'indexer> FacetSearchBuilder<'indexer> { ) -> Result<()> { let FacetGroupKey { field_id, level: _level, left_bound } = facet_key; + let filterable_attributes_features = self.filterable_attributes_features(field_id)?; + + // if facet search is disabled, we don't need to register the facet + if !filterable_attributes_features.is_facet_searchable() { + return Ok(()); + }; + if deladd == DelAdd::Addition { self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1); } @@ -83,6 +95,24 @@ impl<'indexer> FacetSearchBuilder<'indexer> { Ok(()) } + fn filterable_attributes_features( + &mut self, + field_id: u16, + ) -> Result { + let Some(filterable_attributes_features) = + self.global_fields_ids_map.metadata(field_id).map(|metadata| { + metadata.filterable_attributes_features(&self.filterable_attributes_rules) + }) + else { + return Err(InternalError::FieldIdMapMissingEntry(FieldIdMapMissingEntry::FieldId { + field_id, + process: "facet_search_builder::register_from_key", + }) + .into()); + }; + Ok(filterable_attributes_features) + } + fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { if let Entry::Vacant(e) = self.localized_field_ids.entry(field_id) { let Some(field_name) = self.global_fields_ids_map.name(field_id) else { @@ -92,7 +122,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let locales = self .localized_attributes_rules .iter() - .find(|rule| rule.match_str(field_name)) + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) .map(|rule| rule.locales.clone()); e.insert(locales); @@ -103,6 +133,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> { + tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets); + let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); builder.extend(reader); @@ -118,12 +150,15 @@ impl<'indexer> FacetSearchBuilder<'indexer> { BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?; if current_field_id != Some(field_id) { - if let Some(fst_merger_builder) = fst_merger_builder { + if let (Some(current_field_id), Some(fst_merger_builder)) = + (current_field_id, fst_merger_builder) + { let mmap = fst_merger_builder.build(&mut callback)?; - index - .facet_id_string_fst - .remap_data_type::() - .put(wtxn, &field_id, &mmap)?; + index.facet_id_string_fst.remap_data_type::().put( + wtxn, + ¤t_field_id, + &mmap, + )?; } fst = index.facet_id_string_fst.get(rtxn, &field_id)?; diff --git a/crates/milli/src/update/new/indexer/de.rs b/crates/milli/src/update/new/indexer/de.rs index c9808360e..d3ecaeb36 100644 --- a/crates/milli/src/update/new/indexer/de.rs +++ b/crates/milli/src/update/new/indexer/de.rs @@ -1,6 +1,8 @@ use std::ops::ControlFlow; use bumpalo::Bump; +use bumparaw_collections::RawVec; +use rustc_hash::FxBuildHasher; use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; use serde_json::value::RawValue; @@ -27,8 +29,8 @@ impl<'p, 'indexer, Mapper: MutFieldIdMapper> FieldAndDocidExtractor<'p, 'indexer } } -impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> - for FieldAndDocidExtractor<'p, 'indexer, Mapper> +impl<'de, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de> + for 
FieldAndDocidExtractor<'_, 'indexer, Mapper> { type Value = Result, DocumentIdExtractionError>, crate::UserError>; @@ -96,7 +98,7 @@ struct NestedPrimaryKeyVisitor<'a, 'bump> { bump: &'bump Bump, } -impl<'de, 'a, 'bump: 'de> Visitor<'de> for NestedPrimaryKeyVisitor<'a, 'bump> { +impl<'de, 'bump: 'de> Visitor<'de> for NestedPrimaryKeyVisitor<'_, 'bump> { type Value = std::result::Result>, DocumentIdExtractionError>; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -235,7 +237,7 @@ impl<'de, 'a, Mapper: MutFieldIdMapper> Visitor<'de> for MutFieldIdMapVisitor<'a pub struct FieldIdMapVisitor<'a, Mapper: FieldIdMapper>(pub &'a Mapper); -impl<'de, 'a, Mapper: FieldIdMapper> Visitor<'de> for FieldIdMapVisitor<'a, Mapper> { +impl<'de, Mapper: FieldIdMapper> Visitor<'de> for FieldIdMapVisitor<'_, Mapper> { type Value = Option; fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { @@ -360,7 +362,7 @@ impl<'a> DeserrRawValue<'a> { } pub struct DeserrRawVec<'a> { - vec: raw_collections::RawVec<'a>, + vec: RawVec<'a>, alloc: &'a Bump, } @@ -379,7 +381,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> { } pub struct DeserrRawVecIter<'a> { - it: raw_collections::vec::iter::IntoIter<'a>, + it: bumparaw_collections::vec::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -393,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { } pub struct DeserrRawMap<'a> { - map: raw_collections::RawMap<'a>, + map: bumparaw_collections::RawMap<'a, FxBuildHasher>, alloc: &'a Bump, } @@ -416,7 +418,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> { } pub struct DeserrRawMapIter<'a> { - it: raw_collections::map::iter::IntoIter<'a>, + it: bumparaw_collections::map::iter::IntoIter<'a>, alloc: &'a Bump, } @@ -615,7 +617,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { where A: serde::de::SeqAccess<'de>, { - let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); + let mut raw_vec = RawVec::new_in(self.alloc); while let Some(next) = seq.next_element()? { raw_vec.push(next); } diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index bfb369680..5302c9d05 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -1,15 +1,18 @@ use std::cell::{Cell, RefCell}; +use std::sync::atomic::Ordering; use std::sync::{Arc, RwLock}; use bumpalo::Bump; -use heed::RoTxn; +use heed::{RoTxn, WithoutTls}; use rayon::iter::IndexedParallelIterator; use super::super::document_change::DocumentChange; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; +use crate::progress::{AtomicDocumentStep, Progress}; use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; +use crate::update::GrenadParameters; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; pub struct DocumentChangeContext< @@ -25,7 +28,7 @@ pub struct DocumentChangeContext< /// inside of the DB. pub db_fields_ids_map: &'indexer FieldsIdsMap, /// A transaction providing data from the DB before all indexing operations - pub rtxn: RoTxn<'indexer>, + pub rtxn: RoTxn<'indexer, WithoutTls>, /// Global field id map that is up to date with the current state of the indexing process. 
/// @@ -70,7 +73,7 @@ impl< F: FnOnce(&'extractor Bump) -> Result, { let doc_alloc = - doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); + doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); let doc_alloc = doc_alloc.0.take(); let fields_ids_map = fields_ids_map_store .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); @@ -133,10 +136,8 @@ pub struct IndexingContext< 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation 'index, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { pub index: &'index Index, pub db_fields_ids_map: &'indexer FieldsIdsMap, @@ -144,46 +145,31 @@ pub struct IndexingContext< pub doc_allocs: &'indexer ThreadLocal>>, pub fields_ids_map_store: &'indexer ThreadLocal>>>, pub must_stop_processing: &'indexer MSP, - pub send_progress: &'indexer SP, + pub progress: &'indexer Progress, + pub grenad_parameters: &'indexer GrenadParameters, } -impl< - 'fid, // invariant lifetime of fields ids map - 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation - 'index, // covariant lifetime of the index - MSP, - SP, - > Copy +impl Copy for IndexingContext< - 'fid, // invariant lifetime of fields ids map - 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation - 'index, // covariant lifetime of the index + '_, // invariant lifetime of fields ids map + '_, // covariant lifetime of objects that are borrowed during the entire indexing operation + '_, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { } -impl< - 'fid, // invariant lifetime of fields ids map - 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation - 'index, // covariant lifetime of the index - MSP, - SP, - > Clone +impl Clone for IndexingContext< - 'fid, // invariant lifetime of fields ids map - 'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation - 'index, // covariant lifetime of the index + '_, // invariant lifetime of fields ids map + '_, // covariant lifetime of objects that are borrowed during the entire indexing operation + '_, // covariant lifetime of the index MSP, - SP, > where MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { fn clone(&self) -> Self { *self @@ -202,7 +188,6 @@ pub fn extract< EX, DC: DocumentChanges<'pl>, MSP, - SP, >( document_changes: &DC, extractor: &EX, @@ -213,18 +198,19 @@ pub fn extract< doc_allocs, fields_ids_map_store, must_stop_processing, - send_progress, - }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, + progress, + grenad_parameters: _, + }: IndexingContext<'fid, 'indexer, 'index, MSP>, extractor_allocs: &'extractor mut ThreadLocal>, datastore: &'data ThreadLocal, - step: Step, + step: IndexingStep, ) -> Result<()> where EX: Extractor<'extractor>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { tracing::trace!("We are resetting the extractor allocators"); + progress.update_progress(step); // Clean up and reuse the extractor allocs for extractor_alloc in extractor_allocs.iter_mut() { tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); @@ -232,9 +218,11 @@ where } let total_documents = document_changes.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + progress.update_progress(progress_step); 
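Together with the loop body further down in this file, the removed `send_progress` callback is replaced by a shared atomic counter registered on the `Progress` tree. A condensed recap of the calls involved (all of them appear in this diff):

    let (step, progress_step) = AtomicDocumentStep::new(total_documents);
    progress.update_progress(progress_step);
    // inside the parallel loop, once per processed chunk:
    step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
    // after the iterator is exhausted:
    step.store(total_documents, Ordering::Relaxed);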
let pi = document_changes.iter(CHUNK_SIZE); - pi.enumerate().try_arc_for_each_try_init( + pi.try_arc_for_each_try_init( || { DocumentChangeContext::new( index, @@ -247,13 +235,10 @@ where move |index_alloc| extractor.init_data(index_alloc), ) }, - |context, (finished_documents, items)| { + |context, items| { if (must_stop_processing)() { return Err(Arc::new(InternalError::AbortedIndexation.into())); } - let finished_documents = (finished_documents * CHUNK_SIZE) as u32; - - (send_progress)(Progress::from_step_substep(step, finished_documents, total_documents)); // Clean up and reuse the document-specific allocator context.doc_alloc.reset(); @@ -264,6 +249,7 @@ where }); let res = extractor.process(changes, context).map_err(Arc::new); + step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed); // send back the doc_alloc in the pool context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc)); @@ -271,32 +257,7 @@ where res }, )?; - - (send_progress)(Progress::from_step_substep(step, total_documents, total_documents)); + step.store(total_documents, Ordering::Relaxed); Ok(()) } - -pub struct Progress { - pub finished_steps: u16, - pub total_steps: u16, - pub step_name: &'static str, - pub finished_total_substep: Option<(u32, u32)>, -} - -impl Progress { - pub fn from_step(step: Step) -> Self { - Self { - finished_steps: step.finished_steps(), - total_steps: Step::total_steps(), - step_name: step.name(), - finished_total_substep: None, - } - } - pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self { - Self { - finished_total_substep: Some((finished_substep, total_substep)), - ..Progress::from_step(step) - } - } -} diff --git a/crates/milli/src/update/new/indexer/document_deletion.rs b/crates/milli/src/update/new/indexer/document_deletion.rs index 518786e6f..c4a72a2a1 100644 --- a/crates/milli/src/update/new/indexer/document_deletion.rs +++ b/crates/milli/src/update/new/indexer/document_deletion.rs @@ -92,11 +92,12 @@ mod test { use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::tests::TempIndex; + use crate::progress::Progress; use crate::update::new::indexer::document_changes::{ extract, DocumentChangeContext, Extractor, IndexingContext, }; use crate::update::new::indexer::DocumentDeletion; - use crate::update::new::steps::Step; + use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{MostlySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::DocumentId; @@ -109,7 +110,7 @@ mod test { >, } - unsafe impl<'extractor> MostlySend for DeletionWithData<'extractor> {} + unsafe impl MostlySend for DeletionWithData<'_> {} struct TrackDeletion<'extractor>(PhantomData<&'extractor ()>); @@ -164,7 +165,8 @@ mod test { doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing: &(|| false), - send_progress: &(|_progress| {}), + progress: &Progress::default(), + grenad_parameters: &Default::default(), }; for _ in 0..3 { @@ -176,7 +178,7 @@ mod test { context, &mut extractor_allocs, &datastore, - Step::ExtractingDocuments, + IndexingStep::ExtractingDocuments, ) .unwrap(); diff --git a/crates/milli/src/update/new/indexer/document_operation.rs b/crates/milli/src/update/new/indexer/document_operation.rs index 2a381d5d1..ca433c043 100644 --- a/crates/milli/src/update/new/indexer/document_operation.rs +++ b/crates/milli/src/update/new/indexer/document_operation.rs @@ -1,50 +1,68 @@ +use std::sync::atomic::Ordering; + use 
bumpalo::collections::CollectIn; use bumpalo::Bump; +use bumparaw_collections::RawMap; use hashbrown::hash_map::Entry; use heed::RoTxn; use memmap2::Mmap; -use raw_collections::RawMap; use rayon::slice::ParallelSlice; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use serde_json::Deserializer; use super::super::document_change::DocumentChange; -use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress}; -use super::retrieve_or_guess_primary_key; +use super::document_changes::{DocumentChangeContext, DocumentChanges}; +use super::guess_primary_key::retrieve_or_guess_primary_key; use crate::documents::PrimaryKey; +use crate::progress::{AtomicPayloadStep, Progress}; use crate::update::new::document::Versions; -use crate::update::new::steps::Step; +use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::MostlySend; use crate::update::new::{Deletion, Insertion, Update}; use crate::update::{AvailableIds, IndexDocumentsMethod}; use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError}; +#[derive(Default)] pub struct DocumentOperation<'pl> { operations: Vec>, - method: MergeMethod, } impl<'pl> DocumentOperation<'pl> { - pub fn new(method: IndexDocumentsMethod) -> Self { - Self { operations: Default::default(), method: MergeMethod::from(method) } + pub fn new() -> Self { + Self { operations: Default::default() } } - /// TODO please give me a type + /// Append a replacement of documents. + /// /// The payload is expected to be in the NDJSON format - pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result<()> { + pub fn replace_documents(&mut self, payload: &'pl Mmap) -> Result<()> { #[cfg(unix)] payload.advise(memmap2::Advice::Sequential)?; - self.operations.push(Payload::Addition(&payload[..])); + self.operations.push(Payload::Replace(&payload[..])); Ok(()) } + /// Append an update of documents. + /// + /// The payload is expected to be in the NDJSON format + pub fn update_documents(&mut self, payload: &'pl Mmap) -> Result<()> { + #[cfg(unix)] + payload.advise(memmap2::Advice::Sequential)?; + self.operations.push(Payload::Update(&payload[..])); + Ok(()) + } + + /// Append a deletion of documents IDs. + /// + /// The list is a set of external documents IDs. 
pub fn delete_documents(&mut self, to_delete: &'pl [&'pl str]) { self.operations.push(Payload::Deletion(to_delete)) } #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")] - pub fn into_changes( + pub fn into_changes( self, indexer: &'pl Bump, index: &Index, @@ -52,13 +70,13 @@ impl<'pl> DocumentOperation<'pl> { primary_key_from_op: Option<&'pl str>, new_fields_ids_map: &mut FieldsIdsMap, must_stop_processing: &MSP, - send_progress: &SP, + progress: Progress, ) -> Result<(DocumentOperationChanges<'pl>, Vec, Option>)> where MSP: Fn() -> bool, - SP: Fn(Progress), { - let Self { operations, method } = self; + progress.update_progress(IndexingStep::PreparingPayloads); + let Self { operations } = self; let documents_ids = index.documents_ids(rtxn)?; let mut operations_stats = Vec::new(); @@ -67,20 +85,18 @@ impl<'pl> DocumentOperation<'pl> { let mut primary_key = None; let payload_count = operations.len(); + let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32); + progress.update_progress(progress_step); for (payload_index, operation) in operations.into_iter().enumerate() { if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_index as u32, - payload_count as u32, - )); + step.store(payload_index as u32, Ordering::Relaxed); let mut bytes = 0; let result = match operation { - Payload::Addition(payload) => extract_addition_payload_changes( + Payload::Replace(payload) => extract_addition_payload_changes( indexer, index, rtxn, @@ -90,7 +106,20 @@ impl<'pl> DocumentOperation<'pl> { &mut available_docids, &mut bytes, &docids_version_offsets, - method, + IndexDocumentsMethod::ReplaceDocuments, + payload, + ), + Payload::Update(payload) => extract_addition_payload_changes( + indexer, + index, + rtxn, + primary_key_from_op, + &mut primary_key, + new_fields_ids_map, + &mut available_docids, + &mut bytes, + &docids_version_offsets, + IndexDocumentsMethod::UpdateDocuments, payload, ), Payload::Deletion(to_delete) => extract_deletion_payload_changes( @@ -98,7 +127,6 @@ impl<'pl> DocumentOperation<'pl> { rtxn, &mut available_docids, &docids_version_offsets, - method, to_delete, ), }; @@ -117,32 +145,22 @@ impl<'pl> DocumentOperation<'pl> { }; operations_stats.push(PayloadStats { document_count, bytes, error }); } - - send_progress(Progress::from_step_substep( - Step::PreparingPayloads, - payload_count as u32, - payload_count as u32, - )); + step.store(payload_count as u32, Ordering::Relaxed); // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = docids_version_offsets.drain().collect_in(indexer); // Reorder the offsets to make sure we iterate on the file sequentially - // And finally sort them - docids_version_offsets.sort_unstable_by_key(|(_, po)| method.sort_key(&po.operations)); + // And finally sort them. This clearly speeds up reading the update files. 
+ docids_version_offsets + .sort_unstable_by_key(|(_, po)| first_update_pointer(&po.operations).unwrap_or(0)); let docids_version_offsets = docids_version_offsets.into_bump_slice(); Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key)) } } -impl Default for DocumentOperation<'_> { - fn default() -> Self { - DocumentOperation::new(IndexDocumentsMethod::default()) - } -} - #[allow(clippy::too_many_arguments)] fn extract_addition_payload_changes<'r, 'pl: 'r>( indexer: &'pl Bump, @@ -154,9 +172,11 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( available_docids: &mut AvailableIds, bytes: &mut u64, main_docids_version_offsets: &hashbrown::HashMap<&'pl str, PayloadOperations<'pl>>, - method: MergeMethod, + method: IndexDocumentsMethod, payload: &'pl [u8], ) -> Result>> { + use IndexDocumentsMethod::{ReplaceDocuments, UpdateDocuments}; + let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new(); let mut previous_offset = 0; @@ -166,8 +186,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( // Only guess the primary key if it is the first document let retrieved_primary_key = if previous_offset == 0 { - let doc = - RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; + let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer) + .map(Some) + .map_err(UserError::SerdeJson)?; let result = retrieve_or_guess_primary_key( rtxn, @@ -189,14 +210,8 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( primary_key.as_ref().unwrap() }; - let external_id = match retrieved_primary_key.extract_fields_and_docid( - doc, - new_fields_ids_map, - indexer, - ) { - Ok(edi) => edi, - Err(e) => return Err(e), - }; + let external_id = + retrieved_primary_key.extract_fields_and_docid(doc, new_fields_ids_map, indexer)?; let external_id = external_id.to_de(); let current_offset = iter.byte_offset(); @@ -206,54 +221,106 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( None => { match index.external_documents_ids().get(rtxn, external_id) { Ok(Some(docid)) => match new_docids_version_offsets.entry(external_id) { - Entry::Occupied(mut entry) => { - entry.get_mut().push_addition(document_offset) - } + Entry::Occupied(mut entry) => match method { + ReplaceDocuments => entry.get_mut().push_replacement(document_offset), + UpdateDocuments => entry.get_mut().push_update(document_offset), + }, Entry::Vacant(entry) => { - entry.insert(PayloadOperations::new_addition( - method, - docid, - false, // is new - document_offset, - )); + match method { + ReplaceDocuments => { + entry.insert(PayloadOperations::new_replacement( + docid, + false, // is new + document_offset, + )); + } + UpdateDocuments => { + entry.insert(PayloadOperations::new_update( + docid, + false, // is new + document_offset, + )); + } + } } }, Ok(None) => match new_docids_version_offsets.entry(external_id) { - Entry::Occupied(mut entry) => { - entry.get_mut().push_addition(document_offset) - } + Entry::Occupied(mut entry) => match method { + ReplaceDocuments => entry.get_mut().push_replacement(document_offset), + UpdateDocuments => entry.get_mut().push_update(document_offset), + }, Entry::Vacant(entry) => { let docid = match available_docids.next() { Some(docid) => docid, None => return Err(UserError::DocumentLimitReached.into()), }; - entry.insert(PayloadOperations::new_addition( - method, - docid, - true, // is new - document_offset, - )); + + match method { + ReplaceDocuments => { + entry.insert(PayloadOperations::new_replacement( + docid, + true, 
// is new + document_offset, + )); + } + UpdateDocuments => { + entry.insert(PayloadOperations::new_update( + docid, + true, // is new + document_offset, + )); + } + } } }, Err(e) => return Err(e.into()), } } Some(payload_operations) => match new_docids_version_offsets.entry(external_id) { - Entry::Occupied(mut entry) => entry.get_mut().push_addition(document_offset), - Entry::Vacant(entry) => { - entry.insert(PayloadOperations::new_addition( - method, - payload_operations.docid, - payload_operations.is_new, - document_offset, - )); - } + Entry::Occupied(mut entry) => match method { + ReplaceDocuments => entry.get_mut().push_replacement(document_offset), + UpdateDocuments => entry.get_mut().push_update(document_offset), + }, + Entry::Vacant(entry) => match method { + ReplaceDocuments => { + entry.insert(PayloadOperations::new_replacement( + payload_operations.docid, + payload_operations.is_new, + document_offset, + )); + } + UpdateDocuments => { + entry.insert(PayloadOperations::new_update( + payload_operations.docid, + payload_operations.is_new, + document_offset, + )); + } + }, }, } previous_offset = iter.byte_offset(); } + if payload.is_empty() { + let result = retrieve_or_guess_primary_key( + rtxn, + index, + new_fields_ids_map, + primary_key_from_op, + None, + ); + match result { + Ok(Ok((pk, _))) => { + primary_key.get_or_insert(pk); + } + Ok(Err(UserError::NoPrimaryKeyCandidateFound)) => (), + Ok(Err(user_error)) => return Err(Error::UserError(user_error)), + Err(error) => return Err(error), + }; + } + Ok(new_docids_version_offsets) } @@ -262,7 +329,6 @@ fn extract_deletion_payload_changes<'s, 'pl: 's>( rtxn: &RoTxn, available_docids: &mut AvailableIds, main_docids_version_offsets: &hashbrown::HashMap<&'s str, PayloadOperations<'pl>>, - method: MergeMethod, to_delete: &'pl [&'pl str], ) -> Result>> { let mut new_docids_version_offsets = hashbrown::HashMap::<&str, PayloadOperations<'pl>>::new(); @@ -276,7 +342,7 @@ fn extract_deletion_payload_changes<'s, 'pl: 's>( Entry::Occupied(mut entry) => entry.get_mut().push_deletion(), Entry::Vacant(entry) => { entry.insert(PayloadOperations::new_deletion( - method, docid, false, // is new + docid, false, // is new )); } } @@ -290,7 +356,7 @@ fn extract_deletion_payload_changes<'s, 'pl: 's>( Entry::Occupied(mut entry) => entry.get_mut().push_deletion(), Entry::Vacant(entry) => { entry.insert(PayloadOperations::new_deletion( - method, docid, true, // is new + docid, true, // is new )); } } @@ -302,7 +368,6 @@ fn extract_deletion_payload_changes<'s, 'pl: 's>( Entry::Occupied(mut entry) => entry.get_mut().push_deletion(), Entry::Vacant(entry) => { entry.insert(PayloadOperations::new_deletion( - method, payload_operations.docid, payload_operations.is_new, )); @@ -353,13 +418,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> { 'pl: 'doc, { let (external_doc, payload_operations) = item; - payload_operations.merge_method.merge( - payload_operations.docid, - external_doc, - payload_operations.is_new, - &context.doc_alloc, - &payload_operations.operations[..], - ) + payload_operations.merge(external_doc, &context.doc_alloc) } fn len(&self) -> usize { @@ -372,7 +431,8 @@ pub struct DocumentOperationChanges<'pl> { } pub enum Payload<'pl> { - Addition(&'pl [u8]), + Replace(&'pl [u8]), + Update(&'pl [u8]), Deletion(&'pl [&'pl str]), } @@ -389,31 +449,30 @@ pub struct PayloadOperations<'pl> { pub is_new: bool, /// The operations to perform, in order, on this document. 
pub operations: Vec>, - /// The merge method we are using to merge payloads and documents. - merge_method: MergeMethod, } impl<'pl> PayloadOperations<'pl> { - fn new_deletion(merge_method: MergeMethod, docid: DocumentId, is_new: bool) -> Self { - Self { docid, is_new, operations: vec![InnerDocOp::Deletion], merge_method } + fn new_replacement(docid: DocumentId, is_new: bool, offset: DocumentOffset<'pl>) -> Self { + Self { docid, is_new, operations: vec![InnerDocOp::Replace(offset)] } } - fn new_addition( - merge_method: MergeMethod, - docid: DocumentId, - is_new: bool, - offset: DocumentOffset<'pl>, - ) -> Self { - Self { docid, is_new, operations: vec![InnerDocOp::Addition(offset)], merge_method } + fn new_update(docid: DocumentId, is_new: bool, offset: DocumentOffset<'pl>) -> Self { + Self { docid, is_new, operations: vec![InnerDocOp::Update(offset)] } + } + + fn new_deletion(docid: DocumentId, is_new: bool) -> Self { + Self { docid, is_new, operations: vec![InnerDocOp::Deletion] } } } impl<'pl> PayloadOperations<'pl> { - fn push_addition(&mut self, offset: DocumentOffset<'pl>) { - if self.merge_method.useless_previous_changes() { - self.operations.clear(); - } - self.operations.push(InnerDocOp::Addition(offset)) + fn push_replacement(&mut self, offset: DocumentOffset<'pl>) { + self.operations.clear(); + self.operations.push(InnerDocOp::Replace(offset)) + } + + fn push_update(&mut self, offset: DocumentOffset<'pl>) { + self.operations.push(InnerDocOp::Update(offset)) } fn push_deletion(&mut self) { @@ -423,16 +482,114 @@ impl<'pl> PayloadOperations<'pl> { fn append_operations(&mut self, mut operations: Vec>) { debug_assert!(!operations.is_empty()); - if self.merge_method.useless_previous_changes() { + if matches!(operations.first(), Some(InnerDocOp::Deletion | InnerDocOp::Replace(_))) { self.operations.clear(); } self.operations.append(&mut operations); } + + /// Returns only the most recent version of a document based on the updates from the payloads. + /// + /// This function is only meant to be used when doing a replacement and not an update. + fn merge<'doc>( + &self, + external_doc: &'doc str, + doc_alloc: &'doc Bump, + ) -> Result>> + where + 'pl: 'doc, + { + match self.operations.last() { + Some(InnerDocOp::Replace(DocumentOffset { content })) => { + let document = serde_json::from_slice(content).unwrap(); + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; + + if self.is_new { + Ok(Some(DocumentChange::Insertion(Insertion::create( + self.docid, + external_doc, + Versions::single(document), + )))) + } else { + Ok(Some(DocumentChange::Update(Update::create( + self.docid, + external_doc, + Versions::single(document), + true, + )))) + } + } + Some(InnerDocOp::Update(_)) => { + // Search the first operation that is a tombstone which resets the document. + let last_tombstone = self + .operations + .iter() + .rposition(|op| matches!(op, InnerDocOp::Deletion | InnerDocOp::Replace(_))); + + // Track when we must ignore previous document versions from the rtxn. + let from_scratch = last_tombstone.is_some(); + + // We ignore deletion and keep the replacement to create the appropriate versions. 
+ let operations = match last_tombstone { + Some(i) => match self.operations[i] { + InnerDocOp::Deletion => &self.operations[i + 1..], + InnerDocOp::Replace(_) => &self.operations[i..], + InnerDocOp::Update(_) => unreachable!("Found a non-tombstone operation"), + }, + None => &self.operations[..], + }; + + // We collect the versions to generate the appropriate document. + let versions = operations.iter().map(|operation| { + let DocumentOffset { content } = match operation { + InnerDocOp::Replace(offset) | InnerDocOp::Update(offset) => offset, + InnerDocOp::Deletion => unreachable!("Deletion in document operations"), + }; + + let document = serde_json::from_slice(content).unwrap(); + let document = + RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) + .map_err(UserError::SerdeJson)?; + + Ok(document) + }); + + let Some(versions) = Versions::multiple(versions)? else { return Ok(None) }; + + if self.is_new { + Ok(Some(DocumentChange::Insertion(Insertion::create( + self.docid, + external_doc, + versions, + )))) + } else { + Ok(Some(DocumentChange::Update(Update::create( + self.docid, + external_doc, + versions, + from_scratch, + )))) + } + } + Some(InnerDocOp::Deletion) => { + if self.is_new { + Ok(None) + } else { + let deletion = Deletion::create(self.docid, external_doc); + Ok(Some(DocumentChange::Deletion(deletion))) + } + } + None => unreachable!("We must not have an empty set of operations on a document"), + } + } } #[derive(Clone)] pub enum InnerDocOp<'pl> { - Addition(DocumentOffset<'pl>), + Replace(DocumentOffset<'pl>), + Update(DocumentOffset<'pl>), Deletion, } @@ -444,228 +601,14 @@ pub struct DocumentOffset<'pl> { pub content: &'pl [u8], } -trait MergeChanges { - /// Whether the payloads in the list of operations are useless or not. - fn useless_previous_changes(&self) -> bool; - - /// Returns a key that is used to order the payloads the right way. 
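To illustrate how the new `merge` above collapses the operations of a single document (illustrative values only):

    // operations = [Update(v1), Replace(v2), Update(v3)]
    //                           ^^^^^^^^^^^ last tombstone: a Replace resets the document
    // -> versions are built from [Replace(v2), Update(v3)]
    // -> from_scratch = true, so the version already stored in the index is ignored
    //
    // operations = [Update(v1), Update(v2)]   // no Deletion or Replace tombstone
    // -> versions are built from both updates on top of the stored version
    // -> from_scratch = false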
- fn sort_key(&self, docops: &[InnerDocOp]) -> usize; - - fn merge<'doc>( - &self, - docid: DocumentId, - external_docid: &'doc str, - is_new: bool, - doc_alloc: &'doc Bump, - operations: &'doc [InnerDocOp], - ) -> Result>>; -} - -#[derive(Debug, Clone, Copy)] -enum MergeMethod { - ForReplacement(MergeDocumentForReplacement), - ForUpdates(MergeDocumentForUpdates), -} - -impl MergeChanges for MergeMethod { - fn useless_previous_changes(&self) -> bool { - match self { - MergeMethod::ForReplacement(merge) => merge.useless_previous_changes(), - MergeMethod::ForUpdates(merge) => merge.useless_previous_changes(), - } - } - - fn sort_key(&self, docops: &[InnerDocOp]) -> usize { - match self { - MergeMethod::ForReplacement(merge) => merge.sort_key(docops), - MergeMethod::ForUpdates(merge) => merge.sort_key(docops), - } - } - - fn merge<'doc>( - &self, - docid: DocumentId, - external_docid: &'doc str, - is_new: bool, - doc_alloc: &'doc Bump, - operations: &'doc [InnerDocOp], - ) -> Result>> { - match self { - MergeMethod::ForReplacement(merge) => { - merge.merge(docid, external_docid, is_new, doc_alloc, operations) - } - MergeMethod::ForUpdates(merge) => { - merge.merge(docid, external_docid, is_new, doc_alloc, operations) - } - } - } -} - -impl From for MergeMethod { - fn from(method: IndexDocumentsMethod) -> Self { - match method { - IndexDocumentsMethod::ReplaceDocuments => { - MergeMethod::ForReplacement(MergeDocumentForReplacement) - } - IndexDocumentsMethod::UpdateDocuments => { - MergeMethod::ForUpdates(MergeDocumentForUpdates) - } - } - } -} - -#[derive(Debug, Clone, Copy)] -struct MergeDocumentForReplacement; - -impl MergeChanges for MergeDocumentForReplacement { - fn useless_previous_changes(&self) -> bool { - true - } - - /// Reorders to read only the last change. - fn sort_key(&self, docops: &[InnerDocOp]) -> usize { - let f = |ido: &_| match ido { - InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), - InnerDocOp::Deletion => None, - }; - docops.iter().rev().find_map(f).unwrap_or(0) - } - - /// Returns only the most recent version of a document based on the updates from the payloads. - /// - /// This function is only meant to be used when doing a replacement and not an update. - fn merge<'doc>( - &self, - docid: DocumentId, - external_doc: &'doc str, - is_new: bool, - doc_alloc: &'doc Bump, - operations: &'doc [InnerDocOp], - ) -> Result>> { - match operations.last() { - Some(InnerDocOp::Addition(DocumentOffset { content })) => { - let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; - - if is_new { - Ok(Some(DocumentChange::Insertion(Insertion::create( - docid, - external_doc, - Versions::single(document), - )))) - } else { - Ok(Some(DocumentChange::Update(Update::create( - docid, - external_doc, - Versions::single(document), - true, - )))) - } - } - Some(InnerDocOp::Deletion) => { - return if is_new { - Ok(None) - } else { - let deletion = Deletion::create(docid, external_doc); - Ok(Some(DocumentChange::Deletion(deletion))) - }; - } - None => unreachable!("We must not have empty set of operations on a document"), - } - } -} - -#[derive(Debug, Clone, Copy)] -struct MergeDocumentForUpdates; - -impl MergeChanges for MergeDocumentForUpdates { - fn useless_previous_changes(&self) -> bool { - false - } - - /// Reorders to read the first changes first so that it's faster to read the first one and then the rest. 
- fn sort_key(&self, docops: &[InnerDocOp]) -> usize { - let f = |ido: &_| match ido { - InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize), - InnerDocOp::Deletion => None, - }; - docops.iter().find_map(f).unwrap_or(0) - } - - /// Reads the previous version of a document from the database, the new versions - /// in the grenad update files and merges them to generate a new boxed obkv. - /// - /// This function is only meant to be used when doing an update and not a replacement. - fn merge<'doc>( - &self, - docid: DocumentId, - external_docid: &'doc str, - is_new: bool, - doc_alloc: &'doc Bump, - operations: &'doc [InnerDocOp], - ) -> Result>> { - if operations.is_empty() { - unreachable!("We must not have empty set of operations on a document"); - } - - let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion)); - let operations = &operations[last_deletion.map_or(0, |i| i + 1)..]; - - let has_deletion = last_deletion.is_some(); - - if operations.is_empty() { - return if is_new { - Ok(None) - } else { - let deletion = Deletion::create(docid, external_docid); - Ok(Some(DocumentChange::Deletion(deletion))) - }; - } - - let versions = match operations { - [single] => { - let DocumentOffset { content } = match single { - InnerDocOp::Addition(offset) => offset, - InnerDocOp::Deletion => { - unreachable!("Deletion in document operations") - } - }; - let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; - - Some(Versions::single(document)) - } - operations => { - let versions = operations.iter().map(|operation| { - let DocumentOffset { content } = match operation { - InnerDocOp::Addition(offset) => offset, - InnerDocOp::Deletion => { - unreachable!("Deletion in document operations") - } - }; - - let document = serde_json::from_slice(content).unwrap(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) - .map_err(UserError::SerdeJson)?; - Ok(document) - }); - Versions::multiple(versions)? - } - }; - - let Some(versions) = versions else { return Ok(None) }; - - if is_new { - Ok(Some(DocumentChange::Insertion(Insertion::create(docid, external_docid, versions)))) - } else { - Ok(Some(DocumentChange::Update(Update::create( - docid, - external_docid, - versions, - has_deletion, - )))) - } - } +/// Returns the first pointer of the first change in a document. +/// +/// This is used to sort the documents in update file content order +/// and read the update file in order to largely speed up the indexation. 
+pub fn first_update_pointer(docops: &[InnerDocOp]) -> Option { + docops.iter().find_map(|ido: &_| match ido { + InnerDocOp::Replace(replace) => Some(replace.content.as_ptr() as usize), + InnerDocOp::Update(update) => Some(update.content.as_ptr() as usize), + InnerDocOp::Deletion => None, + }) } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs new file mode 100644 index 000000000..907a4d1df --- /dev/null +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -0,0 +1,311 @@ +use std::collections::BTreeMap; +use std::sync::atomic::AtomicBool; +use std::sync::OnceLock; + +use bumpalo::Bump; +use roaring::RoaringBitmap; +use tracing::Span; + +use super::super::channel::*; +use super::super::extract::*; +use super::super::steps::IndexingStep; +use super::super::thread_local::{FullySend, ThreadLocal}; +use super::super::FacetFieldIdsDelta; +use super::document_changes::{extract, DocumentChanges, IndexingContext}; +use crate::index::IndexEmbeddingConfig; +use crate::proximity::ProximityPrecision; +use crate::update::new::extract::EmbeddingExtractor; +use crate::update::new::merger::merge_and_send_rtree; +use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; +use crate::vector::EmbeddingConfigs; +use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; + +#[allow(clippy::too_many_arguments)] +pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( + document_changes: &DC, + indexing_context: IndexingContext, + indexer_span: Span, + extractor_sender: ExtractorBbqueueSender, + embedders: &EmbeddingConfigs, + extractor_allocs: &'extractor mut ThreadLocal>, + finished_extraction: &AtomicBool, + field_distribution: &mut BTreeMap, + mut index_embeddings: Vec, + document_ids: &mut RoaringBitmap, + modified_docids: &mut RoaringBitmap, +) -> Result<(FacetFieldIdsDelta, Vec)> +where + DC: DocumentChanges<'pl>, + MSP: Fn() -> bool + Sync, +{ + let span = + tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); + let _entered = span.enter(); + + let index = indexing_context.index; + let rtxn = index.read_txn()?; + + // document but we need to create a function that collects and compresses documents. + let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + let _entered = span.enter(); + extract( + document_changes, + &document_extractor, + indexing_context, + extractor_allocs, + &datastore, + IndexingStep::ExtractingDocuments, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); + let _entered = span.enter(); + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
+ *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids, modified_docids); + } + + field_distribution.retain(|_, v| *v != 0); + } + + let facet_field_ids_delta; + + { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); + let _entered = span.enter(); + + FacetedDocidsExtractor::run_extraction( + document_changes, + indexing_context, + extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), + IndexingStep::ExtractingFacets, + )? + }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); + let _entered = span.enter(); + + facet_field_ids_delta = merge_and_send_facet_docids( + caches, + FacetDatabases::new(index), + index, + &rtxn, + extractor_sender.facet_docids(), + )?; + } + } + + { + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + + WordDocidsExtractors::run_extraction( + document_changes, + indexing_context, + extractor_allocs, + IndexingStep::ExtractingWords, + )? + }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_docids, + index.word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + + { + let span = + tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + } + + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. + let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + + WordPairProximityDocidsExtractor::run_extraction( + document_changes, + indexing_context, + extractor_allocs, + IndexingStep::ExtractingWordProximity, + )? 
+ }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); + + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } + } + + 'vectors: { + if index_embeddings.is_empty() { + break 'vectors; + } + + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new( + embedders, + embedding_sender, + field_distribution, + request_threads(), + ); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); + + extract( + document_changes, + &extractor, + indexing_context, + extractor_allocs, + &datastore, + IndexingStep::ExtractingEmbeddings, + )?; + } + { + let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); + + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { + continue 'data; + }; + deladd.apply_to(&mut config.user_provided, modified_docids); + } + } + } + } + + 'geo: { + let Some(extractor) = GeoExtractor::new(&rtxn, index, *indexing_context.grenad_parameters)? + else { + break 'geo; + }; + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); + let _entered = span.enter(); + + extract( + document_changes, + &extractor, + indexing_context, + extractor_allocs, + &datastore, + IndexingStep::WritingGeoPoints, + )?; + } + + merge_and_send_rtree( + datastore, + &rtxn, + index, + extractor_sender.geo(), + &indexing_context.must_stop_processing, + )?; + } + indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites); + finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); + + Result::Ok((facet_field_ids_delta, index_embeddings)) +} + +fn request_threads() -> &'static ThreadPoolNoAbort { + static REQUEST_THREADS: OnceLock = OnceLock::new(); + + REQUEST_THREADS.get_or_init(|| { + ThreadPoolNoAbortBuilder::new() + .num_threads(crate::vector::REQUEST_PARALLELISM) + .thread_name(|index| format!("embedding-request-{index}")) + .build() + .unwrap() + }) +} diff --git a/crates/milli/src/update/new/indexer/guess_primary_key.rs b/crates/milli/src/update/new/indexer/guess_primary_key.rs new file mode 100644 index 000000000..f0eb82b8d --- /dev/null +++ b/crates/milli/src/update/new/indexer/guess_primary_key.rs @@ -0,0 +1,85 @@ +use bumparaw_collections::RawMap; +use heed::RoTxn; +use rustc_hash::FxBuildHasher; + +use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; +use crate::update::new::StdResult; +use crate::{FieldsIdsMap, Index, Result, UserError}; + +/// Returns the primary key that has already been set for this index or the +/// one we will guess by searching for the first key that contains "id" as a substring, +/// and whether the primary key changed +pub fn retrieve_or_guess_primary_key<'a>( + rtxn: &'a RoTxn<'a>, + index: &Index, + new_fields_ids_map: &mut FieldsIdsMap, + primary_key_from_op: Option<&'a str>, + first_document: Option>, +) -> Result, bool), UserError>> { + // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. 
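The guessing branch in the body below only keeps keys of the first document whose lowercased name ends with the default primary key suffix ("id"), sorted lexicographically. A worked example with hypothetical documents:

    // first document: {"title": "Dune", "product_id": "42", "author_id": "7"}
    // candidates: ["author_id", "product_id"]
    // -> Err(UserError::MultiplePrimaryKeyCandidatesFound { candidates })
    //
    // first document: {"title": "Dune", "product_id": "42"}
    // candidates: ["product_id"] -> primary key inferred as "product_id"
    //
    // first document: {"title": "Dune"}
    // candidates: [] -> Err(UserError::NoPrimaryKeyCandidateFound)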
+ + // do we have an existing declared primary key? + let (primary_key, has_changed) = if let Some(primary_key_from_db) = index.primary_key(rtxn)? { + // did we request a primary key in the operation? + match primary_key_from_op { + // we did, and it is different from the DB one + Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => { + return Ok(Err(UserError::PrimaryKeyCannotBeChanged( + primary_key_from_db.to_string(), + ))); + } + _ => (primary_key_from_db, false), + } + } else { + // no primary key in the DB => let's set one + // did we request a primary key in the operation? + let primary_key = if let Some(primary_key_from_op) = primary_key_from_op { + // set primary key from operation + primary_key_from_op + } else { + // guess primary key + let first_document = match first_document { + Some(document) => document, + // previous indexer when no pk is set + we send an empty payload => index_primary_key_no_candidate_found + None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + }; + + let guesses: Result> = first_document + .keys() + .filter_map(|name| { + let Some(_) = new_fields_ids_map.insert(name) else { + return Some(Err(UserError::AttributeLimitReached.into())); + }; + name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name)) + }) + .collect(); + + let mut guesses = guesses?; + + // sort the keys in lexicographical order, so that fields are always in the same order. + guesses.sort_unstable(); + + match guesses.as_slice() { + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [name] => { + tracing::info!("Primary key was not specified in index. Inferred to '{name}'"); + *name + } + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|candidate| candidate.to_string()) + .collect(), + })) + } + } + }; + (primary_key, true) + }; + + match PrimaryKey::new_or_insert(primary_key, new_fields_ids_map) { + Ok(primary_key) => Ok(Ok((primary_key, has_changed))), + Err(err) => Ok(Err(err)), + } +} diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 01ac26503..4f2dd19c9 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -1,56 +1,40 @@ -use std::cmp::Ordering; -use std::sync::{OnceLock, RwLock}; +use std::sync::atomic::AtomicBool; +use std::sync::{Once, RwLock}; use std::thread::{self, Builder}; use big_s::S; -use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; +use document_changes::{DocumentChanges, IndexingContext}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; -use heed::types::{Bytes, DecodeIgnore, Str}; -use heed::{RoTxn, RwTxn}; -use itertools::{merge_join_by, EitherOrBoth}; +use heed::RwTxn; pub use partial_dump::PartialDump; -use rand::SeedableRng as _; -use raw_collections::RawMap; -use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; +pub use write::ChannelCongestion; +use write::{build_vectors, update_index, write_to_db}; use super::channel::*; -use super::extract::*; -use super::facet_search_builder::FacetSearchBuilder; -use super::merger::FacetFieldIdsDelta; -use super::steps::Step; +use super::steps::IndexingStep; use super::thread_local::ThreadLocal; -use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; -use super::words_prefix_docids::{ - compute_word_prefix_docids, compute_word_prefix_fid_docids, 
compute_word_prefix_position_docids, -}; -use super::StdResult; -use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; -use crate::facet::FacetType; +use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; -use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; -use crate::proximity::ProximityPrecision; -use crate::update::del_add::DelAdd; -use crate::update::new::extract::EmbeddingExtractor; -use crate::update::new::merger::merge_and_send_rtree; -use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; -use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; -use crate::update::settings::InnerIndexSettings; -use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; -use crate::{ - FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, - ThreadPoolNoAbortBuilder, UserError, -}; +use crate::progress::Progress; +use crate::update::GrenadParameters; +use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; pub(crate) mod de; pub mod document_changes; mod document_deletion; mod document_operation; +mod extract; +mod guess_primary_key; mod partial_dump; +mod post_processing; mod update_by_function; +mod write; + +static LOG_MEMORY_METRICS_ONCE: Once = Once::new(); /// This is the main function of this crate. /// @@ -58,9 +42,10 @@ mod update_by_function; /// /// TODO return stats #[allow(clippy::too_many_arguments)] // clippy: šŸ˜ -pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( +pub fn index<'pl, 'indexer, 'index, DC, MSP>( wtxn: &mut RwTxn, index: &'index Index, + pool: &ThreadPoolNoAbort, grenad_parameters: GrenadParameters, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, @@ -68,14 +53,63 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( document_changes: &DC, embedders: EmbeddingConfigs, must_stop_processing: &'indexer MSP, - send_progress: &'indexer SP, -) -> Result<()> + progress: &'indexer Progress, +) -> Result where DC: DocumentChanges<'pl>, MSP: Fn() -> bool + Sync, - SP: Fn(Progress) + Sync, { - let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + let mut bbbuffers = Vec::new(); + let finished_extraction = AtomicBool::new(false); + + let arroy_memory = grenad_parameters.max_memory; + + // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch + // is because we still use the old indexer for the settings and it is highly impacted by the + // max memory. So we keep the changes here and will remove these changes once we use the new + // indexer to also index settings. Related to #5125 and #5141. 
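As a worked example of the budget arithmetic that follows (assuming a hypothetical pool of 16 threads and a 2 GiB max_memory setting):

    // max_memory after the 5% reduction:     2048 MiB * 5 / 100 ā‰ˆ 102 MiB
    // minimum_total_bbbuffer_capacity:       50 MiB * 16 threads = 800 MiB
    // minimum_total_extractors_capacity:     2 * 800 MiB         = 1600 MiB
    // total_bbbuffer_capacity = max(102 MiB, 800 MiB)  = 800 MiB
    // extractor max_memory    = max(102 MiB, 1600 MiB) = 1600 MiB
    // i.e. the per-thread minimums dominate until the configured budget gets large.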
+ let grenad_parameters = GrenadParameters { + max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100), + ..grenad_parameters + }; + + // 5% percent of the allocated memory for the extractors, or min 100MiB + // 5% percent of the allocated memory for the bbqueues, or min 50MiB + // + // Minimum capacity for bbqueues + let minimum_total_bbbuffer_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB + let minimum_total_extractors_capacity = minimum_total_bbbuffer_capacity * 2; + + let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( + ( + GrenadParameters { + max_memory: Some(minimum_total_extractors_capacity), + ..grenad_parameters + }, + minimum_total_bbbuffer_capacity, + ), // 100 MiB by thread by default + |max_memory| { + let total_bbbuffer_capacity = max_memory.max(minimum_total_bbbuffer_capacity); + let new_grenad_parameters = GrenadParameters { + max_memory: Some(max_memory.max(minimum_total_extractors_capacity)), + ..grenad_parameters + }; + (new_grenad_parameters, total_bbbuffer_capacity) + }, + ); + + LOG_MEMORY_METRICS_ONCE.call_once(|| { + tracing::debug!( + "Indexation allocated memory metrics - \ + Total BBQueue size: {total_bbbuffer_capacity}, \ + Total extractor memory: {:?}", + grenad_parameters.max_memory, + ); + }); + + let (extractor_sender, writer_receiver) = pool + .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) + .unwrap(); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); @@ -91,245 +125,46 @@ where doc_allocs: &doc_allocs, fields_ids_map_store: &fields_ids_map_store, must_stop_processing, - send_progress, + progress, + grenad_parameters: &grenad_parameters, }; + let index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; + let mut modified_docids = roaring::RoaringBitmap::new(); - thread::scope(|s| -> Result<()> { + let congestion = thread::scope(|s| -> Result { let indexer_span = tracing::Span::current(); let embedders = &embedders; + let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; - let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); - - let rtxn = index.read_txn()?; - - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - - extract(document_changes, - &document_extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingDocuments, - )?; - - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. 
- *current = current.saturating_add_signed(delta); - } - document_extractor_data.docids_delta.apply_to(document_ids); - } - - field_distribution.retain(|_, v| *v != 0); - - let facet_field_ids_delta; - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); - let _entered = span.enter(); - - facet_field_ids_delta = merge_and_send_facet_docids( - FacetedDocidsExtractor::run_extraction( - grenad_parameters, + let modified_docids = &mut modified_docids; + let extractor_handle = + Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { + pool.install(move || { + extract::extract_all( document_changes, indexing_context, + indexer_span, + extractor_sender, + embedders, &mut extractor_allocs, - &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets - )?, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); - - - let WordDocidsCaches { - word_docids, - word_fid_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = WordDocidsExtractors::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWords - )?; - - // TODO Word Docids Merger - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_docids, - index.word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Word Fid Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_fid_docids, - index.word_fid_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Exact Word Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - exact_word_docids, - index.exact_word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Word Position Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_position_docids, - index.word_position_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - // Fid Word Count Docids Merging - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - fid_word_count_docids, - index.field_id_word_count_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } - - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. 
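The gate that this comment describes reduces to one equality check on the setting read at the start of the transaction; a minimal sketch, assuming `ProximityPrecision` is reachable at `milli::proximity::ProximityPrecision` (the in-crate path removed above is `crate::proximity::ProximityPrecision`):

```rust
use milli::proximity::ProximityPrecision; // assumed public path, for illustration only

/// Illustrative only: the word-pair proximity pass runs when the precision,
/// defaulted when unset, is `ByWord`.
fn should_extract_word_pair_proximity(precision: Option<ProximityPrecision>) -> bool {
    precision.unwrap_or_default() == ProximityPrecision::ByWord
}
```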
- let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - - - let caches = ::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWordProximity, - )?; - - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - - 'vectors: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); - - let mut index_embeddings = index.embedding_configs(&rtxn)?; - if index_embeddings.is_empty() { - break 'vectors; - } - - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); - let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; - - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); - } - } - - embedding_sender.finish(index_embeddings).unwrap(); - } - - 'geo: { - let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); - let _entered = span.enter(); - - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { - break 'geo; - }; - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::WritingGeoPoints - )?; - - merge_and_send_rtree( - datastore, - &rtxn, - index, - extractor_sender.geo(), - &indexing_context.must_stop_processing, - )?; - } - - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); - let _entered = span.enter(); - (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); - } - - Result::Ok(facet_field_ids_delta) - })?; + finished_extraction, + field_distribution, + index_embeddings, + document_ids, + modified_docids, + ) + }) + .unwrap() + })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let vector_arroy = index.vector_arroy; - let mut rng = rand::rngs::StdRng::seed_from_u64(42); - let indexer_span = tracing::Span::current(); let arroy_writers: Result> = embedders .inner_as_ref() .iter() @@ -352,350 +187,55 @@ where .collect(); let mut arroy_writers = arroy_writers?; - for operation in writer_receiver { - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - match db_operation.entry() { - EntryOperation::Delete(e) => { - if !database.delete(wtxn, e.entry())? 
{ - unreachable!("We tried to delete an unknown key") - } - } - EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, - } - } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.del_items(wtxn, dimensions, docid)?; - } - } - ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings: raw_embeddings, - } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - // TODO: switch to Embeddings - let mut embeddings = Embeddings::new(*dimensions); - for embedding in raw_embeddings { - embeddings.append(embedding).unwrap(); - } - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; - } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - ArroyOperation::Finish { configs } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); + let congestion = + write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?; - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); + indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); - for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in - &mut arroy_writers - { - let dimensions = *dimensions; - writer.build_and_quantize( - wtxn, - &mut rng, - dimensions, - false, - &indexing_context.must_stop_processing, - )?; - } + let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; - index.put_embedding_configs(wtxn, configs)?; - } - }, - } - } + indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); - (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); + pool.install(|| { + build_vectors( + index, + wtxn, + indexing_context.progress, + index_embeddings, + arroy_memory, + &mut arroy_writers, + &indexing_context.must_stop_processing, + ) + }) + .unwrap()?; - let facet_field_ids_delta = extractor_handle.join().unwrap()?; + post_processing::post_process( + indexing_context, + wtxn, + global_fields_ids_map, + facet_field_ids_delta, + )?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); + indexing_context.progress.update_progress(IndexingStep::Finalizing); - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; - - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); - - if let Some(prefix_delta) = compute_word_fst(index, wtxn)? 
{ - compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?; - } - - (indexing_context.send_progress)(Progress::from_step(Step::Finalizing)); - - Ok(()) as Result<_> + Ok(congestion) as Result<_> })?; // required to into_inner the new_fields_ids_map drop(fields_ids_map_store); let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap(); - index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; + update_index( + index, + wtxn, + new_fields_ids_map, + new_primary_key, + embedders, + field_distribution, + document_ids, + modified_docids, + )?; - if let Some(new_primary_key) = new_primary_key { - index.put_primary_key(wtxn, new_primary_key.name())?; - } - - // used to update the localized and weighted maps while sharing the update code with the settings pipeline. - let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn, Some(embedders))?; - inner_index_settings.recompute_facets(wtxn, index)?; - inner_index_settings.recompute_searchables(wtxn, index)?; - index.put_field_distribution(wtxn, &field_distribution)?; - index.put_documents_ids(wtxn, &document_ids)?; - index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - - Ok(()) -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] -fn compute_prefix_database( - index: &Index, - wtxn: &mut RwTxn, - prefix_delta: PrefixDelta, - grenad_parameters: GrenadParameters, -) -> Result<()> { - let PrefixDelta { modified, deleted } = prefix_delta; - // Compute word prefix docids - compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute exact word prefix docids - compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix fid docids - compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix position docids - compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters) -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing")] -fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result> { - let rtxn = index.read_txn()?; - let words_fst = index.words_fst(&rtxn)?; - let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; - let prefix_settings = index.prefix_settings(&rtxn)?; - word_fst_builder.with_prefix_settings(prefix_settings); - - let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); - let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); - for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { - (Ok((l, _)), Ok((r, _))) => l.cmp(r), - (Err(_), _) | (_, Err(_)) => Ordering::Equal, - }) { - match eob { - EitherOrBoth::Both(lhs, rhs) => { - let (word, lhs_bytes) = lhs?; - let (_, rhs_bytes) = rhs?; - if lhs_bytes != rhs_bytes { - word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; - } - } - EitherOrBoth::Left(result) => { - let (word, _) = result?; - word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?; - } - EitherOrBoth::Right(result) => { - let (word, _) = result?; - word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; - } - } - } - - let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?; - index.main.remap_types::().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; - if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { - index.main.remap_types::().put( - wtxn, - WORDS_PREFIXES_FST_KEY, - &prefixes_fst_mmap, - )?; - 
Ok(Some(prefix_delta)) - } else { - Ok(None) - } -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")] -fn compute_facet_search_database( - index: &Index, - wtxn: &mut RwTxn, - global_fields_ids_map: GlobalFieldsIdsMap, -) -> Result<()> { - let rtxn = index.read_txn()?; - let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; - let mut facet_search_builder = FacetSearchBuilder::new( - global_fields_ids_map, - localized_attributes_rules.unwrap_or_default(), - ); - - let previous_facet_id_string_docids = index - .facet_id_string_docids - .iter(&rtxn)? - .remap_data_type::() - .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); - let current_facet_id_string_docids = index - .facet_id_string_docids - .iter(wtxn)? - .remap_data_type::() - .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); - for eob in merge_join_by( - previous_facet_id_string_docids, - current_facet_id_string_docids, - |lhs, rhs| match (lhs, rhs) { - (Ok((l, _)), Ok((r, _))) => l.cmp(r), - (Err(_), _) | (_, Err(_)) => Ordering::Equal, - }, - ) { - match eob { - EitherOrBoth::Both(lhs, rhs) => { - let (_, _) = lhs?; - let (_, _) = rhs?; - } - EitherOrBoth::Left(result) => { - let (key, _) = result?; - facet_search_builder.register_from_key(DelAdd::Deletion, key)?; - } - EitherOrBoth::Right(result) => { - let (key, _) = result?; - facet_search_builder.register_from_key(DelAdd::Addition, key)?; - } - } - } - - facet_search_builder.merge_and_write(index, wtxn, &rtxn) -} - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")] -fn compute_facet_level_database( - index: &Index, - wtxn: &mut RwTxn, - facet_field_ids_delta: FacetFieldIdsDelta, -) -> Result<()> { - if let Some(modified_facet_string_ids) = facet_field_ids_delta.modified_facet_string_ids() { - let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); - let _entered = span.enter(); - FacetsUpdateBulk::new_not_updating_level_0( - index, - modified_facet_string_ids, - FacetType::String, - ) - .execute(wtxn)?; - } - if let Some(modified_facet_number_ids) = facet_field_ids_delta.modified_facet_number_ids() { - let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); - let _entered = span.enter(); - FacetsUpdateBulk::new_not_updating_level_0( - index, - modified_facet_number_ids, - FacetType::Number, - ) - .execute(wtxn)?; - } - - Ok(()) -} - -/// Returns the primary key that has already been set for this index or the -/// one we will guess by searching for the first key that contains "id" as a substring, -/// and whether the primary key changed -/// TODO move this elsewhere -pub fn retrieve_or_guess_primary_key<'a>( - rtxn: &'a RoTxn<'a>, - index: &Index, - new_fields_ids_map: &mut FieldsIdsMap, - primary_key_from_op: Option<&'a str>, - first_document: Option>, -) -> Result, bool), UserError>> { - // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. - - // do we have an existing declared primary key? - let (primary_key, has_changed) = if let Some(primary_key_from_db) = index.primary_key(rtxn)? { - // did we request a primary key in the operation? 
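Since this block is removed from this file by the patch (a new `guess_primary_key` module is declared earlier), a standalone sketch of the guessing rule it applies when no key is declared may help; the function name and the stringly-typed error are simplifications for illustration:

```rust
/// Illustrative only: keep the top-level field names whose lowercased form
/// ends with "id", sort them so the choice is deterministic, and require
/// exactly one candidate.
fn guess_primary_key(field_names: &[&str]) -> Result<String, String> {
    let mut candidates: Vec<&str> = field_names
        .iter()
        .copied()
        .filter(|name| name.to_lowercase().ends_with("id"))
        .collect();
    candidates.sort_unstable();
    match candidates.as_slice() {
        [] => Err("no primary key candidate found".to_string()),
        [name] => Ok((*name).to_string()),
        _ => Err("multiple primary key candidates found".to_string()),
    }
}
```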
- match primary_key_from_op { - // we did, and it is different from the DB one - Some(primary_key_from_op) if primary_key_from_op != primary_key_from_db => { - return Ok(Err(UserError::PrimaryKeyCannotBeChanged( - primary_key_from_db.to_string(), - ))); - } - _ => (primary_key_from_db, false), - } - } else { - // no primary key in the DB => let's set one - // did we request a primary key in the operation? - let primary_key = if let Some(primary_key_from_op) = primary_key_from_op { - // set primary key from operation - primary_key_from_op - } else { - // guess primary key - let first_document = match first_document { - Some(document) => document, - // previous indexer when no pk is set + we send an empty payload => index_primary_key_no_candidate_found - None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - }; - - let guesses: Result> = first_document - .keys() - .filter_map(|name| { - let Some(_) = new_fields_ids_map.insert(name) else { - return Some(Err(UserError::AttributeLimitReached.into())); - }; - name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY).then_some(Ok(name)) - }) - .collect(); - - let mut guesses = guesses?; - - // sort the keys in lexicographical order, so that fields are always in the same order. - guesses.sort_unstable(); - - match guesses.as_slice() { - [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - [name] => { - tracing::info!("Primary key was not specified in index. Inferred to '{name}'"); - *name - } - multiple => { - return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { - candidates: multiple - .iter() - .map(|candidate| candidate.to_string()) - .collect(), - })) - } - } - }; - (primary_key, true) - }; - - match PrimaryKey::new_or_insert(primary_key, new_fields_ids_map) { - Ok(primary_key) => Ok(Ok((primary_key, has_changed))), - Err(err) => Ok(Err(err)), - } -} - -fn request_threads() -> &'static ThreadPoolNoAbort { - static REQUEST_THREADS: OnceLock = OnceLock::new(); - - REQUEST_THREADS.get_or_init(|| { - ThreadPoolNoAbortBuilder::new() - .num_threads(crate::vector::REQUEST_PARALLELISM) - .thread_name(|index| format!("embedding-request-{index}")) - .build() - .unwrap() - }) + Ok(congestion) } diff --git a/crates/milli/src/update/new/indexer/partial_dump.rs b/crates/milli/src/update/new/indexer/partial_dump.rs index 2cc653813..6e4abd898 100644 --- a/crates/milli/src/update/new/indexer/partial_dump.rs +++ b/crates/milli/src/update/new/indexer/partial_dump.rs @@ -1,6 +1,8 @@ use std::ops::DerefMut; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; +use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; use super::document_changes::{DocumentChangeContext, DocumentChanges}; @@ -75,7 +77,7 @@ where self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; let external_document_id = external_document_id.to_de(); - let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) + let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) .map_err(InternalError::SerdeJson)?; let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); diff --git a/crates/milli/src/update/new/indexer/post_processing.rs b/crates/milli/src/update/new/indexer/post_processing.rs new file mode 100644 index 000000000..2a01fccf3 --- /dev/null +++ b/crates/milli/src/update/new/indexer/post_processing.rs @@ -0,0 +1,246 @@ +use std::cmp::Ordering; + +use heed::types::{Bytes, DecodeIgnore, Str}; +use heed::RwTxn; +use 
itertools::{merge_join_by, EitherOrBoth}; + +use super::document_changes::IndexingContext; +use crate::facet::FacetType; +use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::update::del_add::DelAdd; +use crate::update::facet::new_incremental::FacetsUpdateIncremental; +use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::update::new::facet_search_builder::FacetSearchBuilder; +use crate::update::new::merger::FacetFieldIdDelta; +use crate::update::new::steps::IndexingStep; +use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; +use crate::update::new::words_prefix_docids::{ + compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids, + compute_word_prefix_position_docids, +}; +use crate::update::new::FacetFieldIdsDelta; +use crate::update::{FacetsUpdateBulk, GrenadParameters}; +use crate::{GlobalFieldsIdsMap, Index, Result}; + +pub(super) fn post_process( + indexing_context: IndexingContext, + wtxn: &mut RwTxn<'_>, + mut global_fields_ids_map: GlobalFieldsIdsMap<'_>, + facet_field_ids_delta: FacetFieldIdsDelta, +) -> Result<()> +where + MSP: Fn() -> bool + Sync, +{ + let index = indexing_context.index; + indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); + compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?; + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); + if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { + compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?; + }; + Ok(()) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] +fn compute_prefix_database( + index: &Index, + wtxn: &mut RwTxn, + prefix_delta: PrefixDelta, + grenad_parameters: &GrenadParameters, +) -> Result<()> { + let PrefixDelta { modified, deleted } = prefix_delta; + // Compute word prefix docids + compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; + // Compute exact word prefix docids + compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; + // Compute word prefix fid docids + compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; + // Compute word prefix position docids + compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing")] +fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result> { + let rtxn = index.read_txn()?; + let words_fst = index.words_fst(&rtxn)?; + let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; + let prefix_settings = index.prefix_settings(&rtxn)?; + word_fst_builder.with_prefix_settings(prefix_settings); + + let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::(); + let current_words = index.word_docids.iter(wtxn)?.remap_data_type::(); + for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) { + (Ok((l, _)), Ok((r, _))) => l.cmp(r), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }) { + match eob { + EitherOrBoth::Both(lhs, rhs) => { + let (word, lhs_bytes) = lhs?; + let (_, rhs_bytes) = rhs?; + if lhs_bytes != rhs_bytes { + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; + } + } + EitherOrBoth::Left(result) => { + let (word, _) = 
result?; + word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?; + } + EitherOrBoth::Right(result) => { + let (word, _) = result?; + word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; + } + } + } + + let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?; + index.main.remap_types::().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; + if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data { + index.main.remap_types::().put( + wtxn, + WORDS_PREFIXES_FST_KEY, + &prefixes_fst_mmap, + )?; + Ok(Some(prefix_delta)) + } else { + Ok(None) + } +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")] +fn compute_facet_search_database( + index: &Index, + wtxn: &mut RwTxn, + global_fields_ids_map: GlobalFieldsIdsMap, +) -> Result<()> { + let rtxn = index.read_txn()?; + + // if the facet search is not enabled, we can skip the rest of the function + if !index.facet_search(wtxn)? { + return Ok(()); + } + + let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; + let mut facet_search_builder = FacetSearchBuilder::new( + global_fields_ids_map, + localized_attributes_rules.unwrap_or_default(), + filterable_attributes_rules, + ); + + let previous_facet_id_string_docids = index + .facet_id_string_docids + .iter(&rtxn)? + .remap_data_type::() + .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); + let current_facet_id_string_docids = index + .facet_id_string_docids + .iter(wtxn)? + .remap_data_type::() + .filter(|r| r.as_ref().map_or(true, |(k, _)| k.level == 0)); + for eob in merge_join_by( + previous_facet_id_string_docids, + current_facet_id_string_docids, + |lhs, rhs| match (lhs, rhs) { + (Ok((l, _)), Ok((r, _))) => l.cmp(r), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }, + ) { + match eob { + EitherOrBoth::Both(lhs, rhs) => { + let (_, _) = lhs?; + let (_, _) = rhs?; + } + EitherOrBoth::Left(result) => { + let (key, _) = result?; + facet_search_builder.register_from_key(DelAdd::Deletion, key)?; + } + EitherOrBoth::Right(result) => { + let (key, _) = result?; + facet_search_builder.register_from_key(DelAdd::Addition, key)?; + } + } + } + + facet_search_builder.merge_and_write(index, wtxn, &rtxn) +} + +#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_field_ids")] +fn compute_facet_level_database( + index: &Index, + wtxn: &mut RwTxn, + mut facet_field_ids_delta: FacetFieldIdsDelta, + global_fields_ids_map: &mut GlobalFieldsIdsMap, +) -> Result<()> { + let rtxn = index.read_txn()?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; + for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() { + // skip field ids that should not be facet leveled + let Some(metadata) = global_fields_ids_map.metadata(fid) else { + continue; + }; + if !metadata.require_facet_level_database(&filterable_attributes_rules) { + continue; + } + + let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); + let _entered = span.enter(); + match delta { + FacetFieldIdDelta::Bulk => { + tracing::debug!(%fid, "bulk string facet processing"); + FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String) + .execute(wtxn)? 
+ } + FacetFieldIdDelta::Incremental(delta_data) => { + tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing"); + FacetsUpdateIncremental::new( + index, + FacetType::String, + fid, + delta_data, + FACET_GROUP_SIZE, + FACET_MIN_LEVEL_SIZE, + FACET_MAX_GROUP_SIZE, + ) + .execute(wtxn)? + } + } + } + + for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() { + let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); + let _entered = span.enter(); + match delta { + FacetFieldIdDelta::Bulk => { + tracing::debug!(%fid, "bulk number facet processing"); + FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number) + .execute(wtxn)? + } + FacetFieldIdDelta::Incremental(delta_data) => { + tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing"); + FacetsUpdateIncremental::new( + index, + FacetType::Number, + fid, + delta_data, + FACET_GROUP_SIZE, + FACET_MIN_LEVEL_SIZE, + FACET_MAX_GROUP_SIZE, + ) + .execute(wtxn)? + } + } + debug_assert!(crate::update::facet::sanity_checks( + index, + wtxn, + fid, + FacetType::Number, + FACET_GROUP_SIZE as usize, + FACET_MIN_LEVEL_SIZE as usize, + FACET_MAX_GROUP_SIZE as usize, + ) + .is_ok()); + } + + Ok(()) +} diff --git a/crates/milli/src/update/new/indexer/update_by_function.rs b/crates/milli/src/update/new/indexer/update_by_function.rs index a8e3e38a8..3001648e6 100644 --- a/crates/milli/src/update/new/indexer/update_by_function.rs +++ b/crates/milli/src/update/new/indexer/update_by_function.rs @@ -1,8 +1,9 @@ -use raw_collections::RawMap; +use bumparaw_collections::RawMap; use rayon::iter::IndexedParallelIterator; use rayon::slice::ParallelSlice as _; use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; use roaring::RoaringBitmap; +use rustc_hash::FxBuildHasher; use super::document_changes::DocumentChangeContext; use super::DocumentChanges; @@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { if document_id != new_document_id { Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) } else { - let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) - .map_err(InternalError::SerdeJson)?; + let raw_new_doc = RawMap::from_raw_value_and_hasher( + raw_new_doc, + FxBuildHasher, + doc_alloc, + ) + .map_err(InternalError::SerdeJson)?; Ok(Some(DocumentChange::Update(Update::create( docid, diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs new file mode 100644 index 000000000..ca860bbff --- /dev/null +++ b/crates/milli/src/update/new/indexer/write.rs @@ -0,0 +1,234 @@ +use std::sync::atomic::AtomicBool; + +use bstr::ByteSlice as _; +use hashbrown::HashMap; +use heed::RwTxn; +use rand::SeedableRng as _; +use time::OffsetDateTime; + +use super::super::channel::*; +use crate::documents::PrimaryKey; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; +use crate::index::IndexEmbeddingConfig; +use crate::progress::Progress; +use crate::update::settings::InnerIndexSettings; +use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; +use crate::{Error, Index, InternalError, Result, UserError}; + +pub fn write_to_db( + mut writer_receiver: WriterBbqueueReceiver<'_>, + finished_extraction: &AtomicBool, + index: &Index, + wtxn: &mut RwTxn<'_>, + arroy_writers: &HashMap, +) -> Result { + // Used by by the ArroySetVector to copy the embedding into an + // aligned memory area, required by arroy to accept a new 
vector. + let mut aligned_embedding = Vec::new(); + let span = tracing::trace_span!(target: "indexing::write_db", "all"); + let _entered = span.enter(); + let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); + let mut _entered_post_merge = None; + while let Some(action) = writer_receiver.recv_action() { + if _entered_post_merge.is_none() + && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) + { + _entered_post_merge = Some(span.enter()); + } + + match action { + ReceiverAction::WakeUp => (), + ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => { + let database_name = database.database_name(); + let database = database.database(index); + if let Err(error) = database.put(wtxn, &key, &value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: bstr::BString::from(&key[..]), + value_length: value.len(), + error, + })); + } + } + ReceiverAction::LargeVectors(large_vectors) => { + let LargeVectors { docid, embedder_id, .. } = large_vectors; + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let mut embeddings = Embeddings::new(*dimensions); + for embedding in large_vectors.read_embeddings(*dimensions) { + embeddings.push(embedding.to_vec()).unwrap(); + } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + } + + // Every time the is a message in the channel we search + // for new entries in the BBQueue buffers. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + arroy_writers, + &mut aligned_embedding, + )?; + } + + write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?; + + Ok(ChannelCongestion { + attempts: writer_receiver.sent_messages_attempts(), + blocking_attempts: writer_receiver.blocking_sent_messages_attempts(), + }) +} + +/// Stats exposing the congestion of a channel. +#[derive(Debug, Copy, Clone)] +pub struct ChannelCongestion { + /// Number of attempts to send a message into the bbqueue buffer. + pub attempts: usize, + /// Number of blocking attempts which require a retry. 
+ pub blocking_attempts: usize, +} + +impl ChannelCongestion { + pub fn congestion_ratio(&self) -> f32 { + self.blocking_attempts as f32 / self.attempts as f32 + } +} + +#[tracing::instrument(level = "debug", skip_all, target = "indexing::vectors")] +pub fn build_vectors( + index: &Index, + wtxn: &mut RwTxn<'_>, + progress: &Progress, + index_embeddings: Vec, + arroy_memory: Option, + arroy_writers: &mut HashMap, + must_stop_processing: &MSP, +) -> Result<()> +where + MSP: Fn() -> bool + Sync + Send, +{ + if index_embeddings.is_empty() { + return Ok(()); + } + + let seed = rand::random(); + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + for (_index, (_embedder_name, _embedder, writer, dimensions)) in arroy_writers { + let dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + progress, + &mut rng, + dimensions, + false, + arroy_memory, + must_stop_processing, + )?; + } + + index.put_embedding_configs(wtxn, index_embeddings)?; + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +pub(super) fn update_index( + index: &Index, + wtxn: &mut RwTxn<'_>, + new_fields_ids_map: FieldIdMapWithMetadata, + new_primary_key: Option>, + embedders: EmbeddingConfigs, + field_distribution: std::collections::BTreeMap, + document_ids: roaring::RoaringBitmap, + modified_docids: roaring::RoaringBitmap, +) -> Result<()> { + index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; + if let Some(new_primary_key) = new_primary_key { + index.put_primary_key(wtxn, new_primary_key.name())?; + } + let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn, Some(embedders))?; + inner_index_settings.recompute_searchables(wtxn, index)?; + index.put_field_distribution(wtxn, &field_distribution)?; + index.put_documents_ids(wtxn, &document_ids)?; + index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + index.update_documents_stats(wtxn, modified_docids)?; + Ok(()) +} + +/// A function dedicated to manage all the available BBQueue frames. +/// +/// It reads all the available frames, do the corresponding database operations +/// and stops when no frame are available. 
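The `ChannelCongestion` defined above is returned by `write_to_db` to its caller; a usage sketch under stated assumptions (the threshold, the function name, and the log lines are illustrative, `ChannelCongestion` is assumed to be in scope, and `tracing` available as elsewhere in the crate):

```rust
/// Illustrative only: interpret the ChannelCongestion returned by `write_to_db`.
fn report_congestion(congestion: ChannelCongestion) {
    // Share of send attempts that had to retry because the BBQueue was full.
    let ratio = congestion.congestion_ratio();
    if ratio > 0.5 {
        tracing::warn!("BBQueue writer congested on {:.0}% of send attempts", ratio * 100.0);
    } else {
        tracing::debug!("BBQueue writer congestion ratio: {ratio:.2}");
    }
}
```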
+pub fn write_from_bbqueue( + writer_receiver: &mut WriterBbqueueReceiver<'_>, + index: &Index, + wtxn: &mut RwTxn<'_>, + arroy_writers: &HashMap, + aligned_embedding: &mut Vec, +) -> crate::Result<()> { + while let Some(frame_with_header) = writer_receiver.recv_frame() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + tracing::error!( + database_name, + key_bytes = ?key, + formatted_key = ?key.as_bstr(), + "Attempt to delete an unknown key" + ); + } + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError(InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + })); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + EntryHeader::ArroySetVectors(asvs) => { + let ArroySetVectors { docid, embedder_id, .. } = asvs; + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let mut embeddings = Embeddings::new(*dimensions); + let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); + if embeddings.append(all_embeddings.to_vec()).is_err() { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: all_embeddings.len(), + })); + } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + } + } + + Ok(()) +} diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 039c56b9d..090add6bd 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -1,6 +1,6 @@ use std::cell::RefCell; -use hashbrown::HashSet; +use hashbrown::HashMap; use heed::types::Bytes; use heed::{Database, RoTxn}; use memmap2::Mmap; @@ -9,9 +9,10 @@ use roaring::RoaringBitmap; use super::channel::*; use super::extract::{ - merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, - GeoExtractorData, + merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, + FacetKind, GeoExtractorData, }; +use crate::update::facet::new_incremental::FacetFieldIdChange; use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] @@ -19,7 +20,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>( datastore: impl IntoIterator>>, rtxn: &RoTxn, index: &Index, - geo_sender: GeoSender<'_>, + geo_sender: GeoSender<'_, '_>, must_stop_processing: &MSP, ) -> Result<()> where @@ -34,7 +35,7 @@ where } let mut frozen = data.into_inner().freeze()?; - for result in frozen.iter_and_clear_removed() { + for result in frozen.iter_and_clear_removed()? 
{ let extracted_geo_point = result?; let removed = rtree.remove(&GeoPoint::from(extracted_geo_point)); debug_assert!(removed.is_some()); @@ -42,7 +43,7 @@ where debug_assert!(removed); } - for result in frozen.iter_and_clear_inserted() { + for result in frozen.iter_and_clear_inserted()? { let extracted_geo_point = result?; rtree.insert(GeoPoint::from(extracted_geo_point)); let inserted = faceted.insert(extracted_geo_point.docid); @@ -56,38 +57,37 @@ where let rtree_mmap = unsafe { Mmap::map(&file)? }; geo_sender.set_rtree(rtree_mmap).unwrap(); - geo_sender.set_geo_faceted(&faceted).unwrap(); + geo_sender.set_geo_faceted(&faceted)?; Ok(()) } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor, MSP>( +pub fn merge_and_send_docids<'extractor, MSP, D>( mut caches: Vec>, database: Database, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: WordDocidsSender, must_stop_processing: &MSP, ) -> Result<()> where MSP: Fn() -> bool + Sync, + D: DatabaseType + Sync, { transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -101,26 +101,30 @@ pub fn merge_and_send_facet_docids<'extractor>( mut caches: Vec>, database: FacetDatabases, index: &Index, - docids_sender: impl DocidsSender + Sync, + rtxn: &RoTxn, + docids_sender: FacetDocidsSender, ) -> Result { + let max_string_count = (index.facet_id_string_docids.len(rtxn)? / 500) as usize; + let max_number_count = (index.facet_id_f64_docids.len(rtxn)? / 500) as usize; + let max_string_count = max_string_count.clamp(1000, 100_000); + let max_number_count = max_number_count.clamp(1000, 100_000); transpose_and_freeze_caches(&mut caches)? .into_par_iter() .map(|frozen| { - let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); + let mut facet_field_ids_delta = + FacetFieldIdsDelta::new(max_string_count, max_number_count); let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); - merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { + merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? 
{ Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { facet_field_ids_delta.register_from_key(key); - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -129,7 +133,10 @@ pub fn merge_and_send_facet_docids<'extractor>( Ok(facet_field_ids_delta) }) - .reduce(|| Ok(FacetFieldIdsDelta::default()), |lhs, rhs| Ok(lhs?.merge(rhs?))) + .reduce( + || Ok(FacetFieldIdsDelta::new(max_string_count, max_number_count)), + |lhs, rhs| Ok(lhs?.merge(rhs?)), + ) } pub struct FacetDatabases<'a> { @@ -158,60 +165,131 @@ impl<'a> FacetDatabases<'a> { } } -#[derive(Debug, Default)] +#[derive(Debug)] +pub enum FacetFieldIdDelta { + Bulk, + Incremental(Vec), +} + +impl FacetFieldIdDelta { + fn push(&mut self, facet_value: &[u8], max_count: usize) { + *self = match std::mem::replace(self, FacetFieldIdDelta::Bulk) { + FacetFieldIdDelta::Bulk => FacetFieldIdDelta::Bulk, + FacetFieldIdDelta::Incremental(mut v) => { + if v.len() >= max_count { + FacetFieldIdDelta::Bulk + } else { + v.push(FacetFieldIdChange { facet_value: facet_value.into() }); + FacetFieldIdDelta::Incremental(v) + } + } + } + } + + fn merge(&mut self, rhs: Option, max_count: usize) { + let Some(rhs) = rhs else { + return; + }; + *self = match (std::mem::replace(self, FacetFieldIdDelta::Bulk), rhs) { + (FacetFieldIdDelta::Bulk, _) | (_, FacetFieldIdDelta::Bulk) => FacetFieldIdDelta::Bulk, + ( + FacetFieldIdDelta::Incremental(mut left), + FacetFieldIdDelta::Incremental(mut right), + ) => { + if left.len() + right.len() >= max_count { + FacetFieldIdDelta::Bulk + } else { + left.append(&mut right); + FacetFieldIdDelta::Incremental(left) + } + } + }; + } +} + +#[derive(Debug)] pub struct FacetFieldIdsDelta { /// The field ids that have been modified - modified_facet_string_ids: HashSet, - modified_facet_number_ids: HashSet, + modified_facet_string_ids: HashMap, + modified_facet_number_ids: HashMap, + max_string_count: usize, + max_number_count: usize, } impl FacetFieldIdsDelta { - fn register_facet_string_id(&mut self, field_id: FieldId) { - self.modified_facet_string_ids.insert(field_id); + pub fn new(max_string_count: usize, max_number_count: usize) -> Self { + Self { + max_string_count, + max_number_count, + modified_facet_string_ids: Default::default(), + modified_facet_number_ids: Default::default(), + } } - fn register_facet_number_id(&mut self, field_id: FieldId) { - self.modified_facet_number_ids.insert(field_id); + fn register_facet_string_id(&mut self, field_id: FieldId, facet_value: &[u8]) { + self.modified_facet_string_ids + .entry(field_id) + .or_insert(FacetFieldIdDelta::Incremental(Default::default())) + .push(facet_value, self.max_string_count); + } + + fn register_facet_number_id(&mut self, field_id: FieldId, facet_value: &[u8]) { + self.modified_facet_number_ids + .entry(field_id) + .or_insert(FacetFieldIdDelta::Incremental(Default::default())) + .push(facet_value, self.max_number_count); } fn register_from_key(&mut self, key: &[u8]) { - let (facet_kind, field_id) = self.extract_key_data(key); - match facet_kind { - FacetKind::Number => self.register_facet_number_id(field_id), - FacetKind::String => self.register_facet_string_id(field_id), + let (facet_kind, field_id, facet_value) = self.extract_key_data(key); + match (facet_kind, facet_value) { + (FacetKind::Number, 
Some(facet_value)) => { + self.register_facet_number_id(field_id, facet_value) + } + (FacetKind::String, Some(facet_value)) => { + self.register_facet_string_id(field_id, facet_value) + } _ => (), } } - fn extract_key_data(&self, key: &[u8]) -> (FacetKind, FieldId) { + fn extract_key_data<'key>(&self, key: &'key [u8]) -> (FacetKind, FieldId, Option<&'key [u8]>) { let facet_kind = FacetKind::from(key[0]); let field_id = FieldId::from_be_bytes([key[1], key[2]]); - (facet_kind, field_id) + let facet_value = if key.len() >= 4 { + // level is also stored in the key at [3] (always 0) + Some(&key[4..]) + } else { + None + }; + + (facet_kind, field_id, facet_value) } - pub fn modified_facet_string_ids(&self) -> Option> { - if self.modified_facet_string_ids.is_empty() { - None - } else { - Some(self.modified_facet_string_ids.iter().copied().collect()) - } + pub fn consume_facet_string_delta( + &mut self, + ) -> impl Iterator + '_ { + self.modified_facet_string_ids.drain() } - pub fn modified_facet_number_ids(&self) -> Option> { - if self.modified_facet_number_ids.is_empty() { - None - } else { - Some(self.modified_facet_number_ids.iter().copied().collect()) - } + pub fn consume_facet_number_delta( + &mut self, + ) -> impl Iterator + '_ { + self.modified_facet_number_ids.drain() } pub fn merge(mut self, rhs: Self) -> Self { - let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs; - modified_facet_number_ids.into_iter().for_each(|fid| { - self.modified_facet_number_ids.insert(fid); + // rhs.max_xx_count is assumed to be equal to self.max_xx_count, and so gets unused + let Self { modified_facet_number_ids, modified_facet_string_ids, .. } = rhs; + modified_facet_number_ids.into_iter().for_each(|(fid, mut delta)| { + let old_delta = self.modified_facet_number_ids.remove(&fid); + delta.merge(old_delta, self.max_number_count); + self.modified_facet_number_ids.insert(fid, delta); }); - modified_facet_string_ids.into_iter().for_each(|fid| { - self.modified_facet_string_ids.insert(fid); + modified_facet_string_ids.into_iter().for_each(|(fid, mut delta)| { + let old_delta = self.modified_facet_string_ids.remove(&fid); + delta.merge(old_delta, self.max_string_count); + self.modified_facet_string_ids.insert(fid, delta); }); self } @@ -238,8 +316,12 @@ fn merge_cbo_bitmaps( (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), (Some(current), Some(del), add) => { + debug_assert!( + del.is_subset(¤t), + "del is not a subset of current, which must be impossible." 
+ ); let output = match add { - Some(add) => (¤t - del) | add, + Some(add) => (¤t - (&del - &add)) | (add - del), None => ¤t - del, }; if output.is_empty() { @@ -252,10 +334,3 @@ fn merge_cbo_bitmaps( } } } - -/// TODO Return the slice directly from the serialize_into method -fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(bitmap, buffer); - buffer.as_slice() -} diff --git a/crates/milli/src/update/new/mod.rs b/crates/milli/src/update/new/mod.rs index 87995ee55..81ff93e54 100644 --- a/crates/milli/src/update/new/mod.rs +++ b/crates/milli/src/update/new/mod.rs @@ -1,4 +1,5 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update}; +pub use indexer::ChannelCongestion; pub use merger::{ merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta, }; @@ -16,6 +17,7 @@ pub mod indexer; mod merger; mod parallel_iterator_ext; mod ref_cell_ext; +pub mod reindex; pub(crate) mod steps; pub(crate) mod thread_local; pub mod vector_document; diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs index c66f4af0a..77f5fa800 100644 --- a/crates/milli/src/update/new/ref_cell_ext.rs +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -5,6 +5,7 @@ pub trait RefCellExt { &self, ) -> std::result::Result, std::cell::BorrowMutError>; + #[track_caller] fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { self.try_borrow_mut_or_yield().unwrap() } diff --git a/crates/milli/src/update/new/reindex.rs b/crates/milli/src/update/new/reindex.rs new file mode 100644 index 000000000..6bfeb123e --- /dev/null +++ b/crates/milli/src/update/new/reindex.rs @@ -0,0 +1,38 @@ +use heed::RwTxn; + +use super::document::{Document, DocumentFromDb}; +use crate::progress::{self, AtomicSubStep, Progress}; +use crate::{FieldDistribution, Index, Result}; + +pub fn field_distribution(index: &Index, wtxn: &mut RwTxn<'_>, progress: &Progress) -> Result<()> { + let mut distribution = FieldDistribution::new(); + + let document_count = index.number_of_documents(wtxn)?; + let field_id_map = index.fields_ids_map(wtxn)?; + + let (update_document_count, sub_step) = + AtomicSubStep::::new(document_count as u32); + progress.update_progress(sub_step); + + let docids = index.documents_ids(wtxn)?; + + for docid in docids { + update_document_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + + let Some(document) = DocumentFromDb::new(docid, wtxn, index, &field_id_map)? 
else { + continue; + }; + let geo_iter = document.geo_field().transpose().map(|res| res.map(|rv| ("_geo", rv))); + for res in document.iter_top_level_fields().chain(geo_iter) { + let (field_name, _) = res?; + if let Some(count) = distribution.get_mut(field_name) { + *count += 1; + } else { + distribution.insert(field_name.to_owned(), 1); + } + } + } + + index.put_field_distribution(wtxn, &distribution)?; + Ok(()) +} diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index 7c2441933..ad8fe9cb1 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -1,8 +1,12 @@ +use std::borrow::Cow; + use enum_iterator::Sequence; +use crate::progress::Step; + #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] -#[repr(u16)] -pub enum Step { +#[repr(u8)] +pub enum IndexingStep { PreparingPayloads, ExtractingDocuments, ExtractingFacets, @@ -10,38 +14,39 @@ pub enum Step { ExtractingWordProximity, ExtractingEmbeddings, WritingGeoPoints, - WritingToDatabase, - WritingEmbeddingsToDatabase, + WaitingForDatabaseWrites, WaitingForExtractors, + WritingEmbeddingsToDatabase, PostProcessingFacets, PostProcessingWords, Finalizing, } -impl Step { - pub fn name(&self) -> &'static str { +impl Step for IndexingStep { + fn name(&self) -> Cow<'static, str> { match self { - Step::PreparingPayloads => "preparing update file", - Step::ExtractingDocuments => "extracting documents", - Step::ExtractingFacets => "extracting facets", - Step::ExtractingWords => "extracting words", - Step::ExtractingWordProximity => "extracting word proximity", - Step::ExtractingEmbeddings => "extracting embeddings", - Step::WritingGeoPoints => "writing geo points", - Step::WritingToDatabase => "writing to database", - Step::WritingEmbeddingsToDatabase => "writing embeddings to database", - Step::WaitingForExtractors => "waiting for extractors", - Step::PostProcessingFacets => "post-processing facets", - Step::PostProcessingWords => "post-processing words", - Step::Finalizing => "finalizing", + IndexingStep::PreparingPayloads => "preparing update file", + IndexingStep::ExtractingDocuments => "extracting documents", + IndexingStep::ExtractingFacets => "extracting facets", + IndexingStep::ExtractingWords => "extracting words", + IndexingStep::ExtractingWordProximity => "extracting word proximity", + IndexingStep::ExtractingEmbeddings => "extracting embeddings", + IndexingStep::WritingGeoPoints => "writing geo points", + IndexingStep::WaitingForDatabaseWrites => "waiting for database writes", + IndexingStep::WaitingForExtractors => "waiting for extractors", + IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", + IndexingStep::PostProcessingFacets => "post-processing facets", + IndexingStep::PostProcessingWords => "post-processing words", + IndexingStep::Finalizing => "finalizing", } + .into() } - pub fn finished_steps(self) -> u16 { - self as u16 + fn current(&self) -> u32 { + *self as u32 } - pub const fn total_steps() -> u16 { - Self::CARDINALITY as u16 + fn total(&self) -> u32 { + Self::CARDINALITY as u32 } } diff --git a/crates/milli/src/update/new/thread_local.rs b/crates/milli/src/update/new/thread_local.rs index acdc78c7b..a5af2b0bb 100644 --- a/crates/milli/src/update/new/thread_local.rs +++ b/crates/milli/src/update/new/thread_local.rs @@ -29,9 +29,9 @@ use std::cell::RefCell; /// - An example of a type that verifies (1) and (2) is [`std::rc::Rc`] (when `T` is `Send` and `Sync`). 
/// - An example of a type that doesn't verify (1) is thread-local data. /// - An example of a type that doesn't verify (2) is [`std::sync::MutexGuard`]: a lot of mutex implementations require that -/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this -/// invariant will cause Undefined Behavior -/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). +/// a lock is returned to the operating system on the same thread that initially locked the mutex, failing to uphold this +/// invariant will cause Undefined Behavior +/// (see last § in [the nomicon](https://doc.rust-lang.org/nomicon/send-and-sync.html)). /// /// It is **always safe** to implement this trait on a type that is `Send`, but no placeholder impl is provided due to limitations in /// coherency. Use the [`FullySend`] wrapper in this situation. @@ -86,7 +86,7 @@ impl MostlySendWrapper { /// # Safety /// /// 1. `T` is [`MostlySend`], so by its safety contract it can be accessed by any thread and all of its operations are available -/// from any thread. +/// from any thread. /// 2. (P1) of `MostlySendWrapper::new` forces the user to never access the value from multiple threads concurrently. unsafe impl Send for MostlySendWrapper {} diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 319730db0..a52dab6a1 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -1,19 +1,19 @@ use std::collections::BTreeSet; use bumpalo::Bump; +use bumparaw_collections::RawMap; use deserr::{Deserr, IntoValue}; use heed::RoTxn; -use raw_collections::RawMap; +use rustc_hash::FxBuildHasher; use serde::Serialize; use serde_json::value::RawValue; use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use super::indexer::de::DeserrRawValue; +use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::documents::FieldIdMapper; use crate::index::IndexEmbeddingConfig; -use crate::vector::parsed_vectors::{ - RawVectors, RawVectorsError, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME, -}; +use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; use crate::{DocumentId, Index, InternalError, Result, UserError}; @@ -84,7 +84,7 @@ pub struct VectorDocumentFromDb<'t> { docid: DocumentId, embedding_config: Vec, index: &'t Index, - vectors_field: Option>, + vectors_field: Option>, rtxn: &'t RoTxn<'t>, doc_alloc: &'t Bump, } @@ -102,9 +102,10 @@ impl<'t> VectorDocumentFromDb<'t> { }; let vectors = document.vectors_field()?; let vectors_field = match vectors { - Some(vectors) => { - Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) - } + Some(vectors) => Some( + RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc) + .map_err(InternalError::SerdeJson)?, + ), None => None, }; @@ -220,7 +221,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, - vectors: RawMap<'doc>, + vectors: RawMap<'doc, FxBuildHasher>, embedders: &'doc EmbeddingConfigs, } @@ -233,8 +234,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> { ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? 
{ - let vectors = - RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; + let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump) + .map_err(UserError::SerdeJson)?; Ok(Some(Self { external_document_id, vectors, embedders })) } else { Ok(None) diff --git a/crates/milli/src/update/new/word_fst_builder.rs b/crates/milli/src/update/new/word_fst_builder.rs index 2b1c4604b..a9a5222be 100644 --- a/crates/milli/src/update/new/word_fst_builder.rs +++ b/crates/milli/src/update/new/word_fst_builder.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::BufWriter; use fst::{Set, SetBuilder, Streamer}; @@ -75,18 +75,18 @@ pub struct PrefixData { #[derive(Debug)] pub struct PrefixDelta { - pub modified: HashSet, - pub deleted: HashSet, + pub modified: BTreeSet, + pub deleted: BTreeSet, } struct PrefixFstBuilder { - prefix_count_threshold: u64, + prefix_count_threshold: usize, max_prefix_length: usize, /// TODO: Replace the full memory allocation prefix_fst_builders: Vec>>, current_prefix: Vec, - current_prefix_count: Vec, - modified_prefixes: HashSet, + current_prefix_count: Vec, + modified_prefixes: BTreeSet, current_prefix_is_modified: Vec, } @@ -95,7 +95,7 @@ impl PrefixFstBuilder { let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = prefix_settings; - if !compute_prefixes { + if compute_prefixes != crate::index::PrefixSearch::IndexingTime { return None; } @@ -110,7 +110,7 @@ impl PrefixFstBuilder { prefix_fst_builders, current_prefix: vec![Prefix::new(); max_prefix_length], current_prefix_count: vec![0; max_prefix_length], - modified_prefixes: HashSet::new(), + modified_prefixes: BTreeSet::new(), current_prefix_is_modified: vec![false; max_prefix_length], }) } @@ -180,7 +180,7 @@ impl PrefixFstBuilder { let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? }; let new_prefix_fst = Set::new(&prefix_fst_mmap)?; let old_prefix_fst = index.words_prefixes_fst(rtxn)?; - let mut deleted_prefixes = HashSet::new(); + let mut deleted_prefixes = BTreeSet::new(); { let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference(); while let Some(prefix) = deleted_prefixes_stream.next() { diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 338d22505..9abd01bac 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::collections::HashSet; +use std::collections::BTreeSet; use std::io::{BufReader, BufWriter, Read, Seek, Write}; use hashbrown::HashMap; @@ -25,7 +25,7 @@ impl WordPrefixDocids { fn new( database: Database, prefix_database: Database, - grenad_parameters: GrenadParameters, + grenad_parameters: &GrenadParameters, ) -> WordPrefixDocids { WordPrefixDocids { database, @@ -37,8 +37,8 @@ impl WordPrefixDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -48,7 +48,7 @@ impl WordPrefixDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. 
// And collect the CboRoaringBitmaps pointers in an HashMap. @@ -76,7 +76,7 @@ impl WordPrefixDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); file.write_all(buffer) })?; @@ -127,7 +127,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -149,7 +149,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { } } -unsafe impl<'a, 'rtxn> Sync for FrozenPrefixBitmaps<'a, 'rtxn> {} +unsafe impl Sync for FrozenPrefixBitmaps<'_, '_> {} struct WordPrefixIntegerDocids { database: Database, @@ -161,7 +161,7 @@ impl WordPrefixIntegerDocids { fn new( database: Database, prefix_database: Database, - grenad_parameters: GrenadParameters, + grenad_parameters: &GrenadParameters, ) -> WordPrefixIntegerDocids { WordPrefixIntegerDocids { database, @@ -173,8 +173,8 @@ impl WordPrefixIntegerDocids { fn execute( self, wtxn: &mut heed::RwTxn, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, ) -> Result<()> { delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; self.recompute_modified_prefixes(wtxn, prefix_to_compute) @@ -184,7 +184,7 @@ impl WordPrefixIntegerDocids { fn recompute_modified_prefixes( &self, wtxn: &mut RwTxn, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We fetch the docids associated to the newly added word prefix fst only. // And collect the CboRoaringBitmaps pointers in an HashMap. @@ -205,15 +205,22 @@ impl WordPrefixIntegerDocids { let (ref mut index, ref mut file, ref mut buffer) = *refmut; for (&pos, bitmaps_bytes) in frozen.bitmaps(prefix).unwrap() { - let output = bitmaps_bytes - .iter() - .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes)) - .union()?; - - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); - index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); - file.write_all(buffer)?; + if bitmaps_bytes.is_empty() { + index.push(PrefixIntegerEntry { prefix, pos, serialized_length: None }); + } else { + let output = bitmaps_bytes + .iter() + .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes)) + .union()?; + buffer.clear(); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); + index.push(PrefixIntegerEntry { + prefix, + pos, + serialized_length: Some(buffer.len()), + }); + file.write_all(buffer)?; + } } Result::Ok(()) @@ -230,14 +237,24 @@ impl WordPrefixIntegerDocids { file.rewind()?; let mut file = BufReader::new(file); for PrefixIntegerEntry { prefix, pos, serialized_length } in index { - buffer.resize(serialized_length, 0); - file.read_exact(&mut buffer)?; - key_buffer.clear(); key_buffer.extend_from_slice(prefix.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(&pos.to_be_bytes()); - self.prefix_database.remap_data_type::().put(wtxn, &key_buffer, &buffer)?; + match serialized_length { + Some(serialized_length) => { + buffer.resize(serialized_length, 0); + file.read_exact(&mut buffer)?; + self.prefix_database.remap_data_type::().put( + wtxn, + &key_buffer, + &buffer, + )?; + } + None => { + self.prefix_database.delete(wtxn, &key_buffer)?; + } + } } } @@ -249,7 +266,7 @@ impl WordPrefixIntegerDocids { struct PrefixIntegerEntry<'a> { prefix: &'a 
str, pos: u16, - serialized_length: usize, + serialized_length: Option, } /// TODO doc @@ -262,7 +279,7 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { pub fn from_prefixes( database: Database, rtxn: &'rtxn RoTxn, - prefixes: &'a HashSet, + prefixes: &'a BTreeSet, ) -> heed::Result { let database = database.remap_data_type::(); @@ -285,13 +302,13 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { } } -unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {} +unsafe impl Sync for FrozenPrefixIntegerBitmaps<'_, '_> {} #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn delete_prefixes( wtxn: &mut RwTxn, prefix_database: &Database, - prefixes: &HashSet, + prefixes: &BTreeSet, ) -> Result<()> { // We remove all the entries that are no more required in this word prefix docids database. for prefix in prefixes { @@ -309,9 +326,9 @@ fn delete_prefixes( pub fn compute_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, - grenad_parameters: GrenadParameters, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, + grenad_parameters: &GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( index.word_docids.remap_key_type(), @@ -325,9 +342,9 @@ pub fn compute_word_prefix_docids( pub fn compute_exact_word_prefix_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, - grenad_parameters: GrenadParameters, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, + grenad_parameters: &GrenadParameters, ) -> Result<()> { WordPrefixDocids::new( index.exact_word_docids.remap_key_type(), @@ -341,9 +358,9 @@ pub fn compute_exact_word_prefix_docids( pub fn compute_word_prefix_fid_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, - grenad_parameters: GrenadParameters, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, + grenad_parameters: &GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( index.word_fid_docids.remap_key_type(), @@ -357,9 +374,9 @@ pub fn compute_word_prefix_fid_docids( pub fn compute_word_prefix_position_docids( wtxn: &mut RwTxn, index: &Index, - prefix_to_compute: &HashSet, - prefix_to_delete: &HashSet, - grenad_parameters: GrenadParameters, + prefix_to_compute: &BTreeSet, + prefix_to_delete: &BTreeSet, + grenad_parameters: &GrenadParameters, ) -> Result<()> { WordPrefixIntegerDocids::new( index.word_position_docids.remap_key_type(), diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index ccfdb1711..37761f649 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -6,31 +6,35 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; -use itertools::{EitherOrBoth, Itertools}; +use itertools::{merge_join_by, EitherOrBoth, Itertools}; use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; -use super::del_add::DelAddOperation; +use super::del_add::{DelAdd, DelAddOperation}; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; +use crate::attribute_patterns::PatternMatch; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::criterion::Criterion; use crate::error::UserError; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; +use 
crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ - IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; use crate::prompt::default_max_bytes; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::settings::{ - check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, - WriteBackToDocuments, + EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, + SubEmbeddingSettings, WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result}; +use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -154,7 +158,7 @@ pub struct Settings<'a, 't, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - filterable_fields: Setting>, + filterable_fields: Setting>, sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, @@ -177,6 +181,8 @@ pub struct Settings<'a, 't, 'i> { embedder_settings: Setting>>, search_cutoff: Setting, localized_attributes_rules: Setting>, + prefix_search: Setting, + facet_search: Setting, } impl<'a, 't, 'i> Settings<'a, 't, 'i> { @@ -212,6 +218,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { embedder_settings: Setting::NotSet, search_cutoff: Setting::NotSet, localized_attributes_rules: Setting::NotSet, + prefix_search: Setting::NotSet, + facet_search: Setting::NotSet, indexer_config, } } @@ -236,8 +244,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.filterable_fields = Setting::Reset; } - pub fn set_filterable_fields(&mut self, names: HashSet) { - self.filterable_fields = Setting::Set(names); + pub fn set_filterable_fields(&mut self, rules: Vec) { + self.filterable_fields = Setting::Set(rules); } pub fn set_sortable_fields(&mut self, names: HashSet) { @@ -418,6 +426,22 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.localized_attributes_rules = Setting::Reset; } + pub fn set_prefix_search(&mut self, value: PrefixSearch) { + self.prefix_search = Setting::Set(value); + } + + pub fn reset_prefix_search(&mut self) { + self.prefix_search = Setting::Reset; + } + + pub fn set_facet_search(&mut self, value: bool) { + self.facet_search = Setting::Set(value); + } + + pub fn reset_facet_search(&mut self) { + self.facet_search = Setting::Reset; + } + #[tracing::instrument( level = "trace" skip(self, progress_callback, should_abort, settings_diff), @@ -495,7 +519,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } /// Updates the index's searchable attributes. 
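// Hedged usage sketch of the new settings knobs introduced above (`prefix_search`,
// `facet_search`, and rule-based `filterable_fields`), assuming an already-constructed
// `Settings` builder from this module; the `configure` helper itself is illustrative.
fn configure(settings: &mut Settings<'_, '_, '_>) {
    use crate::index::PrefixSearch;

    // Compute prefix databases at indexing time.
    settings.set_prefix_search(PrefixSearch::IndexingTime);
    // Turn facet search off for this index.
    settings.set_facet_search(false);
    // Filterable attributes are now a list of `FilterableAttributesRule`s rather than
    // a `HashSet<String>` of field names; an empty list explicitly sets no rule.
    settings.set_filterable_fields(Vec::new());
}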
- fn update_searchable(&mut self) -> Result { + fn update_user_defined_searchable_attributes(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else @@ -508,26 +532,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); - // Add all the searchable attributes to the field map, and then add the - // remaining fields from the old field map to the new one - for name in names.iter() { - // The fields ids map won't change the field id of already present elements thus only the - // new fields will be inserted. - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - self.index.put_all_searchable_fields_from_fields_ids_map( - self.wtxn, - &names, - &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), - &fields_ids_map, - )?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_user_defined_searchable_fields(self.wtxn, &names)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -552,7 +560,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // Does the new FST differ from the previous one? if current - .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) + .is_none_or(|current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { // we want to re-create our FST. self.index.put_stop_words(self.wtxn, &fst)?; @@ -572,7 +580,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let current = self.index.non_separator_tokens(self.wtxn)?; // Does the new list differ from the previous one? - if current.map_or(true, |current| ¤t != non_separator_tokens) { + if current.is_none_or(|current| ¤t != non_separator_tokens) { self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?; true } else { @@ -597,7 +605,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let current = self.index.separator_tokens(self.wtxn)?; // Does the new list differ from the previous one? - if current.map_or(true, |current| ¤t != separator_tokens) { + if current.is_none_or(|current| ¤t != separator_tokens) { self.index.put_separator_tokens(self.wtxn, separator_tokens)?; true } else { @@ -622,7 +630,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let current = self.index.dictionary(self.wtxn)?; // Does the new list differ from the previous one? 
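// Aside: the `map_or(true, …)` calls being replaced throughout this file are equivalent
// to `Option::is_none_or`, which reads closer to the intent ("unchanged unless a current
// value exists and differs"). A minimal equivalence check:
fn is_none_or_matches_map_or() {
    let values: [Option<u32>; 3] = [None, Some(3), Some(4)];
    for current in values {
        assert_eq!(current.is_none_or(|v| v != 3), current.map_or(true, |v| v != 3));
    }
}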
- if current.map_or(true, |current| ¤t != dictionary) { + if current.is_none_or(|current| ¤t != dictionary) { self.index.put_dictionary(self.wtxn, dictionary)?; true } else { @@ -739,14 +747,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - let mut new_facets = HashSet::new(); - for name in fields { - new_facets.insert(name.clone()); - } - self.index.put_filterable_fields(self.wtxn, &new_facets)?; + self.index.put_filterable_attributes_rules(self.wtxn, fields)?; } Setting::Reset => { - self.index.delete_filterable_fields(self.wtxn)?; + self.index.delete_filterable_attributes_rules(self.wtxn)?; } Setting::NotSet => (), } @@ -944,7 +948,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { false } else { self.index.put_proximity_precision(self.wtxn, new)?; - true + old.is_some() || new != ProximityPrecision::default() } } Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, @@ -954,6 +958,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } + fn update_prefix_search(&mut self) -> Result { + let changed = match self.prefix_search { + Setting::Set(new) => { + let old = self.index.prefix_search(self.wtxn)?; + if old == Some(new) { + false + } else { + self.index.put_prefix_search(self.wtxn, new)?; + old.is_some() || new != PrefixSearch::default() + } + } + Setting::Reset => self.index.delete_prefix_search(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + + fn update_facet_search(&mut self) -> Result { + let changed = match self.facet_search { + Setting::Set(new) => { + let old = self.index.facet_search(self.wtxn)?; + if old == new { + false + } else { + self.index.put_facet_search(self.wtxn, new)?; + true + } + } + Setting::Reset => self.index.delete_facet_search(self.wtxn)?, + Setting::NotSet => false, + }; + + Ok(changed) + } + fn update_embedding_configs(&mut self) -> Result> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), @@ -1200,15 +1240,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_separator_tokens()?; self.update_dictionary()?; self.update_synonyms()?; - self.update_searchable()?; + self.update_user_defined_searchable_attributes()?; self.update_exact_attributes()?; self.update_proximity_precision()?; + self.update_prefix_search()?; + self.update_facet_search()?; self.update_localized_attributes_rules()?; let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; - new_inner_settings.recompute_facets(self.wtxn, self.index)?; + new_inner_settings.recompute_searchables(self.wtxn, self.index)?; let primary_key_id = self .index @@ -1260,8 +1302,8 @@ impl InnerIndexSettingsDiff { settings_update_only: bool, ) -> Self { let only_additional_fields = match ( - &old_settings.user_defined_searchable_fields, - &new_settings.user_defined_searchable_fields, + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, ) { (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { @@ -1282,14 +1324,15 @@ impl InnerIndexSettingsDiff { || old_settings.allowed_separators != new_settings.allowed_separators || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision - || old_settings.localized_searchable_fields_ids - != 
new_settings.localized_searchable_fields_ids + || old_settings.prefix_search != new_settings.prefix_search + || old_settings.localized_attributes_rules + != new_settings.localized_attributes_rules }; let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_fields - != new_settings.user_defined_searchable_fields; + let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes + != new_settings.user_defined_searchable_attributes; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { @@ -1297,7 +1340,7 @@ impl InnerIndexSettingsDiff { new_settings.embedding_configs.inner_as_ref() { let was_quantized = - old_settings.embedding_configs.get(embedder_name).map_or(false, |conf| conf.2); + old_settings.embedding_configs.get(embedder_name).is_some_and(|conf| conf.2); // skip embedders that don't use document templates if !config.uses_document_template() { continue; @@ -1372,27 +1415,75 @@ impl InnerIndexSettingsDiff { } } + /// List the faceted fields from the inner fid map. + /// This is used to list the faceted fields when we are reindexing, + /// but it can't be used in document addition because the field id map must be exhaustive. + pub fn list_faceted_fields_from_fid_map(&self, del_add: DelAdd) -> BTreeSet { + let settings = match del_add { + DelAdd::Deletion => &self.old, + DelAdd::Addition => &self.new, + }; + + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| metadata.is_faceted(&settings.filterable_attributes_rules)) + .map(|(id, _)| id) + .collect() + } + + pub fn facet_fids_changed(&self) -> bool { + for eob in merge_join_by( + self.old.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.old.filterable_attributes_rules) + }), + self.new.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.new.filterable_attributes_rules) + }), + |(old_fid, _, _), (new_fid, _, _)| old_fid.cmp(new_fid), + ) { + match eob { + // If there is a difference, we need to reindex facet databases. + EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => return true, + // If the field is faceted in both old and new settings, we check the facet-searchable and facet level database. + EitherOrBoth::Both((_, _, old_metadata), (_, _, new_metadata)) => { + // Check if the field is facet-searchable in the old and new settings. + // If there is a difference, we need to reindex facet-search database. + let old_filterable_features = old_metadata + .filterable_attributes_features(&self.old.filterable_attributes_rules); + let new_filterable_features = new_metadata + .filterable_attributes_features(&self.new.filterable_attributes_rules); + let is_old_facet_searchable = + old_filterable_features.is_facet_searchable() && self.old.facet_search; + let is_new_facet_searchable = + new_filterable_features.is_facet_searchable() && self.new.facet_search; + if is_old_facet_searchable != is_new_facet_searchable { + return true; + } + + // Check if the field needs a facet level database in the old and new settings. + // If there is a difference, we need to reindex facet level databases. 
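// Aside: `facet_fids_changed` above walks the old and new field-id maps in lockstep
// using `itertools::merge_join_by`; each `EitherOrBoth` says whether a faceted field id
// exists only in the old map, only in the new one, or in both. Illustrative sketch with
// made-up field ids:
fn merge_join_by_lockstep() {
    use itertools::{merge_join_by, EitherOrBoth};

    let old_fids = [1u16, 2, 4];
    let new_fids = [2u16, 3, 4];
    let mut events = Vec::new();
    for eob in merge_join_by(old_fids.iter(), new_fids.iter(), |a, b| a.cmp(b)) {
        match eob {
            EitherOrBoth::Left(fid) => events.push(format!("only old: {fid}")),
            EitherOrBoth::Right(fid) => events.push(format!("only new: {fid}")),
            EitherOrBoth::Both(fid, _) => events.push(format!("both: {fid}")),
        }
    }
    assert_eq!(events, ["only old: 1", "both: 2", "only new: 3", "both: 4"]);
}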
+ let old_facet_level_database = old_metadata + .require_facet_level_database(&self.old.filterable_attributes_rules); + let new_facet_level_database = new_metadata + .require_facet_level_database(&self.new.filterable_attributes_rules); + if old_facet_level_database != new_facet_level_database { + return true; + } + } + } + } + + false + } + + pub fn global_facet_settings_changed(&self) -> bool { + self.old.localized_attributes_rules != self.new.localized_attributes_rules + || self.old.facet_search != self.new.facet_search + } + pub fn reindex_facets(&self) -> bool { - let existing_fields = &self.new.existing_fields; - if existing_fields.iter().any(|field| field.contains('.')) { - return true; - } - - let old_faceted_fields = &self.old.user_defined_faceted_fields; - if old_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - // If there is new faceted fields we indicate that we must reindex as we must - // index new fields as facets. It means that the distinct attribute, - // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = &self.new.user_defined_faceted_fields; - if new_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) - || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + self.facet_fids_changed() || self.global_facet_settings_changed() } pub fn reindex_vectors(&self) -> bool { @@ -1407,10 +1498,6 @@ impl InnerIndexSettingsDiff { self.old.geo_fields_ids != self.new.geo_fields_ids || (!self.settings_update_only && self.new.geo_fields_ids.is_some()) } - - pub fn modified_faceted_fields(&self) -> HashSet { - &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields - } } #[derive(Clone)] @@ -1418,20 +1505,19 @@ pub(crate) struct InnerIndexSettings { pub stop_words: Option>>, pub allowed_separators: Option>, pub dictionary: Option>, - pub fields_ids_map: FieldsIdsMap, - pub user_defined_faceted_fields: HashSet, - pub user_defined_searchable_fields: Option>, - pub faceted_fields_ids: HashSet, - pub searchable_fields_ids: Vec, + pub fields_ids_map: FieldIdMapWithMetadata, + pub localized_attributes_rules: Vec, + pub filterable_attributes_rules: Vec, + pub asc_desc_fields: HashSet, + pub distinct_field: Option, + pub user_defined_searchable_attributes: Option>, + pub sortable_fields: HashSet, pub exact_attributes: HashSet, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, - pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, - pub non_searchable_fields_ids: Vec, - pub non_faceted_fields_ids: Vec, - pub localized_searchable_fields_ids: LocalizedFieldIds, - pub localized_faceted_fields_ids: LocalizedFieldIds, + pub prefix_search: PrefixSearch, + pub facet_search: bool, } impl InnerIndexSettings { @@ -1445,97 +1531,65 @@ impl InnerIndexSettings { let allowed_separators = index.allowed_separators(rtxn)?; let dictionary = index.dictionary(rtxn)?; let mut fields_ids_map = index.fields_ids_map(rtxn)?; - let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; - let user_defined_searchable_fields = - user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); - let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let mut faceted_fields_ids = 
index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs(rtxn)?)?, }; - let existing_fields: HashSet<_> = index - .field_distribution(rtxn)? - .into_iter() - .filter_map(|(field, count)| (count != 0).then_some(field)) - .collect(); - // index.fields_ids_map($a)? ==>> fields_ids_map - let geo_fields_ids = match fields_ids_map.id("_geo") { - Some(gfid) => { - let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid); - let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid); + let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); + let facet_search = index.facet_search(rtxn)?; + let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) { + Some(_) if index.is_geo_enabled(rtxn)? => { // if `_geo` is faceted then we get the `lat` and `lng` - if is_sortable || is_filterable { - let field_ids = fields_ids_map - .insert("_geo.lat") - .zip(fields_ids_map.insert("_geo.lng")) - .ok_or(UserError::AttributeLimitReached)?; - Some(field_ids) - } else { - None - } + let field_ids = fields_ids_map + .insert("_geo.lat") + .zip(fields_ids_map.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) } - None => None, + _ => None, }; - let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; - let localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - searchable_fields_ids.iter().cloned(), - ); - let localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - faceted_fields_ids.iter().cloned(), - ); - - let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); - searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); - faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + let localized_attributes_rules = + index.localized_attributes_rules(rtxn)?.unwrap_or_default(); + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let sortable_fields = index.sortable_fields(rtxn)?; + let asc_desc_fields = index.asc_desc_fields(rtxn)?; + let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string()); + let user_defined_searchable_attributes = index + .user_defined_searchable_fields(rtxn)? 
+ .map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); + let builder = MetadataBuilder::from_index(index, rtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Self { stop_words, allowed_separators, dictionary, fields_ids_map, - user_defined_faceted_fields, - user_defined_searchable_fields, - faceted_fields_ids, - searchable_fields_ids, + localized_attributes_rules, + filterable_attributes_rules, + asc_desc_fields, + distinct_field, + user_defined_searchable_attributes, + sortable_fields, exact_attributes, proximity_precision, embedding_configs, - existing_fields, geo_fields_ids, - non_searchable_fields_ids: vectors_fids.clone(), - non_faceted_fields_ids: vectors_fids.clone(), - localized_searchable_fields_ids, - localized_faceted_fields_ids, + prefix_search, + facet_search, }) } - // find and insert the new field ids - pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> { - let new_facets = self - .fields_ids_map - .iter() - .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) - .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|(_fid, field)| field.to_string()) - .collect(); - index.put_faceted_fields(wtxn, &new_facets)?; - - self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.faceted_fields_ids.iter().cloned(), - ); - Ok(()) + pub fn match_faceted_field(&self, field: &str) -> PatternMatch { + match_faceted_field( + field, + &self.filterable_attributes_rules, + &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, + ) } // find and insert the new field ids @@ -1545,7 +1599,7 @@ impl InnerIndexSettings { index: &Index, ) -> Result<()> { let searchable_fields = self - .user_defined_searchable_fields + .user_defined_searchable_attributes .as_ref() .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::>()); @@ -1554,17 +1608,9 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, - &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } - self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.searchable_fields_ids.iter().cloned(), - ); Ok(()) } @@ -1582,7 +1628,8 @@ fn embedders(embedding_configs: Vec) -> Result) -> Result, -) -> Result> { - match new { - Setting::Set(EmbeddingSettings { - source, - model, - revision, - api_key, - dimensions, - document_template: Setting::Set(template), - document_template_max_bytes, - url, - request, - response, - distribution, - headers, - binary_quantized: binary_quantize, - }) => { - let max_bytes = match document_template_max_bytes.set() { + new_prompt: Setting, + max_bytes: Setting, +) -> Result> { + match new_prompt { + Setting::Set(template) => { + let max_bytes = match max_bytes.set() { Some(max_bytes) => NonZeroUsize::new(max_bytes).ok_or_else(|| { crate::error::UserError::InvalidSettingsDocumentTemplateMaxBytes { embedder_name: name.to_owned(), @@ -1631,21 +1665,7 @@ fn validate_prompt( .map(|prompt| crate::prompt::PromptData::from(prompt).template) .map_err(|inner| 
UserError::InvalidPromptForEmbeddings(name.to_owned(), inner))?; - Ok(Setting::Set(EmbeddingSettings { - source, - model, - revision, - api_key, - dimensions, - document_template: Setting::Set(template), - document_template_max_bytes, - url, - request, - response, - distribution, - headers, - binary_quantized: binary_quantize, - })) + Ok(Setting::Set(template)) } new => Ok(new), } @@ -1655,12 +1675,12 @@ pub fn validate_embedding_settings( settings: Setting, name: &str, ) -> Result> { - let settings = validate_prompt(name, settings)?; let Setting::Set(settings) = settings else { return Ok(settings) }; let EmbeddingSettings { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -1668,11 +1688,15 @@ pub fn validate_embedding_settings( url, request, response, + search_embedder, + mut indexing_embedder, distribution, headers, binary_quantized: binary_quantize, } = settings; + let document_template = validate_prompt(name, document_template, document_template_max_bytes)?; + if let Some(0) = dimensions.set() { return Err(crate::error::UserError::InvalidSettingsDimensions { embedder_name: name.to_owned(), @@ -1698,10 +1722,12 @@ pub fn validate_embedding_settings( } let Some(inferred_source) = source.set() else { + // we are validating the fused settings, so we always have a source return Ok(Setting::Set(EmbeddingSettings { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -1709,19 +1735,35 @@ pub fn validate_embedding_settings( url, request, response, + search_embedder, + indexing_embedder, distribution, headers, binary_quantized: binary_quantize, })); }; + EmbeddingSettings::check_settings( + name, + inferred_source, + NestingContext::NotNested, + &model, + &revision, + &pooling, + &dimensions, + &api_key, + &url, + &request, + &response, + &document_template, + &document_template_max_bytes, + &headers, + &search_embedder, + &indexing_embedder, + &binary_quantize, + &distribution, + )?; match inferred_source { EmbedderSource::OpenAi => { - check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; - - check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; - check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; - check_unset(&headers, EmbeddingSettings::HEADERS, inferred_source, name)?; - if let Setting::Set(model) = &model { let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str()) .ok_or(crate::error::UserError::InvalidOpenAiModel { @@ -1752,58 +1794,124 @@ pub fn validate_embedding_settings( } } } - EmbedderSource::Ollama => { - check_set(&model, EmbeddingSettings::MODEL, inferred_source, name)?; - check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; + EmbedderSource::Ollama + | EmbedderSource::HuggingFace + | EmbedderSource::UserProvided + | EmbedderSource::Rest => {} + EmbedderSource::Composite => { + if let Setting::Set(embedder) = &search_embedder { + if let Some(source) = embedder.source.set() { + let search_embedder = match embedder.search_embedder.clone() { + Setting::Set(search_embedder) => Setting::Set(deserialize_sub_embedder( + search_embedder, + name, + NestingContext::Search, + )?), + Setting::Reset => Setting::Reset, + Setting::NotSet => Setting::NotSet, + }; + let indexing_embedder = match embedder.indexing_embedder.clone() { + Setting::Set(indexing_embedder) => Setting::Set(deserialize_sub_embedder( + indexing_embedder, + name, + NestingContext::Search, + )?), + Setting::Reset => Setting::Reset, + Setting::NotSet => 
Setting::NotSet, + }; + EmbeddingSettings::check_nested_source(name, source, NestingContext::Search)?; + EmbeddingSettings::check_settings( + name, + source, + NestingContext::Search, + &embedder.model, + &embedder.revision, + &embedder.pooling, + &embedder.dimensions, + &embedder.api_key, + &embedder.url, + &embedder.request, + &embedder.response, + &embedder.document_template, + &embedder.document_template_max_bytes, + &embedder.headers, + &search_embedder, + &indexing_embedder, + &embedder.binary_quantized, + &embedder.distribution, + )?; + } else { + return Err(UserError::MissingSourceForNested { + embedder_name: NestingContext::Search.embedder_name_with_context(name), + } + .into()); + } + } - check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; - check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; - check_unset(&headers, EmbeddingSettings::HEADERS, inferred_source, name)?; - } - EmbedderSource::HuggingFace => { - check_unset(&api_key, EmbeddingSettings::API_KEY, inferred_source, name)?; - check_unset(&dimensions, EmbeddingSettings::DIMENSIONS, inferred_source, name)?; + indexing_embedder = if let Setting::Set(mut embedder) = indexing_embedder { + embedder.document_template = validate_prompt( + name, + embedder.document_template, + embedder.document_template_max_bytes, + )?; - check_unset(&url, EmbeddingSettings::URL, inferred_source, name)?; - check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; - check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; - check_unset(&headers, EmbeddingSettings::HEADERS, inferred_source, name)?; - } - EmbedderSource::UserProvided => { - check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; - check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; - check_unset(&api_key, EmbeddingSettings::API_KEY, inferred_source, name)?; - check_unset( - &document_template, - EmbeddingSettings::DOCUMENT_TEMPLATE, - inferred_source, - name, - )?; - check_unset( - &document_template_max_bytes, - EmbeddingSettings::DOCUMENT_TEMPLATE_MAX_BYTES, - inferred_source, - name, - )?; - check_set(&dimensions, EmbeddingSettings::DIMENSIONS, inferred_source, name)?; - - check_unset(&url, EmbeddingSettings::URL, inferred_source, name)?; - check_unset(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; - check_unset(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; - check_unset(&headers, EmbeddingSettings::HEADERS, inferred_source, name)?; - } - EmbedderSource::Rest => { - check_unset(&model, EmbeddingSettings::MODEL, inferred_source, name)?; - check_unset(&revision, EmbeddingSettings::REVISION, inferred_source, name)?; - check_set(&url, EmbeddingSettings::URL, inferred_source, name)?; - check_set(&request, EmbeddingSettings::REQUEST, inferred_source, name)?; - check_set(&response, EmbeddingSettings::RESPONSE, inferred_source, name)?; + if let Some(source) = embedder.source.set() { + let search_embedder = match embedder.search_embedder.clone() { + Setting::Set(search_embedder) => Setting::Set(deserialize_sub_embedder( + search_embedder, + name, + NestingContext::Indexing, + )?), + Setting::Reset => Setting::Reset, + Setting::NotSet => Setting::NotSet, + }; + let indexing_embedder = match embedder.indexing_embedder.clone() { + Setting::Set(indexing_embedder) => Setting::Set(deserialize_sub_embedder( + indexing_embedder, + name, + NestingContext::Indexing, + )?), + Setting::Reset => Setting::Reset, + Setting::NotSet => 
Setting::NotSet, + }; + EmbeddingSettings::check_nested_source(name, source, NestingContext::Indexing)?; + EmbeddingSettings::check_settings( + name, + source, + NestingContext::Indexing, + &embedder.model, + &embedder.revision, + &embedder.pooling, + &embedder.dimensions, + &embedder.api_key, + &embedder.url, + &embedder.request, + &embedder.response, + &embedder.document_template, + &embedder.document_template_max_bytes, + &embedder.headers, + &search_embedder, + &indexing_embedder, + &embedder.binary_quantized, + &embedder.distribution, + )?; + } else { + return Err(UserError::MissingSourceForNested { + embedder_name: NestingContext::Indexing.embedder_name_with_context(name), + } + .into()); + } + Setting::Set(embedder) + } else { + indexing_embedder + }; } } Ok(Setting::Set(EmbeddingSettings { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -1811,970 +1919,31 @@ pub fn validate_embedding_settings( url, request, response, + search_embedder, + indexing_embedder, distribution, headers, binary_quantized: binary_quantize, })) } -#[cfg(test)] -mod tests { - use big_s::S; - use heed::types::Bytes; - use maplit::{btreemap, btreeset, hashset}; - use meili_snap::snapshot; - - use super::*; - use crate::error::Error; - use crate::index::tests::TempIndex; - use crate::update::ClearDocuments; - use crate::{db_snap, Criterion, Filter, SearchResult}; - - #[test] - fn set_and_reset_searchable_fields() { - let index = TempIndex::new(); - - // First we send 3 documents with ids from 1 to 3. - let mut wtxn = index.write_txn().unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 1, "name": "kevin", "age": 23 }, - { "id": 2, "name": "kevina", "age": 21}, - { "id": 3, "name": "benoit", "age": 34 } - ]), - ) - .unwrap(); - - // We change the searchable fields to be the "name" field only. - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_searchable_fields(vec!["name".into()]); +fn deserialize_sub_embedder( + sub_embedder: serde_json::Value, + embedder_name: &str, + context: NestingContext, +) -> std::result::Result { + match deserr::deserialize::<_, _, deserr::errors::JsonError>(sub_embedder) { + Ok(sub_embedder) => Ok(sub_embedder), + Err(error) => { + let message = format!("{error}{}", context.nesting_embedders()); + Err(UserError::InvalidSettingsEmbedder { + embedder_name: context.embedder_name_with_context(embedder_name), + message, }) - .unwrap(); - - wtxn.commit().unwrap(); - - db_snap!(index, fields_ids_map, @r###" - 0 id | - 1 name | - 2 age | - "###); - db_snap!(index, searchable_fields, @r###"["name"]"###); - db_snap!(index, fieldids_weights_map, @r###" - fid weight - 1 0 | - "###); - - // Check that the searchable field is correctly set to "name" only. - let rtxn = index.read_txn().unwrap(); - // When we search for something that is not in - // the searchable fields it must not return any document. - let result = index.search(&rtxn).query("23").execute().unwrap(); - assert_eq!(result.documents_ids, Vec::::new()); - - // When we search for something that is in the searchable fields - // we must find the appropriate document. 
- let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); - let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - let fid_map = index.fields_ids_map(&rtxn).unwrap(); - assert_eq!(documents.len(), 1); - assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); - drop(rtxn); - - // We change the searchable fields to be the "name" field only. - index - .update_settings(|settings| { - settings.reset_searchable_fields(); - }) - .unwrap(); - - db_snap!(index, fields_ids_map, @r###" - 0 id | - 1 name | - 2 age | - "###); - db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); - db_snap!(index, fieldids_weights_map, @r###" - fid weight - 0 0 | - 1 0 | - 2 0 | - "###); - - // Check that the searchable field have been reset and documents are found now. - let rtxn = index.read_txn().unwrap(); - let fid_map = index.fields_ids_map(&rtxn).unwrap(); - let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); - snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); - // the searchable fields should contain all the fields - let searchable_fields = index.searchable_fields(&rtxn).unwrap(); - snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); - let result = index.search(&rtxn).query("23").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); - let documents = index.documents(&rtxn, result.documents_ids).unwrap(); - assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); - } - - #[test] - fn mixup_searchable_with_displayed_fields() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - // First we send 3 documents with ids from 1 to 3. - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ]), - ) - .unwrap(); - - // In the same transaction we change the displayed fields to be only the "age". - // We also change the searchable fields to be the "name" field only. - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_displayed_fields(vec!["age".into()]); - settings.set_searchable_fields(vec!["name".into()]); - }) - .unwrap(); - wtxn.commit().unwrap(); - - // Check that the displayed fields are correctly set to `None` (default value). - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.displayed_fields(&rtxn).unwrap(); - assert_eq!(fields_ids.unwrap(), (&["age"][..])); - drop(rtxn); - - // We change the searchable fields to be the "name" field only. - index - .update_settings(|settings| { - settings.reset_searchable_fields(); - }) - .unwrap(); - - // Check that the displayed fields always contains only the "age" field. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.displayed_fields(&rtxn).unwrap(); - assert_eq!(fields_ids.unwrap(), &["age"][..]); - } - - #[test] - fn default_displayed_fields() { - let index = TempIndex::new(); - - // First we send 3 documents with ids from 1 to 3. - index - .add_documents(documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ])) - .unwrap(); - - // Check that the displayed fields are correctly set to `None` (default value). 
- let rtxn = index.read_txn().unwrap(); - let fields_ids = index.displayed_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, None); - } - - #[test] - fn set_and_reset_displayed_field() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ]), - ) - .unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_displayed_fields(vec!["age".into()]); - }) - .unwrap(); - wtxn.commit().unwrap(); - - // Check that the displayed fields are correctly set to only the "age" field. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.displayed_fields(&rtxn).unwrap(); - assert_eq!(fields_ids.unwrap(), &["age"][..]); - drop(rtxn); - - // We reset the fields ids to become `None`, the default value. - index - .update_settings(|settings| { - settings.reset_displayed_fields(); - }) - .unwrap(); - - // Check that the displayed fields are correctly set to `None` (default value). - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.displayed_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, None); - } - - #[test] - fn set_filterable_fields() { - let index = TempIndex::new(); - - // Set the filterable fields to be the age. - index - .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age") }); - }) - .unwrap(); - - // Then index some documents. - index - .add_documents(documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ])) - .unwrap(); - - // Check that the displayed fields are correctly set. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age") }); - // Only count the field_id 0 and level 0 facet values. - // TODO we must support typed CSVs for numbers to be understood. - let fidmap = index.fields_ids_map(&rtxn).unwrap(); - for document in index.all_documents(&rtxn).unwrap() { - let document = document.unwrap(); - let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1) - .unwrap(); - println!("json: {:?}", json); } - let count = index - .facet_id_f64_docids - .remap_key_type::() - // The faceted field id is 2u16 - .prefix_iter(&rtxn, &[0, 2, 0]) - .unwrap() - .count(); - assert_eq!(count, 3); - drop(rtxn); - - // Index a little more documents with new and current facets values. - index - .add_documents(documents!([ - { "id": 3, "name": "kevin2", "age": 23}, - { "id": 4, "name": "kevina2", "age": 21 }, - { "id": 5, "name": "benoit", "age": 35 } - ])) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - // Only count the field_id 0 and level 0 facet values. - let count = index - .facet_id_f64_docids - .remap_key_type::() - .prefix_iter(&rtxn, &[0, 2, 0]) - .unwrap() - .count(); - assert_eq!(count, 4); - - // Set the filterable fields to be the age and the name. - index - .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("age"), S("name") }); - }) - .unwrap(); - - // Check that the displayed fields are correctly set. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("age"), S("name") }); - - let rtxn = index.read_txn().unwrap(); - // Only count the field_id 2 and level 0 facet values. 
- let count = index - .facet_id_f64_docids - .remap_key_type::() - .prefix_iter(&rtxn, &[0, 2, 0]) - .unwrap() - .count(); - assert_eq!(count, 4); - - let rtxn = index.read_txn().unwrap(); - // Only count the field_id 1 and level 0 facet values. - let count = index - .facet_id_string_docids - .remap_key_type::() - .prefix_iter(&rtxn, &[0, 1]) - .unwrap() - .count(); - assert_eq!(count, 5); - - // Remove the age from the filterable fields. - index - .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("name") }); - }) - .unwrap(); - - // Check that the displayed fields are correctly set. - let rtxn = index.read_txn().unwrap(); - let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset! { S("name") }); - - let rtxn = index.read_txn().unwrap(); - // Only count the field_id 2 and level 0 facet values. - let count = index - .facet_id_f64_docids - .remap_key_type::() - .prefix_iter(&rtxn, &[0, 2, 0]) - .unwrap() - .count(); - assert_eq!(count, 0); - - let rtxn = index.read_txn().unwrap(); - // Only count the field_id 1 and level 0 facet values. - let count = index - .facet_id_string_docids - .remap_key_type::() - .prefix_iter(&rtxn, &[0, 1]) - .unwrap() - .count(); - assert_eq!(count, 5); - } - - #[test] - fn set_asc_desc_field() { - let index = TempIndex::new(); - - // Set the filterable fields to be the age. - index - .update_settings(|settings| { - settings.set_displayed_fields(vec![S("name")]); - settings.set_criteria(vec![Criterion::Asc("age".to_owned())]); - }) - .unwrap(); - - // Then index some documents. - index - .add_documents(documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ])) - .unwrap(); - - // Run an empty query just to ensure that the search results are ordered. - let rtxn = index.read_txn().unwrap(); - let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); - let documents = index.documents(&rtxn, documents_ids).unwrap(); - - // Fetch the documents "age" field in the ordre in which the documents appear. - let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap(); - let iter = documents.into_iter().map(|(_, doc)| { - let bytes = doc.get(age_field_id).unwrap(); - let string = std::str::from_utf8(bytes).unwrap(); - string.parse::().unwrap() - }); - - assert_eq!(iter.collect::>(), vec![21, 23, 34]); - } - - #[test] - fn set_distinct_field() { - let index = TempIndex::new(); - - // Set the filterable fields to be the age. - index - .update_settings(|settings| { - // Don't display the generated `id` field. - settings.set_displayed_fields(vec![S("name"), S("age")]); - settings.set_distinct_field(S("age")); - }) - .unwrap(); - - // Then index some documents. - index - .add_documents(documents!([ - { "id": 0, "name": "kevin", "age": 23 }, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 }, - { "id": 3, "name": "bernard", "age": 34 }, - { "id": 4, "name": "bertrand", "age": 34 }, - { "id": 5, "name": "bernie", "age": 34 }, - { "id": 6, "name": "ben", "age": 34 } - ])) - .unwrap(); - - // Run an empty query just to ensure that the search results are ordered. - let rtxn = index.read_txn().unwrap(); - let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); - - // There must be at least one document with a 34 as the age. 
- assert_eq!(documents_ids.len(), 3); - } - - #[test] - fn set_nested_distinct_field() { - let index = TempIndex::new(); - - // Set the filterable fields to be the age. - index - .update_settings(|settings| { - // Don't display the generated `id` field. - settings.set_displayed_fields(vec![S("person")]); - settings.set_distinct_field(S("person.age")); - }) - .unwrap(); - - // Then index some documents. - index - .add_documents(documents!([ - { "id": 0, "person": { "name": "kevin", "age": 23 }}, - { "id": 1, "person": { "name": "kevina", "age": 21 }}, - { "id": 2, "person": { "name": "benoit", "age": 34 }}, - { "id": 3, "person": { "name": "bernard", "age": 34 }}, - { "id": 4, "person": { "name": "bertrand", "age": 34 }}, - { "id": 5, "person": { "name": "bernie", "age": 34 }}, - { "id": 6, "person": { "name": "ben", "age": 34 }} - ])) - .unwrap(); - - // Run an empty query just to ensure that the search results are ordered. - let rtxn = index.read_txn().unwrap(); - let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); - - // There must be at least one document with a 34 as the age. - assert_eq!(documents_ids.len(), 3); - } - - #[test] - fn default_stop_words() { - let index = TempIndex::new(); - - // First we send 3 documents with ids from 1 to 3. - index - .add_documents(documents!([ - { "id": 0, "name": "kevin", "age": 23}, - { "id": 1, "name": "kevina", "age": 21 }, - { "id": 2, "name": "benoit", "age": 34 } - ])) - .unwrap(); - - // Ensure there is no stop_words by default - let rtxn = index.read_txn().unwrap(); - let stop_words = index.stop_words(&rtxn).unwrap(); - assert!(stop_words.is_none()); - } - - #[test] - fn set_and_reset_stop_words() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - // First we send 3 documents with ids from 1 to 3. - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs" }, - { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, - { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, - ]), - ) - .unwrap(); - - // In the same transaction we provide some stop_words - let set = btreeset! 
{ "i".to_string(), "the".to_string(), "are".to_string() }; - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_stop_words(set.clone()); - }) - .unwrap(); - - wtxn.commit().unwrap(); - - // Ensure stop_words are effectively stored - let rtxn = index.read_txn().unwrap(); - let stop_words = index.stop_words(&rtxn).unwrap(); - assert!(stop_words.is_some()); // at this point the index should return something - - let stop_words = stop_words.unwrap(); - let expected = fst::Set::from_iter(&set).unwrap(); - assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); - - // when we search for something that is a non prefix stop_words it should be ignored - // thus we should get a placeholder search (all the results = 3) - let result = index.search(&rtxn).query("the ").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 3); - let result = index.search(&rtxn).query("i ").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 3); - let result = index.search(&rtxn).query("are ").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 3); - - let result = index.search(&rtxn).query("dog").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos - let result = index.search(&rtxn).query("benoĆ®t").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data - - // now we'll reset the stop_words and ensure it's None - index - .update_settings(|settings| { - settings.reset_stop_words(); - }) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - let stop_words = index.stop_words(&rtxn).unwrap(); - assert!(stop_words.is_none()); - - // now we can search for the stop words - let result = index.search(&rtxn).query("the").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); - let result = index.search(&rtxn).query("i").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); - let result = index.search(&rtxn).query("are").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); - - // the rest of the search is still not impacted - let result = index.search(&rtxn).query("dog").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos - let result = index.search(&rtxn).query("benoĆ®t").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data - } - - #[test] - fn set_and_reset_synonyms() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - // Send 3 documents with ids from 1 to 3. - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs"}, - { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, - { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, - ]), - ) - .unwrap(); - - // In the same transaction provide some synonyms - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_synonyms(btreemap! 
{ - "blini".to_string() => vec!["crepes".to_string()], - "super like".to_string() => vec!["love".to_string()], - "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] - }); - }) - .unwrap(); - wtxn.commit().unwrap(); - - // Ensure synonyms are effectively stored - let rtxn = index.read_txn().unwrap(); - let synonyms = index.synonyms(&rtxn).unwrap(); - assert!(!synonyms.is_empty()); // at this point the index should return something - - // Check that we can use synonyms - let result = index.search(&rtxn).query("blini").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); - let result = index.search(&rtxn).query("super like").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 1); - let result = index.search(&rtxn).query("puppies").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); - - // Reset the synonyms - index - .update_settings(|settings| { - settings.reset_synonyms(); - }) - .unwrap(); - - // Ensure synonyms are reset - let rtxn = index.read_txn().unwrap(); - let synonyms = index.synonyms(&rtxn).unwrap(); - assert!(synonyms.is_empty()); - - // Check that synonyms are no longer work - let result = index.search(&rtxn).query("blini").execute().unwrap(); - assert!(result.documents_ids.is_empty()); - let result = index.search(&rtxn).query("super like").execute().unwrap(); - assert!(result.documents_ids.is_empty()); - let result = index.search(&rtxn).query("puppies").execute().unwrap(); - assert!(result.documents_ids.is_empty()); - } - - #[test] - fn thai_synonyms() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - // Send 3 documents with ids from 1 to 3. - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "id": 0, "name": "ąø¢ąøµą¹ˆąø›ąøøą¹ˆąø™" }, - { "id": 1, "name": "ąøąøµą¹ˆąø›ąøøą¹ˆąø™" }, - ]), - ) - .unwrap(); - - // In the same transaction provide some synonyms - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_synonyms(btreemap! { - "japanese".to_string() => vec![S("ąøąøµą¹ˆąø›ąøøą¹ˆąø™"), S("ąø¢ąøµą¹ˆąø›ąøøą¹ˆąø™")], - }); - }) - .unwrap(); - wtxn.commit().unwrap(); - - // Ensure synonyms are effectively stored - let rtxn = index.read_txn().unwrap(); - let synonyms = index.synonyms(&rtxn).unwrap(); - assert!(!synonyms.is_empty()); // at this point the index should return something - - // Check that we can use synonyms - let result = index.search(&rtxn).query("japanese").execute().unwrap(); - assert_eq!(result.documents_ids.len(), 2); - } - - #[test] - fn setting_searchable_recomputes_other_settings() { - let index = TempIndex::new(); - - // Set all the settings except searchable - index - .update_settings(|settings| { - settings.set_displayed_fields(vec!["hello".to_string()]); - settings.set_filterable_fields(hashset! 
{ S("age"), S("toto") }); - settings.set_criteria(vec![Criterion::Asc(S("toto"))]); - }) - .unwrap(); - - // check the output - let rtxn = index.read_txn().unwrap(); - assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); - // since no documents have been pushed the primary key is still unset - assert!(index.primary_key(&rtxn).unwrap().is_none()); - assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); - drop(rtxn); - - // We set toto and age as searchable to force reordering of the fields - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); - }) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); - assert!(index.primary_key(&rtxn).unwrap().is_none()); - assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); - } - - #[test] - fn setting_not_filterable_cant_filter() { - let index = TempIndex::new(); - - // Set all the settings except searchable - index - .update_settings(|settings| { - settings.set_displayed_fields(vec!["hello".to_string()]); - // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - settings.set_criteria(vec![Criterion::Asc(S("toto"))]); - }) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - let filter = Filter::from_str("toto = 32").unwrap().unwrap(); - let _ = filter.evaluate(&rtxn, &index).unwrap_err(); - } - - #[test] - fn setting_primary_key() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - // Set the primary key settings - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("mykey")); - }) - .unwrap(); - wtxn.commit().unwrap(); - let mut wtxn = index.write_txn().unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); - - // Then index some documents with the "mykey" primary key. - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "mykey": 1, "name": "kevin", "age": 23 }, - { "mykey": 2, "name": "kevina", "age": 21 }, - { "mykey": 3, "name": "benoit", "age": 34 }, - { "mykey": 4, "name": "bernard", "age": 34 }, - { "mykey": 5, "name": "bertrand", "age": 34 }, - { "mykey": 6, "name": "bernie", "age": 34 }, - { "mykey": 7, "name": "ben", "age": 34 } - ]), - ) - .unwrap(); - wtxn.commit().unwrap(); - - // Updating settings with the same primary key should do nothing - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("mykey")); - }) - .unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); - wtxn.commit().unwrap(); - - // Updating the settings with a different (or no) primary key causes an error - let mut wtxn = index.write_txn().unwrap(); - let error = index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.reset_primary_key(); - }) - .unwrap_err(); - assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); - wtxn.abort(); - - // But if we clear the database... 
- let mut wtxn = index.write_txn().unwrap(); - let builder = ClearDocuments::new(&mut wtxn, &index); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - // ...we can change the primary key - index - .update_settings(|settings| { - settings.set_primary_key(S("myid")); - }) - .unwrap(); - } - - #[test] - fn setting_impact_relevancy() { - let index = TempIndex::new(); - - // Set the genres setting - index - .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("genres") }); - }) - .unwrap(); - - index.add_documents(documents!([ - { - "id": 11, - "title": "Star Wars", - "overview": - "Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.", - "genres": ["Adventure", "Action", "Science Fiction"], - "poster": "https://image.tmdb.org/t/p/w500/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg", - "release_date": 233366400 - }, - { - "id": 30, - "title": "Magnetic Rose", - "overview": "", - "genres": ["Animation", "Science Fiction"], - "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", - "release_date": 819676800 - } - ])).unwrap(); - - let rtxn = index.read_txn().unwrap(); - let SearchResult { documents_ids, .. } = index.search(&rtxn).query("S").execute().unwrap(); - let first_id = documents_ids[0]; - let documents = index.documents(&rtxn, documents_ids).unwrap(); - let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap(); - - let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap(); - let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap(); - assert_eq!(line, r#""Star Wars""#); - } - - #[test] - fn test_disable_typo() { - let index = TempIndex::new(); - - let mut txn = index.write_txn().unwrap(); - assert!(index.authorize_typos(&txn).unwrap()); - - index - .update_settings_using_wtxn(&mut txn, |settings| { - settings.set_autorize_typos(false); - }) - .unwrap(); - - assert!(!index.authorize_typos(&txn).unwrap()); - } - - #[test] - fn update_min_word_len_for_typo() { - let index = TempIndex::new(); - - // Set the genres setting - index - .update_settings(|settings| { - settings.set_min_word_len_one_typo(8); - settings.set_min_word_len_two_typos(8); - }) - .unwrap(); - - let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); - assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); - - index - .update_settings(|settings| { - settings.reset_min_word_len_one_typo(); - settings.reset_min_word_len_two_typos(); - }) - .unwrap(); - - let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); - assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); - } - - #[test] - fn update_invalid_min_word_len_for_typo() { - let index = TempIndex::new(); - - // Set the genres setting - index - .update_settings(|settings| { - settings.set_min_word_len_one_typo(10); - settings.set_min_word_len_two_typos(7); - }) - .unwrap_err(); - } - - #[test] - fn update_exact_words_normalization() { - let index = TempIndex::new(); - - let mut txn = index.write_txn().unwrap(); - // Set the genres setting - index - .update_settings_using_wtxn(&mut txn, |settings| { - let words = btreeset! 
{ S("Ab"), S("ac") }; - settings.set_exact_words(words); - }) - .unwrap(); - - let exact_words = index.exact_words(&txn).unwrap().unwrap(); - for word in exact_words.into_fst().stream().into_str_vec().unwrap() { - assert!(word.0 == "ac" || word.0 == "ab"); - } - } - - #[test] - fn test_correct_settings_init() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - // we don't actually update the settings, just check their content - let Settings { - wtxn: _, - index: _, - indexer_config: _, - searchable_fields, - displayed_fields, - filterable_fields, - sortable_fields, - criteria, - stop_words, - non_separator_tokens, - separator_tokens, - dictionary, - distinct_field, - synonyms, - primary_key, - authorize_typos, - min_word_len_two_typos, - min_word_len_one_typo, - exact_words, - exact_attributes, - max_values_per_facet, - sort_facet_values_by, - pagination_max_total_hits, - proximity_precision, - embedder_settings, - search_cutoff, - localized_attributes_rules, - } = settings; - assert!(matches!(searchable_fields, Setting::NotSet)); - assert!(matches!(displayed_fields, Setting::NotSet)); - assert!(matches!(filterable_fields, Setting::NotSet)); - assert!(matches!(sortable_fields, Setting::NotSet)); - assert!(matches!(criteria, Setting::NotSet)); - assert!(matches!(stop_words, Setting::NotSet)); - assert!(matches!(non_separator_tokens, Setting::NotSet)); - assert!(matches!(separator_tokens, Setting::NotSet)); - assert!(matches!(dictionary, Setting::NotSet)); - assert!(matches!(distinct_field, Setting::NotSet)); - assert!(matches!(synonyms, Setting::NotSet)); - assert!(matches!(primary_key, Setting::NotSet)); - assert!(matches!(authorize_typos, Setting::NotSet)); - assert!(matches!(min_word_len_two_typos, Setting::NotSet)); - assert!(matches!(min_word_len_one_typo, Setting::NotSet)); - assert!(matches!(exact_words, Setting::NotSet)); - assert!(matches!(exact_attributes, Setting::NotSet)); - assert!(matches!(max_values_per_facet, Setting::NotSet)); - assert!(matches!(sort_facet_values_by, Setting::NotSet)); - assert!(matches!(pagination_max_total_hits, Setting::NotSet)); - assert!(matches!(proximity_precision, Setting::NotSet)); - assert!(matches!(embedder_settings, Setting::NotSet)); - assert!(matches!(search_cutoff, Setting::NotSet)); - assert!(matches!(localized_attributes_rules, Setting::NotSet)); - }) - .unwrap(); - } - - #[test] - fn settings_must_ignore_soft_deleted() { - use serde_json::json; - - let index = TempIndex::new(); - - let mut docs = vec![]; - for i in 0..10 { - docs.push(json!({ "id": i, "title": format!("{:x}", i) })); - } - index.add_documents(documents! 
{ docs }).unwrap(); - - index.delete_documents((0..5).map(|id| id.to_string()).collect()); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_searchable_fields(vec!["id".to_string()]); - }) - .unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.write_txn().unwrap(); - let docs: StdResult, _> = index.all_documents(&rtxn).unwrap().collect(); - let docs = docs.unwrap(); - assert_eq!(docs.len(), 5); } } + +#[cfg(test)] +#[path = "test_settings.rs"] +mod tests; diff --git a/crates/milli/src/update/test_settings.rs b/crates/milli/src/update/test_settings.rs new file mode 100644 index 000000000..00be0476a --- /dev/null +++ b/crates/milli/src/update/test_settings.rs @@ -0,0 +1,956 @@ +use big_s::S; +use heed::types::Bytes; +use maplit::{btreemap, btreeset}; +use meili_snap::snapshot; + +use super::*; +use crate::error::Error; +use crate::index::tests::TempIndex; +use crate::update::ClearDocuments; +use crate::{db_snap, Criterion, Filter, SearchResult}; + +#[test] +fn set_and_reset_searchable_fields() { + let index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. + let mut wtxn = index.write_txn().unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 1, "name": "kevin", "age": 23 }, + { "id": 2, "name": "kevina", "age": 21}, + { "id": 3, "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + + // We change the searchable fields to be the "name" field only. + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); + + wtxn.commit().unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["name"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 1 0 | + "###); + + // Check that the searchable field is correctly set to "name" only. + let rtxn = index.read_txn().unwrap(); + // When we search for something that is not in + // the searchable fields it must not return any document. + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert_eq!(result.documents_ids, Vec::::new()); + + // When we search for something that is in the searchable fields + // we must find the appropriate document. + let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + assert_eq!(documents.len(), 1); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 name | + 2 age | + "###); + db_snap!(index, searchable_fields, @r###"["id", "name", "age"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + 1 0 | + 2 0 | + "###); + + // Check that the searchable field have been reset and documents are found now. 
+ let rtxn = index.read_txn().unwrap(); + let fid_map = index.fields_ids_map(&rtxn).unwrap(); + let user_defined_searchable_fields = index.user_defined_searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{user_defined_searchable_fields:?}"), @"None"); + // the searchable fields should contain all the fields + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + snapshot!(format!("{searchable_fields:?}"), @r###"["id", "name", "age"]"###); + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + assert_eq!(documents[0].1.get(fid_map.id("name").unwrap()), Some(&br#""kevin""#[..])); +} + +#[test] +fn mixup_searchable_with_displayed_fields() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + + // In the same transaction we change the displayed fields to be only the "age". + // We also change the searchable fields to be the "name" field only. + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), (&["age"][..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); + + // Check that the displayed fields always contains only the "age" field. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), &["age"][..]); +} + +#[test] +fn default_displayed_fields() { + let index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, None); +} + +#[test] +fn set_and_reset_displayed_field() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set to only the "age" field. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), &["age"][..]); + drop(rtxn); + + // We reset the fields ids to become `None`, the default value. 
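These displayed-fields tests rely on `displayed_fields` returning `None` when the setting was never set or has been reset, which the tests treat as "no restriction". A hypothetical helper, not part of milli, sketching how a caller might interpret that `Option` under that assumption:

fn fields_to_display<'a>(
    configured: Option<Vec<&'a str>>,
    all_fields: Vec<&'a str>,
) -> Vec<&'a str> {
    // `None` is the default: no restriction, so every known field is shown.
    configured.unwrap_or(all_fields)
}

fn main() {
    assert_eq!(fields_to_display(None, vec!["id", "name", "age"]), vec!["id", "name", "age"]);
    assert_eq!(fields_to_display(Some(vec!["age"]), vec!["id", "name", "age"]), vec!["age"]);
}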
+ index + .update_settings(|settings| { + settings.reset_displayed_fields(); + }) + .unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, None); +} + +#[test] +fn set_filterable_fields() { + let index = TempIndex::new(); + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("age"))]); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Check that the displayed fields are correctly set. + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 0 and level 0 facet values. + // TODO we must support typed CSVs for numbers to be understood. + let fidmap = index.fields_ids_map(&rtxn).unwrap(); + for document in index.all_documents(&rtxn).unwrap() { + let document = document.unwrap(); + let json = + crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1).unwrap(); + println!("json: {:?}", json); + } + let count = index + .facet_id_f64_docids + .remap_key_type::() + // The faceted field id is 2u16 + .prefix_iter(&rtxn, &[0, 2, 0]) + .unwrap() + .count(); + assert_eq!(count, 3); + drop(rtxn); + + // Index a little more documents with new and current facets values. + index + .add_documents(documents!([ + { "id": 3, "name": "kevin2", "age": 23}, + { "id": 4, "name": "kevina2", "age": 21 }, + { "id": 5, "name": "benoit", "age": 35 } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 0 and level 0 facet values. + let count = index + .facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 2, 0]) + .unwrap() + .count(); + assert_eq!(count, 4); + + // Set the filterable fields to be the age and the name. + index + .update_settings(|settings| { + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("age")), + FilterableAttributesRule::Field(S("name")), + ]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 2 and level 0 facet values. + let count = index + .facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 2, 0]) + .unwrap() + .count(); + assert_eq!(count, 4); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 1 and level 0 facet values. + let count = index + .facet_id_string_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 1]) + .unwrap() + .count(); + assert_eq!(count, 5); + + // Remove the age from the filterable fields. + index + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("name"))]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 2 and level 0 facet values. + let count = index + .facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 2, 0]) + .unwrap() + .count(); + assert_eq!(count, 0); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 1 and level 0 facet values. + let count = index + .facet_id_string_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 1]) + .unwrap() + .count(); + assert_eq!(count, 5); +} + +#[test] +fn set_asc_desc_field() { + let index = TempIndex::new(); + + // Set the filterable fields to be the age. 
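The facet assertions in `set_filterable_fields` count database entries through `prefix_iter`, i.e. every entry whose raw key starts with the given bytes; the exact byte layout of milli's facet keys comes from its key codecs and is not reproduced here. A plain-Rust analogue of that counting idiom, with made-up keys:

use std::collections::BTreeMap;

fn count_with_prefix(db: &BTreeMap<Vec<u8>, u64>, prefix: &[u8]) -> usize {
    // Same idea as `prefix_iter(...).count()`: keep only the entries whose raw
    // key starts with the given bytes.
    db.keys().filter(|key| key.starts_with(prefix)).count()
}

fn main() {
    let mut db = BTreeMap::new();
    // Hypothetical keys, not milli's real facet key layout.
    db.insert(vec![0u8, 2, 0, 1], 1u64);
    db.insert(vec![0, 2, 0, 2], 1);
    db.insert(vec![0, 3, 0, 1], 1);
    assert_eq!(count_with_prefix(&db, &[0, 2, 0]), 2);
}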
+ index + .update_settings(|settings| { + settings.set_displayed_fields(vec![S("name")]); + settings.set_criteria(vec![Criterion::Asc("age".to_owned())]); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + let documents = index.documents(&rtxn, documents_ids).unwrap(); + + // Fetch the documents "age" field in the ordre in which the documents appear. + let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap(); + let iter = documents.into_iter().map(|(_, doc)| { + let bytes = doc.get(age_field_id).unwrap(); + let string = std::str::from_utf8(bytes).unwrap(); + string.parse::().unwrap() + }); + + assert_eq!(iter.collect::>(), vec![21, 23, 34]); +} + +#[test] +fn set_distinct_field() { + let index = TempIndex::new(); + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("name"), S("age")]); + settings.set_distinct_field(S("age")); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "age": 23 }, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 }, + { "id": 3, "name": "bernard", "age": 34 }, + { "id": 4, "name": "bertrand", "age": 34 }, + { "id": 5, "name": "bernie", "age": 34 }, + { "id": 6, "name": "ben", "age": 34 } + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + + // There must be at least one document with a 34 as the age. + assert_eq!(documents_ids.len(), 3); +} + +#[test] +fn set_nested_distinct_field() { + let index = TempIndex::new(); + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("person")]); + settings.set_distinct_field(S("person.age")); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "id": 0, "person": { "name": "kevin", "age": 23 }}, + { "id": 1, "person": { "name": "kevina", "age": 21 }}, + { "id": 2, "person": { "name": "benoit", "age": 34 }}, + { "id": 3, "person": { "name": "bernard", "age": 34 }}, + { "id": 4, "person": { "name": "bertrand", "age": 34 }}, + { "id": 5, "person": { "name": "bernie", "age": 34 }}, + { "id": 6, "person": { "name": "ben", "age": 34 }} + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + + // There must be at least one document with a 34 as the age. + assert_eq!(documents_ids.len(), 3); +} + +#[test] +fn default_stop_words() { + let index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. 
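To make the expected counts in the two distinct-field tests concrete: with `distinct` set to `age` (or `person.age`), only one document is kept per distinct age value, so the seven documents with ages 23, 21, and five times 34 collapse to three hits. A tiny standalone sketch of that deduplication, not milli's actual implementation:

use std::collections::HashSet;

fn count_distinct_hits(ages: &[u32]) -> usize {
    // Keep a document only the first time its distinct value (here, the age) is seen.
    let mut seen = HashSet::new();
    ages.iter().filter(|age| seen.insert(**age)).count()
}

fn main() {
    // Ages of the seven documents indexed in `set_distinct_field` above.
    assert_eq!(count_distinct_hits(&[23, 21, 34, 34, 34, 34, 34]), 3);
}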
+ index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "age": 23}, + { "id": 1, "name": "kevina", "age": 21 }, + { "id": 2, "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Ensure there is no stop_words by default + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); +} + +#[test] +fn set_and_reset_stop_words() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs" }, + { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, + { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, + ]), + ) + .unwrap(); + + // In the same transaction we provide some stop_words + let set = btreeset! { "i".to_string(), "the".to_string(), "are".to_string() }; + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_stop_words(set.clone()); + }) + .unwrap(); + + wtxn.commit().unwrap(); + + // Ensure stop_words are effectively stored + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_some()); // at this point the index should return something + + let stop_words = stop_words.unwrap(); + let expected = fst::Set::from_iter(&set).unwrap(); + assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); + + // when we search for something that is a non prefix stop_words it should be ignored + // thus we should get a placeholder search (all the results = 3) + let result = index.search(&rtxn).query("the ").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 3); + let result = index.search(&rtxn).query("i ").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 3); + let result = index.search(&rtxn).query("are ").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 3); + + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoĆ®t").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data + + // now we'll reset the stop_words and ensure it's None + index + .update_settings(|settings| { + settings.reset_stop_words(); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); + + // now we can search for the stop words + let result = index.search(&rtxn).query("the").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + let result = index.search(&rtxn).query("i").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("are").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // the rest of the search is still not impacted + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoĆ®t").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data +} + +#[test] +fn set_and_reset_synonyms() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + // Send 3 documents with ids from 1 to 3. 
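A short standalone illustration, assuming the same `fst` crate used by `set_and_reset_stop_words`, of why comparing FST bytes is a sound equality check for the stored stop words: `fst::Set::from_iter` expects its input in lexicographic order, which a `BTreeSet` naturally provides, and the construction is deterministic, so two sets built from the same words serialize to identical bytes.

use std::collections::BTreeSet;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let words: BTreeSet<String> = ["i", "the", "are"].iter().map(|s| s.to_string()).collect();
    // A BTreeSet iterates in lexicographic order, as `from_iter` requires,
    // and the resulting FST bytes depend only on the input words.
    let a = fst::Set::from_iter(&words)?;
    let b = fst::Set::from_iter(&words)?;
    assert_eq!(a.as_fst().as_bytes(), b.as_fst().as_bytes());
    assert!(a.contains("the"));
    Ok(())
}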
+ index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "age": 23, "maxim": "I love dogs"}, + { "id": 1, "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, + { "id": 2, "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, + ]), + ) + .unwrap(); + + // In the same transaction provide some synonyms + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_synonyms(btreemap! { + "blini".to_string() => vec!["crepes".to_string()], + "super like".to_string() => vec!["love".to_string()], + "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] + }); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are effectively stored + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(!synonyms.is_empty()); // at this point the index should return something + + // Check that we can use synonyms + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // Reset the synonyms + index + .update_settings(|settings| { + settings.reset_synonyms(); + }) + .unwrap(); + + // Ensure synonyms are reset + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(synonyms.is_empty()); + + // Check that synonyms are no longer work + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert!(result.documents_ids.is_empty()); +} + +#[test] +fn thai_synonyms() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + // Send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "ąø¢ąøµą¹ˆąø›ąøøą¹ˆąø™" }, + { "id": 1, "name": "ąøąøµą¹ˆąø›ąøøą¹ˆąø™" }, + ]), + ) + .unwrap(); + + // In the same transaction provide some synonyms + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_synonyms(btreemap! 
{ + "japanese".to_string() => vec![S("ąøąøµą¹ˆąø›ąøøą¹ˆąø™"), S("ąø¢ąøµą¹ˆąø›ąøøą¹ˆąø™")], + }); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are effectively stored + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(!synonyms.is_empty()); // at this point the index should return something + + // Check that we can use synonyms + let result = index.search(&rtxn).query("japanese").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); +} + +#[test] +fn setting_searchable_recomputes_other_settings() { + let index = TempIndex::new(); + + // Set all the settings except searchable + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("age")), + FilterableAttributesRule::Field(S("toto")), + ]); + settings.set_criteria(vec![Criterion::Asc(S("toto"))]); + }) + .unwrap(); + + // check the output + let rtxn = index.read_txn().unwrap(); + assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); + // since no documents have been pushed the primary key is still unset + assert!(index.primary_key(&rtxn).unwrap().is_none()); + assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); + drop(rtxn); + + // We set toto and age as searchable to force reordering of the fields + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); + assert!(index.primary_key(&rtxn).unwrap().is_none()); + assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); +} + +#[test] +fn setting_not_filterable_cant_filter() { + let index = TempIndex::new(); + + // Set all the settings except searchable + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + // It is only Asc(toto), there is a facet database but it is denied to filter with toto. + settings.set_criteria(vec![Criterion::Asc(S("toto"))]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("toto = 32").unwrap().unwrap(); + let _ = filter.evaluate(&rtxn, &index).unwrap_err(); +} + +#[test] +fn setting_primary_key() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + // Set the primary key settings + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("mykey")); + }) + .unwrap(); + wtxn.commit().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + + // Then index some documents with the "mykey" primary key. 
+ index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + // Updating settings with the same primary key should do nothing + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("mykey")); + }) + .unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + wtxn.commit().unwrap(); + + // Updating the settings with a different (or no) primary key causes an error + let mut wtxn = index.write_txn().unwrap(); + let error = index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.reset_primary_key(); + }) + .unwrap_err(); + assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); + wtxn.abort(); + + // But if we clear the database... + let mut wtxn = index.write_txn().unwrap(); + let builder = ClearDocuments::new(&mut wtxn, &index); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + // ...we can change the primary key + index + .update_settings(|settings| { + settings.set_primary_key(S("myid")); + }) + .unwrap(); +} + +#[test] +fn setting_impact_relevancy() { + let index = TempIndex::new(); + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_filterable_fields(vec![FilterableAttributesRule::Field(S("genres"))]); + }) + .unwrap(); + + index.add_documents(documents!([ + { + "id": 11, + "title": "Star Wars", + "overview": + "Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.", + "genres": ["Adventure", "Action", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg", + "release_date": 233366400 + }, + { + "id": 30, + "title": "Magnetic Rose", + "overview": "", + "genres": ["Animation", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", + "release_date": 819676800 + } + ])).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. 
} = index.search(&rtxn).query("S").execute().unwrap(); + let first_id = documents_ids[0]; + let documents = index.documents(&rtxn, documents_ids).unwrap(); + let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap(); + + let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap(); + let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap(); + assert_eq!(line, r#""Star Wars""#); +} + +#[test] +fn test_disable_typo() { + let index = TempIndex::new(); + + let mut txn = index.write_txn().unwrap(); + assert!(index.authorize_typos(&txn).unwrap()); + + index + .update_settings_using_wtxn(&mut txn, |settings| { + settings.set_autorize_typos(false); + }) + .unwrap(); + + assert!(!index.authorize_typos(&txn).unwrap()); +} + +#[test] +fn update_min_word_len_for_typo() { + let index = TempIndex::new(); + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(8); + settings.set_min_word_len_two_typos(8); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); + + index + .update_settings(|settings| { + settings.reset_min_word_len_one_typo(); + settings.reset_min_word_len_two_typos(); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); +} + +#[test] +fn update_invalid_min_word_len_for_typo() { + let index = TempIndex::new(); + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(10); + settings.set_min_word_len_two_typos(7); + }) + .unwrap_err(); +} + +#[test] +fn update_exact_words_normalization() { + let index = TempIndex::new(); + + let mut txn = index.write_txn().unwrap(); + // Set the genres setting + index + .update_settings_using_wtxn(&mut txn, |settings| { + let words = btreeset! 
{ S("Ab"), S("ac") }; + settings.set_exact_words(words); + }) + .unwrap(); + + let exact_words = index.exact_words(&txn).unwrap().unwrap(); + for word in exact_words.into_fst().stream().into_str_vec().unwrap() { + assert!(word.0 == "ac" || word.0 == "ab"); + } +} + +#[test] +fn test_correct_settings_init() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + // we don't actually update the settings, just check their content + let Settings { + wtxn: _, + index: _, + indexer_config: _, + searchable_fields, + displayed_fields, + filterable_fields, + sortable_fields, + criteria, + stop_words, + non_separator_tokens, + separator_tokens, + dictionary, + distinct_field, + synonyms, + primary_key, + authorize_typos, + min_word_len_two_typos, + min_word_len_one_typo, + exact_words, + exact_attributes, + max_values_per_facet, + sort_facet_values_by, + pagination_max_total_hits, + proximity_precision, + embedder_settings, + search_cutoff, + localized_attributes_rules, + prefix_search, + facet_search, + } = settings; + assert!(matches!(searchable_fields, Setting::NotSet)); + assert!(matches!(displayed_fields, Setting::NotSet)); + assert!(matches!(filterable_fields, Setting::NotSet)); + assert!(matches!(sortable_fields, Setting::NotSet)); + assert!(matches!(criteria, Setting::NotSet)); + assert!(matches!(stop_words, Setting::NotSet)); + assert!(matches!(non_separator_tokens, Setting::NotSet)); + assert!(matches!(separator_tokens, Setting::NotSet)); + assert!(matches!(dictionary, Setting::NotSet)); + assert!(matches!(distinct_field, Setting::NotSet)); + assert!(matches!(synonyms, Setting::NotSet)); + assert!(matches!(primary_key, Setting::NotSet)); + assert!(matches!(authorize_typos, Setting::NotSet)); + assert!(matches!(min_word_len_two_typos, Setting::NotSet)); + assert!(matches!(min_word_len_one_typo, Setting::NotSet)); + assert!(matches!(exact_words, Setting::NotSet)); + assert!(matches!(exact_attributes, Setting::NotSet)); + assert!(matches!(max_values_per_facet, Setting::NotSet)); + assert!(matches!(sort_facet_values_by, Setting::NotSet)); + assert!(matches!(pagination_max_total_hits, Setting::NotSet)); + assert!(matches!(proximity_precision, Setting::NotSet)); + assert!(matches!(embedder_settings, Setting::NotSet)); + assert!(matches!(search_cutoff, Setting::NotSet)); + assert!(matches!(localized_attributes_rules, Setting::NotSet)); + assert!(matches!(prefix_search, Setting::NotSet)); + assert!(matches!(facet_search, Setting::NotSet)); + }) + .unwrap(); +} + +#[test] +fn settings_must_ignore_soft_deleted() { + use serde_json::json; + + let index = TempIndex::new(); + + let mut docs = vec![]; + for i in 0..10 { + docs.push(json!({ "id": i, "title": format!("{:x}", i) })); + } + index.add_documents(documents! 
{ docs }).unwrap(); + + index.delete_documents((0..5).map(|id| id.to_string()).collect()); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["id".to_string()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.write_txn().unwrap(); + let docs: StdResult, _> = index.all_documents(&rtxn).unwrap().collect(); + let docs = docs.unwrap(); + assert_eq!(docs.len(), 5); +} diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs new file mode 100644 index 000000000..7c8dcf64a --- /dev/null +++ b/crates/milli/src/update/upgrade/mod.rs @@ -0,0 +1,79 @@ +mod v1_12; +mod v1_13; +mod v1_14; + +use heed::RwTxn; +use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; +use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; +use v1_14::Latest_V1_13_To_Latest_V1_14; + +use crate::progress::{Progress, VariableNameStep}; +use crate::{Index, InternalError, Result}; + +trait UpgradeIndex { + /// Returns `true` if the index scheduler must regenerate its cached stats. + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + original: (u32, u32, u32), + progress: Progress, + ) -> Result; + fn target_version(&self) -> (u32, u32, u32); +} + +/// Return true if the cached stats of the index must be regenerated +pub fn upgrade( + wtxn: &mut RwTxn, + index: &Index, + db_version: (u32, u32, u32), + progress: Progress, +) -> Result { + let from = index.get_version(wtxn)?.unwrap_or(db_version); + let upgrade_functions: &[&dyn UpgradeIndex] = &[ + &V1_12_To_V1_12_3 {}, + &V1_12_3_To_V1_13_0 {}, + &V1_13_0_To_V1_13_1 {}, + &V1_13_1_To_Latest_V1_13 {}, + &Latest_V1_13_To_Latest_V1_14 {}, + ]; + + let start = match from { + (1, 12, 0..=2) => 0, + (1, 12, 3..) => 1, + (1, 13, 0) => 2, + (1, 13, _) => 4, + // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. + (1, 14, _) => 4, + (major, minor, patch) => { + return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into()) + } + }; + + enum UpgradeVersion {} + let upgrade_path = &upgrade_functions[start..]; + + let mut current_version = from; + let mut regenerate_stats = false; + for (i, upgrade) in upgrade_path.iter().enumerate() { + let target = upgrade.target_version(); + progress.update_progress(VariableNameStep::::new( + format!( + "Upgrading from v{}.{}.{} to v{}.{}.{}", + current_version.0, + current_version.1, + current_version.2, + target.0, + target.1, + target.2 + ), + i as u32, + upgrade_path.len() as u32, + )); + regenerate_stats |= upgrade.upgrade(wtxn, index, from, progress.clone())?; + index.put_version(wtxn, target)?; + current_version = target; + } + + Ok(regenerate_stats) +} diff --git a/crates/milli/src/update/upgrade/v1_12.rs b/crates/milli/src/update/upgrade/v1_12.rs new file mode 100644 index 000000000..f46e7f745 --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_12.rs @@ -0,0 +1,51 @@ +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::progress::Progress; +use crate::{make_enum_progress, Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct V1_12_To_V1_12_3 {} + +impl UpgradeIndex for V1_12_To_V1_12_3 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + progress: Progress, + ) -> Result { + make_enum_progress! 
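The `upgrade` function in `upgrade/mod.rs` applies a fixed chain of `UpgradeIndex` steps: it picks a starting offset from the index's current version, runs the remaining steps in order, and persists the new version after each one. A simplified standalone sketch of that pattern; the version-to-offset selection is reduced to a comparison here, whereas the real code uses an explicit `match`, and all names are illustrative:

type Version = (u32, u32, u32);

trait Step {
    /// Returns `true` when the caller must regenerate its cached stats.
    fn apply(&self) -> bool;
    fn target(&self) -> Version;
}

struct Noop(Version);

impl Step for Noop {
    fn apply(&self) -> bool {
        false
    }
    fn target(&self) -> Version {
        self.0
    }
}

fn upgrade_from(mut current: Version, steps: &[&dyn Step]) -> (Version, bool) {
    // Simplification: start at the first step whose target is newer than the
    // current version.
    let start = steps.iter().position(|s| s.target() > current).unwrap_or(steps.len());
    let mut regenerate = false;
    for step in &steps[start..] {
        regenerate |= step.apply();
        // The real code persists the version here with `index.put_version(wtxn, target)`.
        current = step.target();
    }
    (current, regenerate)
}

fn main() {
    let (a, b, c) = (Noop((1, 12, 3)), Noop((1, 13, 0)), Noop((1, 14, 0)));
    let steps: Vec<&dyn Step> = vec![&a, &b, &c];
    assert_eq!(upgrade_from((1, 12, 0), &steps), ((1, 14, 0), false));
}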
{ + enum FieldDistribution { + RebuildingFieldDistribution, + } + }; + progress.update_progress(FieldDistribution::RebuildingFieldDistribution); + crate::update::new::reindex::field_distribution(index, wtxn, &progress)?; + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 12, 3) + } +} + +#[allow(non_camel_case_types)] +pub(super) struct V1_12_3_To_V1_13_0 {} + +impl UpgradeIndex for V1_12_3_To_V1_13_0 { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + // recompute the indexes stats + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 13, 0) + } +} diff --git a/crates/milli/src/update/upgrade/v1_13.rs b/crates/milli/src/update/upgrade/v1_13.rs new file mode 100644 index 000000000..8e5e052bd --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_13.rs @@ -0,0 +1,60 @@ +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use crate::database_stats::DatabaseStats; +use crate::progress::Progress; +use crate::{make_enum_progress, Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_0_To_V1_13_1(); + +impl UpgradeIndex for V1_13_0_To_V1_13_1 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + progress: Progress, + ) -> Result { + make_enum_progress! { + enum DocumentsStats { + CreatingDocumentsStats, + } + }; + + // Create the new documents stats. + progress.update_progress(DocumentsStats::CreatingDocumentsStats); + let stats = DatabaseStats::new(index.documents.remap_types(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; + + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 13, 1) + } +} + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_1_To_Latest_V1_13(); + +impl UpgradeIndex for V1_13_1_To_Latest_V1_13 { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + ( + VERSION_MAJOR.parse().unwrap(), + VERSION_MINOR.parse().unwrap(), + VERSION_PATCH.parse().unwrap(), + ) + } +} diff --git a/crates/milli/src/update/upgrade/v1_14.rs b/crates/milli/src/update/upgrade/v1_14.rs new file mode 100644 index 000000000..039734b75 --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_14.rs @@ -0,0 +1,41 @@ +use arroy::distances::Cosine; +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::progress::Progress; +use crate::{make_enum_progress, Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct Latest_V1_13_To_Latest_V1_14(); + +impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + progress: Progress, + ) -> Result { + make_enum_progress! 
{ + enum VectorStore { + UpdateInternalVersions, + } + }; + + progress.update_progress(VectorStore::UpdateInternalVersions); + + let rtxn = index.read_txn()?; + arroy::upgrade::from_0_5_to_0_6::( + &rtxn, + index.vector_arroy.remap_data_type(), + wtxn, + index.vector_arroy.remap_data_type(), + )?; + + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 14, 0) + } +} diff --git a/crates/milli/src/update/words_prefixes_fst.rs b/crates/milli/src/update/words_prefixes_fst.rs index d47d6d14c..d18bfa74c 100644 --- a/crates/milli/src/update/words_prefixes_fst.rs +++ b/crates/milli/src/update/words_prefixes_fst.rs @@ -9,7 +9,7 @@ use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'i> { wtxn: &'t mut RwTxn<'i>, index: &'i Index, - threshold: u32, + threshold: usize, max_prefix_length: usize, } @@ -24,8 +24,8 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// /// Default value is 100. This value must be higher than 50 and will be clamped /// to this bound otherwise. - pub fn threshold(&mut self, value: u32) -> &mut Self { - self.threshold = value.max(50); + pub fn threshold(&mut self, value: usize) -> &mut Self { + self.threshold = value; self } @@ -34,7 +34,7 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped /// to these bounds, otherwise. pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.clamp(1, 25); + self.max_prefix_length = value; self } diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs new file mode 100644 index 000000000..9c5992bd3 --- /dev/null +++ b/crates/milli/src/vector/composite.rs @@ -0,0 +1,323 @@ +use std::time::Instant; + +use arroy::Distance; + +use super::error::CompositeEmbedderContainsHuggingFace; +use super::{ + hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, EmbeddingCache, + NewEmbedderError, +}; +use crate::ThreadPoolNoAbort; + +#[derive(Debug)] +pub enum SubEmbedder { + /// An embedder based on running local models, fetched from the Hugging Face Hub. + HuggingFace(hf::Embedder), + /// An embedder based on making embedding queries against the OpenAI API. + OpenAi(openai::Embedder), + /// An embedder based on the user providing the embeddings in the documents and queries. + UserProvided(manual::Embedder), + /// An embedder based on making embedding queries against an embedding server. + Ollama(ollama::Embedder), + /// An embedder based on making embedding queries against a generic JSON/REST embedding server. 
+ Rest(rest::Embedder), +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum SubEmbedderOptions { + HuggingFace(hf::EmbedderOptions), + OpenAi(openai::EmbedderOptions), + Ollama(ollama::EmbedderOptions), + UserProvided(manual::EmbedderOptions), + Rest(rest::EmbedderOptions), +} + +impl SubEmbedderOptions { + pub fn distribution(&self) -> Option { + match self { + SubEmbedderOptions::HuggingFace(embedder_options) => embedder_options.distribution, + SubEmbedderOptions::OpenAi(embedder_options) => embedder_options.distribution, + SubEmbedderOptions::Ollama(embedder_options) => embedder_options.distribution, + SubEmbedderOptions::UserProvided(embedder_options) => embedder_options.distribution, + SubEmbedderOptions::Rest(embedder_options) => embedder_options.distribution, + } + } +} + +#[derive(Debug)] +pub struct Embedder { + pub(super) search: SubEmbedder, + pub(super) index: SubEmbedder, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub struct EmbedderOptions { + pub search: SubEmbedderOptions, + pub index: SubEmbedderOptions, +} + +impl Embedder { + pub fn new( + EmbedderOptions { search, index }: EmbedderOptions, + cache_cap: usize, + ) -> Result { + let search = SubEmbedder::new(search, cache_cap)?; + // cache is only used at search + let index = SubEmbedder::new(index, 0)?; + + // check dimensions + if search.dimensions() != index.dimensions() { + return Err(NewEmbedderError::composite_dimensions_mismatch( + search.dimensions(), + index.dimensions(), + )); + } + // check similarity + let search_embeddings = search + .embed( + vec![ + "test".into(), + "a brave dog".into(), + "This is a sample text. It is meant to compare similarity.".into(), + ], + None, + ) + .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?; + + let index_embeddings = index + .embed( + vec![ + "test".into(), + "a brave dog".into(), + "This is a sample text. It is meant to compare similarity.".into(), + ], + None, + ) + .map_err(|error| { + NewEmbedderError::composite_test_embedding_failed(error, "indexing") + })?; + + let hint = configuration_hint(&search, &index); + + check_similarity(search_embeddings, index_embeddings, hint)?; + + Ok(Self { search, index }) + } + + /// Indicates the dimensions of a single embedding produced by the embedder. + pub fn dimensions(&self) -> usize { + // can use the dimensions of any embedder since they should match + self.index.dimensions() + } + + /// An optional distribution used to apply an affine transformation to the similarity score of a document. + pub fn distribution(&self) -> Option { + // 3 cases here: + // 1. distribution provided by user => use that one, which was stored in search + // 2. no user-provided distribution, distribution in search embedder => use that one + // 2. no user-provided distribution, no distribution in search embedder => use the distribution in indexing embedder + self.search.distribution().or_else(|| self.index.distribution()) + } +} + +impl SubEmbedder { + pub fn new( + options: SubEmbedderOptions, + cache_cap: usize, + ) -> std::result::Result { + Ok(match options { + SubEmbedderOptions::HuggingFace(options) => { + Self::HuggingFace(hf::Embedder::new(options, cache_cap)?) + } + SubEmbedderOptions::OpenAi(options) => { + Self::OpenAi(openai::Embedder::new(options, cache_cap)?) + } + SubEmbedderOptions::Ollama(options) => { + Self::Ollama(ollama::Embedder::new(options, cache_cap)?) 
+ } + SubEmbedderOptions::UserProvided(options) => { + Self::UserProvided(manual::Embedder::new(options)) + } + SubEmbedderOptions::Rest(options) => Self::Rest(rest::Embedder::new( + options, + cache_cap, + rest::ConfigurationSource::User, + )?), + }) + } + + pub fn embed( + &self, + texts: Vec, + deadline: Option, + ) -> std::result::Result, EmbedError> { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.embed(texts), + SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline), + SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline), + SubEmbedder::UserProvided(embedder) => embedder.embed(&texts), + SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline), + } + } + + pub fn embed_one( + &self, + text: &str, + deadline: Option, + ) -> std::result::Result { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.embed_one(text), + SubEmbedder::OpenAi(embedder) => { + embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding) + } + SubEmbedder::Ollama(embedder) => { + embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding) + } + SubEmbedder::UserProvided(embedder) => embedder.embed_one(text), + SubEmbedder::Rest(embedder) => embedder + .embed_ref(&[text], deadline)? + .pop() + .ok_or_else(EmbedError::missing_embedding), + } + } + + /// Embed multiple chunks of texts. + /// + /// Each chunk is composed of one or multiple texts. + pub fn embed_index( + &self, + text_chunks: Vec>, + threads: &ThreadPoolNoAbort, + ) -> std::result::Result>, EmbedError> { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), + SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads), + SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads), + SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks), + SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads), + } + } + + /// Non-owning variant of [`Self::embed_index`]. 
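For orientation, a minimal standalone sketch of the kind of compatibility check `Embedder::new` performs on a composite embedder: the two sub-embedders must agree on dimensions, and the embeddings they produce for a few probe texts must stay within a maximum cosine distance. The threshold value and helper names below are illustrative only; the real check goes through arroy's `Cosine` distance and `MAX_COMPOSITE_DISTANCE`.

const MAX_DISTANCE: f32 = 0.01; // placeholder threshold, not milli's MAX_COMPOSITE_DISTANCE

fn cosine_distance(left: &[f32], right: &[f32]) -> f32 {
    let dot: f32 = left.iter().zip(right).map(|(l, r)| l * r).sum();
    let norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    1.0 - dot / (norm(left) * norm(right))
}

fn check_compatibility(search: &[Vec<f32>], index: &[Vec<f32>]) -> Result<(), String> {
    if search.len() != index.len() {
        return Err("embedding count mismatch".into());
    }
    for (s, i) in search.iter().zip(index) {
        if s.len() != i.len() {
            return Err("dimensions mismatch".into());
        }
        if cosine_distance(s, i) > MAX_DISTANCE {
            return Err("sub-embedders disagree too much on the probe texts".into());
        }
    }
    Ok(())
}

fn main() {
    // Two identical sets of probe embeddings trivially pass the check.
    let probe = vec![vec![0.1_f32, 0.2, 0.3]];
    assert!(check_compatibility(&probe, &probe).is_ok());
}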
+ pub fn embed_index_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + ) -> std::result::Result, EmbedError> { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), + SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads), + SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads), + SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts), + SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads), + } + } + + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] + pub fn chunk_count_hint(&self) -> usize { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.chunk_count_hint(), + SubEmbedder::OpenAi(embedder) => embedder.chunk_count_hint(), + SubEmbedder::Ollama(embedder) => embedder.chunk_count_hint(), + SubEmbedder::UserProvided(_) => 100, + SubEmbedder::Rest(embedder) => embedder.chunk_count_hint(), + } + } + + /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`] + pub fn prompt_count_in_chunk_hint(&self) -> usize { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), + SubEmbedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), + SubEmbedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), + SubEmbedder::UserProvided(_) => 1, + SubEmbedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), + } + } + + pub fn uses_document_template(&self) -> bool { + match self { + SubEmbedder::HuggingFace(_) + | SubEmbedder::OpenAi(_) + | SubEmbedder::Ollama(_) + | SubEmbedder::Rest(_) => true, + SubEmbedder::UserProvided(_) => false, + } + } + + /// Indicates the dimensions of a single embedding produced by the embedder. + pub fn dimensions(&self) -> usize { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.dimensions(), + SubEmbedder::OpenAi(embedder) => embedder.dimensions(), + SubEmbedder::Ollama(embedder) => embedder.dimensions(), + SubEmbedder::UserProvided(embedder) => embedder.dimensions(), + SubEmbedder::Rest(embedder) => embedder.dimensions(), + } + } + + /// An optional distribution used to apply an affine transformation to the similarity score of a document. 
+ pub fn distribution(&self) -> Option { + match self { + SubEmbedder::HuggingFace(embedder) => embedder.distribution(), + SubEmbedder::OpenAi(embedder) => embedder.distribution(), + SubEmbedder::Ollama(embedder) => embedder.distribution(), + SubEmbedder::UserProvided(embedder) => embedder.distribution(), + SubEmbedder::Rest(embedder) => embedder.distribution(), + } + } + + pub(super) fn cache(&self) -> Option<&EmbeddingCache> { + match self { + SubEmbedder::HuggingFace(embedder) => Some(embedder.cache()), + SubEmbedder::OpenAi(embedder) => Some(embedder.cache()), + SubEmbedder::UserProvided(_) => None, + SubEmbedder::Ollama(embedder) => Some(embedder.cache()), + SubEmbedder::Rest(embedder) => Some(embedder.cache()), + } + } +} + +fn check_similarity( + left: Vec, + right: Vec, + hint: CompositeEmbedderContainsHuggingFace, +) -> Result<(), NewEmbedderError> { + if left.len() != right.len() { + return Err(NewEmbedderError::composite_embedding_count_mismatch(left.len(), right.len())); + } + + for (left, right) in left.into_iter().zip(right) { + let left = arroy::internals::UnalignedVector::from_slice(&left); + let right = arroy::internals::UnalignedVector::from_slice(&right); + let left = arroy::internals::Leaf { + header: arroy::distances::Cosine::new_header(&left), + vector: left, + }; + let right = arroy::internals::Leaf { + header: arroy::distances::Cosine::new_header(&right), + vector: right, + }; + + let distance = arroy::distances::Cosine::built_distance(&left, &right); + + if distance > super::MAX_COMPOSITE_DISTANCE { + return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint)); + } + } + Ok(()) +} + +fn configuration_hint( + search: &SubEmbedder, + index: &SubEmbedder, +) -> CompositeEmbedderContainsHuggingFace { + match (search, index) { + (SubEmbedder::HuggingFace(_), SubEmbedder::HuggingFace(_)) => { + CompositeEmbedderContainsHuggingFace::Both + } + (SubEmbedder::HuggingFace(_), _) => CompositeEmbedderContainsHuggingFace::Search, + (_, SubEmbedder::HuggingFace(_)) => CompositeEmbedderContainsHuggingFace::Indexing, + _ => CompositeEmbedderContainsHuggingFace::None, + } +} diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 97bbe5d68..685022de8 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -6,6 +6,7 @@ use hf_hub::api::sync::ApiError; use super::parsed_vectors::ParsedVectorsDiff; use super::rest::ConfigurationSource; +use super::MAX_COMPOSITE_DISTANCE; use crate::error::FaultSource; use crate::update::new::vector_document::VectorDocument; use crate::{FieldDistribution, PanicCatched}; @@ -67,7 +68,7 @@ pub enum EmbedErrorKind { #[error("could not authenticate against {embedding} server{server_reply}{hint}", embedding=match *.1 { ConfigurationSource::User => "embedding", ConfigurationSource::OpenAi => "OpenAI", - ConfigurationSource::Ollama => "ollama" + ConfigurationSource::Ollama => "Ollama" }, server_reply=option_info(.0.as_deref(), "server replied with "), hint=match *.1 { @@ -86,9 +87,9 @@ pub enum EmbedErrorKind { }, option_info(.0.as_deref(), "server replied with "))] RestBadRequest(Option, ConfigurationSource), - #[error("received internal error HTTP {0} from embedding server{}", option_info(.1.as_deref(), "server replied with "))] + #[error("received internal error HTTP {} from embedding server{}", .0, option_info(.1.as_deref(), "server replied with "))] RestInternalServerError(u16, Option), - #[error("received unexpected HTTP {0} from embedding server{}", 
option_info(.1.as_deref(), "server replied with "))] + #[error("received unexpected HTTP {} from embedding server{}", .0, option_info(.1.as_deref(), "server replied with "))] RestOtherStatusCode(u16, Option), #[error("could not reach embedding server:\n - {0}")] RestNetwork(ureq::Transport), @@ -262,6 +263,31 @@ impl NewEmbedderError { } } + pub fn open_pooling_config( + pooling_config_filename: PathBuf, + inner: std::io::Error, + ) -> NewEmbedderError { + let open_config = OpenPoolingConfig { filename: pooling_config_filename, inner }; + + Self { + kind: NewEmbedderErrorKind::OpenPoolingConfig(open_config), + fault: FaultSource::Runtime, + } + } + + pub fn deserialize_pooling_config( + model_name: String, + pooling_config_filename: PathBuf, + inner: serde_json::Error, + ) -> NewEmbedderError { + let deserialize_pooling_config = + DeserializePoolingConfig { model_name, filename: pooling_config_filename, inner }; + Self { + kind: NewEmbedderErrorKind::DeserializePoolingConfig(deserialize_pooling_config), + fault: FaultSource::Runtime, + } + } + pub fn open_tokenizer( tokenizer_filename: PathBuf, inner: Box, @@ -306,6 +332,81 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn ollama_unsupported_url(url: String) -> NewEmbedderError { + Self { kind: NewEmbedderErrorKind::OllamaUnsupportedUrl(url), fault: FaultSource::User } + } + + pub(crate) fn composite_dimensions_mismatch( + search_dimensions: usize, + index_dimensions: usize, + ) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CompositeDimensionsMismatch { + search_dimensions, + index_dimensions, + }, + fault: FaultSource::User, + } + } + + pub(crate) fn composite_test_embedding_failed( + inner: EmbedError, + failing_embedder: &'static str, + ) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CompositeTestEmbeddingFailed { inner, failing_embedder }, + fault: FaultSource::Runtime, + } + } + + pub(crate) fn composite_embedding_count_mismatch( + search_count: usize, + index_count: usize, + ) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CompositeEmbeddingCountMismatch { + search_count, + index_count, + }, + fault: FaultSource::Runtime, + } + } + + pub(crate) fn composite_embedding_value_mismatch( + distance: f32, + hint: CompositeEmbedderContainsHuggingFace, + ) -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::CompositeEmbeddingValueMismatch { distance, hint }, + fault: FaultSource::User, + } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum CompositeEmbedderContainsHuggingFace { + Both, + Search, + Indexing, + None, +} + +impl std::fmt::Display for CompositeEmbedderContainsHuggingFace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + CompositeEmbedderContainsHuggingFace::Both => f.write_str( + "\n - Make sure the `model`, `revision` and `pooling` of both embedders match.", + ), + CompositeEmbedderContainsHuggingFace::Search => f.write_str( + "\n - Consider trying a different `pooling` method for the search embedder.", + ), + CompositeEmbedderContainsHuggingFace::Indexing => f.write_str( + "\n - Consider trying a different `pooling` method for the indexing embedder.", + ), + CompositeEmbedderContainsHuggingFace::None => Ok(()), + } + } } #[derive(Debug, thiserror::Error)] @@ -315,6 +416,13 @@ pub struct OpenConfig { pub inner: std::io::Error, } +#[derive(Debug, thiserror::Error)] +#[error("could not open pooling config at {filename}: {inner}")] +pub struct OpenPoolingConfig { + pub filename: PathBuf, + pub inner: std::io::Error, 
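As a concrete illustration of the new diagnostics, this is roughly the message a user would see when the distance check fails and only the search embedder is a Hugging Face one; the wording mirrors the format strings introduced in this file, so treat it as an approximation rather than exact output:

```rust
fn main() {
    // Values are made up for the example.
    let distance = 0.37_f32;
    let max_distance = 0.01_f32;
    // Hint appended by `CompositeEmbedderContainsHuggingFace::Search`.
    let hint = "\n - Consider trying a different `pooling` method for the search embedder.";
    println!(
        "error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {max_distance}.\n - Note: check that both embedders produce similar embeddings.{hint}"
    );
}
```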
+} + #[derive(Debug, thiserror::Error)] #[error("for model '{model_name}', could not deserialize config at {filename} as JSON: {inner}")] pub struct DeserializeConfig { @@ -323,6 +431,14 @@ pub struct DeserializeConfig { pub inner: serde_json::Error, } +#[derive(Debug, thiserror::Error)] +#[error("for model '{model_name}', could not deserialize file at `{filename}` as a pooling config: {inner}")] +pub struct DeserializePoolingConfig { + pub model_name: String, + pub filename: PathBuf, + pub inner: serde_json::Error, +} + #[derive(Debug, thiserror::Error)] #[error("model `{model_name}` appears to be unsupported{}\n - inner error: {inner}", if architectures.is_empty() { @@ -350,8 +466,12 @@ pub enum NewEmbedderErrorKind { #[error(transparent)] OpenConfig(OpenConfig), #[error(transparent)] + OpenPoolingConfig(OpenPoolingConfig), + #[error(transparent)] DeserializeConfig(DeserializeConfig), #[error(transparent)] + DeserializePoolingConfig(DeserializePoolingConfig), + #[error(transparent)] UnsupportedModel(UnsupportedModel), #[error(transparent)] OpenTokenizer(OpenTokenizer), @@ -369,6 +489,16 @@ pub enum NewEmbedderErrorKind { LoadModel(candle_core::Error), #[error("{0}")] CouldNotParseTemplate(String), + #[error("unsupported Ollama URL.\n - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n - Got `{0}`")] + OllamaUnsupportedUrl(String), + #[error("error while generating test embeddings.\n - the dimensions of embeddings produced at search time and at indexing time don't match.\n - Search time dimensions: {search_dimensions}\n - Indexing time dimensions: {index_dimensions}\n - Note: Dimensions of embeddings produced by both embedders are required to match.")] + CompositeDimensionsMismatch { search_dimensions: usize, index_dimensions: usize }, + #[error("error while generating test embeddings.\n - could not generate test embedding with embedder at {failing_embedder} time.\n - Embedding failed with {inner}")] + CompositeTestEmbeddingFailed { inner: EmbedError, failing_embedder: &'static str }, + #[error("error while generating test embeddings.\n - the number of generated embeddings differs.\n - {search_count} embeddings for the search time embedder.\n - {index_count} embeddings for the indexing time embedder.")] + CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize }, + #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")] + CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, } pub struct PossibleEmbeddingMistakes { diff --git a/crates/milli/src/vector/hf.rs b/crates/milli/src/vector/hf.rs index 3fe28e53a..1e5c7bd1c 100644 --- a/crates/milli/src/vector/hf.rs +++ b/crates/milli/src/vector/hf.rs @@ -7,7 +7,7 @@ use hf_hub::{Repo, RepoType}; use tokenizers::{PaddingParams, Tokenizer}; pub use super::error::{EmbedError, Error, NewEmbedderError}; -use super::{DistributionShift, Embedding}; +use super::{DistributionShift, Embedding, EmbeddingCache}; #[derive( Debug, @@ -34,6 +34,30 @@ pub struct EmbedderOptions { pub model: String, pub revision: Option, pub distribution: Option, + #[serde(default)] + pub pooling: OverridePooling, +} + +#[derive( + Debug, + Clone, + Copy, + Default, + Hash, + PartialEq, + Eq, + serde::Deserialize, + serde::Serialize, + 
utoipa::ToSchema, + deserr::Deserr, +)] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +pub enum OverridePooling { + UseModel, + ForceCls, + #[default] + ForceMean, } impl EmbedderOptions { @@ -42,6 +66,7 @@ impl EmbedderOptions { model: "BAAI/bge-base-en-v1.5".to_string(), revision: Some("617ca489d9e86b49b8167676d8220688b99db36e".into()), distribution: None, + pooling: OverridePooling::UseModel, } } } @@ -58,6 +83,8 @@ pub struct Embedder { tokenizer: Tokenizer, options: EmbedderOptions, dimensions: usize, + pooling: Pooling, + cache: EmbeddingCache, } impl std::fmt::Debug for Embedder { @@ -66,12 +93,67 @@ impl std::fmt::Debug for Embedder { .field("model", &self.options.model) .field("tokenizer", &self.tokenizer) .field("options", &self.options) + .field("pooling", &self.pooling) .finish() } } +#[derive(Clone, Copy, serde::Deserialize)] +struct PoolingConfig { + #[serde(default)] + pub pooling_mode_cls_token: bool, + #[serde(default)] + pub pooling_mode_mean_tokens: bool, + #[serde(default)] + pub pooling_mode_max_tokens: bool, + #[serde(default)] + pub pooling_mode_mean_sqrt_len_tokens: bool, + #[serde(default)] + pub pooling_mode_lasttoken: bool, +} + +#[derive(Debug, Clone, Copy, Default)] +pub enum Pooling { + #[default] + Mean, + Cls, + Max, + MeanSqrtLen, + LastToken, +} +impl Pooling { + fn override_with(&mut self, pooling: OverridePooling) { + match pooling { + OverridePooling::UseModel => {} + OverridePooling::ForceCls => *self = Pooling::Cls, + OverridePooling::ForceMean => *self = Pooling::Mean, + } + } +} + +impl From for Pooling { + fn from(value: PoolingConfig) -> Self { + if value.pooling_mode_cls_token { + Self::Cls + } else if value.pooling_mode_mean_tokens { + Self::Mean + } else if value.pooling_mode_lasttoken { + Self::LastToken + } else if value.pooling_mode_mean_sqrt_len_tokens { + Self::MeanSqrtLen + } else if value.pooling_mode_max_tokens { + Self::Max + } else { + Self::default() + } + } +} + impl Embedder { - pub fn new(options: EmbedderOptions) -> std::result::Result { + pub fn new( + options: EmbedderOptions, + cache_cap: usize, + ) -> std::result::Result { let device = match candle_core::Device::cuda_if_available(0) { Ok(device) => device, Err(error) => { @@ -83,7 +165,7 @@ impl Embedder { Some(revision) => Repo::with_revision(options.model.clone(), RepoType::Model, revision), None => Repo::model(options.model.clone()), }; - let (config_filename, tokenizer_filename, weights_filename, weight_source) = { + let (config_filename, tokenizer_filename, weights_filename, weight_source, pooling) = { let api = Api::new().map_err(NewEmbedderError::new_api_fail)?; let api = api.repo(repo); let config = api.get("config.json").map_err(NewEmbedderError::api_get)?; @@ -97,7 +179,38 @@ impl Embedder { }) .map_err(NewEmbedderError::api_get)? 
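A condensed, standalone sketch of the pooling selection implemented here: the model's `1_Pooling/config.json` (when present) picks a strategy, and the new `pooling` embedder setting can override it. Names mirror the diff; the config is reduced to two booleans for brevity:

```rust
#[derive(Debug, Clone, Copy, Default, PartialEq)]
#[allow(dead_code)]
enum Pooling {
    #[default]
    Mean,
    Cls,
    Max,
    MeanSqrtLen,
    LastToken,
}

#[derive(Debug, Clone, Copy)]
enum OverridePooling {
    UseModel,
    ForceCls,
    ForceMean,
}

struct PoolingConfig {
    pooling_mode_cls_token: bool,
    pooling_mode_mean_tokens: bool,
}

fn select_pooling(config: Option<PoolingConfig>, override_with: OverridePooling) -> Pooling {
    // First honor the model's own pooling config, defaulting to mean pooling…
    let mut pooling = match config {
        Some(c) if c.pooling_mode_cls_token => Pooling::Cls,
        Some(c) if c.pooling_mode_mean_tokens => Pooling::Mean,
        _ => Pooling::default(),
    };
    // …then apply the user-facing override from the embedder settings.
    match override_with {
        OverridePooling::UseModel => {}
        OverridePooling::ForceCls => pooling = Pooling::Cls,
        OverridePooling::ForceMean => pooling = Pooling::Mean,
    }
    pooling
}

fn main() {
    let cls_model =
        || PoolingConfig { pooling_mode_cls_token: true, pooling_mode_mean_tokens: false };
    assert_eq!(select_pooling(Some(cls_model()), OverridePooling::UseModel), Pooling::Cls);
    // Embedders created before this change keep the old behavior via `forceMean`.
    assert_eq!(select_pooling(Some(cls_model()), OverridePooling::ForceMean), Pooling::Mean);
    // No pooling config on the hub: fall back to mean pooling.
    assert_eq!(select_pooling(None, OverridePooling::UseModel), Pooling::Mean);
}
```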
}; - (config, tokenizer, weights, source) + let pooling = match api.get("1_Pooling/config.json") { + Ok(pooling) => Some(pooling), + Err(hf_hub::api::sync::ApiError::RequestError(error)) + if matches!(*error, ureq::Error::Status(404, _,)) => + { + // ignore the error if the file simply doesn't exist + None + } + Err(error) => return Err(NewEmbedderError::api_get(error)), + }; + let mut pooling: Pooling = match pooling { + Some(pooling_filename) => { + let pooling = std::fs::read_to_string(&pooling_filename).map_err(|inner| { + NewEmbedderError::open_pooling_config(pooling_filename.clone(), inner) + })?; + + let pooling: PoolingConfig = + serde_json::from_str(&pooling).map_err(|inner| { + NewEmbedderError::deserialize_pooling_config( + options.model.clone(), + pooling_filename, + inner, + ) + })?; + pooling.into() + } + None => Pooling::default(), + }; + + pooling.override_with(options.pooling); + + (config, tokenizer, weights, source, pooling) }; let config = std::fs::read_to_string(&config_filename) @@ -122,6 +235,8 @@ impl Embedder { }, }; + tracing::debug!(model = options.model, weight=?weight_source, pooling=?pooling, "model config"); + let model = BertModel::load(vb, &config).map_err(NewEmbedderError::load_model)?; if let Some(pp) = tokenizer.get_padding_mut() { @@ -134,7 +249,14 @@ impl Embedder { tokenizer.with_padding(Some(pp)); } - let mut this = Self { model, tokenizer, options, dimensions: 0 }; + let mut this = Self { + model, + tokenizer, + options, + dimensions: 0, + pooling, + cache: EmbeddingCache::new(cache_cap), + }; let embeddings = this .embed(vec!["test".into()]) @@ -144,37 +266,49 @@ impl Embedder { Ok(this) } - pub fn embed(&self, mut texts: Vec) -> std::result::Result, EmbedError> { - let tokens = match texts.len() { - 1 => vec![self - .tokenizer - .encode(texts.pop().unwrap(), true) - .map_err(EmbedError::tokenize)?], - _ => self.tokenizer.encode_batch(texts, true).map_err(EmbedError::tokenize)?, - }; - let token_ids = tokens - .iter() - .map(|tokens| { - let mut tokens = tokens.get_ids().to_vec(); - tokens.truncate(512); - Tensor::new(tokens.as_slice(), &self.model.device).map_err(EmbedError::tensor_shape) - }) - .collect::, EmbedError>>()?; + pub fn embed(&self, texts: Vec) -> std::result::Result, EmbedError> { + texts.into_iter().map(|text| self.embed_one(&text)).collect() + } - let token_ids = Tensor::stack(&token_ids, 0).map_err(EmbedError::tensor_shape)?; - let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?; - let embeddings = - self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?; + fn pooling(embeddings: Tensor, pooling: Pooling) -> Result { + match pooling { + Pooling::Mean => Self::mean_pooling(embeddings), + Pooling::Cls => Self::cls_pooling(embeddings), + Pooling::Max => Self::max_pooling(embeddings), + Pooling::MeanSqrtLen => Self::mean_sqrt_pooling(embeddings), + Pooling::LastToken => Self::last_token_pooling(embeddings), + } + } - // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) + fn cls_pooling(embeddings: Tensor) -> Result { + embeddings.get_on_dim(1, 0).map_err(EmbedError::tensor_value) + } + + fn mean_sqrt_pooling(embeddings: Tensor) -> Result { let (_n_sentence, n_tokens, _hidden_size) = embeddings.dims3().map_err(EmbedError::tensor_shape)?; - let embeddings = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) - .map_err(EmbedError::tensor_shape)?; + (embeddings.sum(1).map_err(EmbedError::tensor_value)? 
/ (n_tokens as f64).sqrt()) + .map_err(EmbedError::tensor_shape) + } - let embeddings: Vec = embeddings.to_vec2().map_err(EmbedError::tensor_shape)?; - Ok(embeddings) + fn mean_pooling(embeddings: Tensor) -> Result { + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + + (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) + .map_err(EmbedError::tensor_shape) + } + + fn max_pooling(embeddings: Tensor) -> Result { + embeddings.max(1).map_err(EmbedError::tensor_shape) + } + + fn last_token_pooling(embeddings: Tensor) -> Result { + let (_n_sentence, n_tokens, _hidden_size) = + embeddings.dims3().map_err(EmbedError::tensor_shape)?; + + embeddings.get_on_dim(1, n_tokens - 1).map_err(EmbedError::tensor_value) } pub fn embed_one(&self, text: &str) -> std::result::Result { @@ -185,20 +319,19 @@ impl Embedder { Tensor::new(token_ids, &self.model.device).map_err(EmbedError::tensor_shape)?; let token_ids = Tensor::stack(&[token_ids], 0).map_err(EmbedError::tensor_shape)?; let token_type_ids = token_ids.zeros_like().map_err(EmbedError::tensor_shape)?; - let embeddings = - self.model.forward(&token_ids, &token_type_ids).map_err(EmbedError::model_forward)?; + let embeddings = self + .model + .forward(&token_ids, &token_type_ids, None) + .map_err(EmbedError::model_forward)?; + + let embedding = Self::pooling(embeddings, self.pooling)?; - // Apply some avg-pooling by taking the mean embedding value for all tokens (including padding) - let (_n_sentence, n_tokens, _hidden_size) = - embeddings.dims3().map_err(EmbedError::tensor_shape)?; - let embedding = (embeddings.sum(1).map_err(EmbedError::tensor_value)? / (n_tokens as f64)) - .map_err(EmbedError::tensor_shape)?; let embedding = embedding.squeeze(0).map_err(EmbedError::tensor_shape)?; let embedding: Embedding = embedding.to_vec1().map_err(EmbedError::tensor_shape)?; Ok(embedding) } - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, ) -> std::result::Result>, EmbedError> { @@ -230,7 +363,11 @@ impl Embedder { }) } - pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result, EmbedError> { + pub(crate) fn embed_index_ref(&self, texts: &[&str]) -> Result, EmbedError> { texts.iter().map(|text| self.embed_one(text)).collect() } + + pub(super) fn cache(&self) -> &EmbeddingCache { + &self.cache + } } diff --git a/crates/milli/src/vector/json_template.rs b/crates/milli/src/vector/json_template.rs index 454f23251..179cbe9af 100644 --- a/crates/milli/src/vector/json_template.rs +++ b/crates/milli/src/vector/json_template.rs @@ -311,7 +311,7 @@ fn last_named_object<'a>( last_named_object } -impl<'a> std::fmt::Display for LastNamedObject<'a> { +impl std::fmt::Display for LastNamedObject<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { LastNamedObject::Object { name } => write!(f, "`{name}`"), diff --git a/crates/milli/src/vector/manual.rs b/crates/milli/src/vector/manual.rs index 8c2ef97b2..b95bf0ea2 100644 --- a/crates/milli/src/vector/manual.rs +++ b/crates/milli/src/vector/manual.rs @@ -30,7 +30,7 @@ impl Embedder { self.dimensions } - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, ) -> Result>, EmbedError> { @@ -41,7 +41,7 @@ impl Embedder { self.distribution } - pub(crate) fn embed_chunks_ref(&self, texts: &[&str]) -> Result, EmbedError> { + pub(crate) fn embed_index_ref(&self, texts: &[&str]) -> Result, EmbedError> { texts.iter().map(|text| self.embed_one(text)).collect() } } diff --git 
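The pooling reductions above operate on candle tensors; the underlying math is easier to read on a plain matrix. A sketch over a `Vec<Vec<f32>>` of per-token embeddings (rows = tokens), illustrative only:

```rust
// Plain-Rust sketch of the pooling strategies; the real code performs these
// reductions on candle tensors.
fn sum(tokens: &[Vec<f32>]) -> Vec<f32> {
    let mut out = vec![0.0; tokens.first().map_or(0, |t| t.len())];
    for token in tokens {
        for (o, v) in out.iter_mut().zip(token) {
            *o += v;
        }
    }
    out
}

fn mean(tokens: &[Vec<f32>]) -> Vec<f32> {
    sum(tokens).into_iter().map(|v| v / tokens.len() as f32).collect()
}

fn mean_sqrt_len(tokens: &[Vec<f32>]) -> Vec<f32> {
    sum(tokens).into_iter().map(|v| v / (tokens.len() as f32).sqrt()).collect()
}

fn cls(tokens: &[Vec<f32>]) -> Vec<f32> {
    tokens.first().cloned().unwrap_or_default()
}

fn last_token(tokens: &[Vec<f32>]) -> Vec<f32> {
    tokens.last().cloned().unwrap_or_default()
}

fn max(tokens: &[Vec<f32>]) -> Vec<f32> {
    let mut out = vec![f32::MIN; tokens.first().map_or(0, |t| t.len())];
    for token in tokens {
        for (o, v) in out.iter_mut().zip(token) {
            *o = o.max(*v);
        }
    }
    out
}

fn main() {
    let tokens = vec![vec![1.0, 4.0], vec![3.0, 0.0]];
    assert_eq!(mean(&tokens), vec![2.0, 2.0]);
    assert_eq!(cls(&tokens), vec![1.0, 4.0]);
    assert_eq!(last_token(&tokens), vec![3.0, 0.0]);
    assert_eq!(max(&tokens), vec![3.0, 4.0]);
    assert_eq!(mean_sqrt_len(&tokens).len(), 2);
}
```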
a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3047e6dfc..c2978f5db 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; -use std::sync::Arc; +use std::num::NonZeroUsize; +use std::sync::{Arc, Mutex}; use std::time::Instant; use arroy::distances::{BinaryQuantizedCosine, Cosine}; @@ -9,11 +10,14 @@ use heed::{RoTxn, RwTxn, Unspecified}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; use self::error::{EmbedError, NewEmbedderError}; +use crate::progress::Progress; use crate::prompt::{Prompt, PromptData}; use crate::ThreadPoolNoAbort; +pub mod composite; pub mod error; pub mod hf; pub mod json_template; @@ -30,6 +34,7 @@ pub use self::error::Error; pub type Embedding = Vec; pub const REQUEST_PARALLELISM: usize = 40; +pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01; pub struct ArroyWrapper { quantized: bool, @@ -54,7 +59,7 @@ impl ArroyWrapper { &'a self, rtxn: &'a RoTxn<'a>, db: arroy::Database, - ) -> impl Iterator, arroy::Error>> + 'a { + ) -> impl Iterator, arroy::Error>> + 'a { arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { match arroy::Reader::open(rtxn, index, db) { Ok(reader) => match reader.is_empty(rtxn) { @@ -77,12 +82,15 @@ impl ArroyWrapper { } } + #[allow(clippy::too_many_arguments)] pub fn build_and_quantize( &mut self, wtxn: &mut RwTxn, + progress: &Progress, rng: &mut R, dimension: usize, quantizing: bool, + arroy_memory: Option, cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { @@ -102,9 +110,19 @@ impl ArroyWrapper { // sensitive. if quantizing && !self.quantized { let writer = writer.prepare_changing_distance::(wtxn)?; - writer.builder(rng).cancel(cancel).build(wtxn)?; + writer + .builder(rng) + .available_memory(arroy_memory.unwrap_or(usize::MAX)) + .progress(|step| progress.update_progress_from_arroy(step)) + .cancel(cancel) + .build(wtxn)?; } else if writer.need_build(wtxn)? { - writer.builder(rng).cancel(cancel).build(wtxn)?; + writer + .builder(rng) + .available_memory(arroy_memory.unwrap_or(usize::MAX)) + .progress(|step| progress.update_progress_from_arroy(step)) + .cancel(cancel) + .build(wtxn)?; } else if writer.is_empty(wtxn)? { break; } @@ -409,8 +427,43 @@ impl ArroyWrapper { fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } + + pub fn aggregate_stats( + &self, + rtxn: &RoTxn, + stats: &mut ArroyStats, + ) -> Result<(), arroy::Error> { + if self.quantized { + for reader in self.readers(rtxn, self.quantized_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } else { + for reader in self.readers(rtxn, self.angular_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } + + Ok(()) + } } +#[derive(Debug, Default, Clone)] +pub struct ArroyStats { + pub number_of_embeddings: u64, + pub documents: RoaringBitmap, +} /// One or multiple embeddings stored consecutively in a flat vector. pub struct Embeddings { data: Vec, @@ -475,7 +528,7 @@ impl Embeddings { Ok(()) } - /// Append a flat vector of embeddings a the end of the embeddings. 
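The new `aggregate_stats` simply unions document ids and sums embedding counts across the per-embedder arroy readers. A standalone sketch with the readers replaced by plain bitmaps (requires the `roaring` crate the real code uses):

```rust
use roaring::RoaringBitmap;

#[derive(Debug, Default)]
struct ArroyStats {
    number_of_embeddings: u64,
    documents: RoaringBitmap,
}

fn main() {
    // Hypothetical item-id sets, one per arroy reader.
    let readers: Vec<RoaringBitmap> = vec![(0u32..100).collect(), (50u32..120).collect()];

    let mut stats = ArroyStats::default();
    for documents in readers {
        if documents.is_empty() {
            break;
        }
        stats.number_of_embeddings += documents.len();
        stats.documents |= documents;
    }

    assert_eq!(stats.number_of_embeddings, 170);
    assert_eq!(stats.documents.len(), 120);
}
```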
+ /// Append a flat vector of embeddings at the end of the embeddings. /// /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { @@ -500,6 +553,48 @@ pub enum Embedder { Ollama(ollama::Embedder), /// An embedder based on making embedding queries against a generic JSON/REST embedding server. Rest(rest::Embedder), + /// An embedder composed of an embedder at search time and an embedder at indexing time. + Composite(composite::Embedder), +} + +#[derive(Debug)] +struct EmbeddingCache { + data: Option>>, +} + +impl EmbeddingCache { + const MAX_TEXT_LEN: usize = 2000; + + pub fn new(cap: usize) -> Self { + let data = NonZeroUsize::new(cap).map(lru::LruCache::new).map(Mutex::new); + Self { data } + } + + /// Get the embedding corresponding to `text`, if any is present in the cache. + pub fn get(&self, text: &str) -> Option { + let data = self.data.as_ref()?; + if text.len() > Self::MAX_TEXT_LEN { + return None; + } + let mut cache = data.lock().unwrap(); + + cache.get(text).cloned() + } + + /// Puts a new embedding for the specified `text` + pub fn put(&self, text: String, embedding: Embedding) { + let Some(data) = self.data.as_ref() else { + return; + }; + if text.len() > Self::MAX_TEXT_LEN { + return; + } + tracing::trace!(text, "embedding added to cache"); + + let mut cache = data.lock().unwrap(); + + cache.put(text, embedding); + } } /// Configuration for an embedder. @@ -569,6 +664,7 @@ pub enum EmbedderOptions { Ollama(ollama::EmbedderOptions), UserProvided(manual::EmbedderOptions), Rest(rest::EmbedderOptions), + Composite(composite::EmbedderOptions), } impl Default for EmbedderOptions { @@ -579,75 +675,102 @@ impl Default for EmbedderOptions { impl Embedder { /// Spawns a new embedder built from its options. - pub fn new(options: EmbedderOptions) -> std::result::Result { + pub fn new( + options: EmbedderOptions, + cache_cap: usize, + ) -> std::result::Result { Ok(match options { - EmbedderOptions::HuggingFace(options) => Self::HuggingFace(hf::Embedder::new(options)?), - EmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?), - EmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?), + EmbedderOptions::HuggingFace(options) => { + Self::HuggingFace(hf::Embedder::new(options, cache_cap)?) + } + EmbedderOptions::OpenAi(options) => { + Self::OpenAi(openai::Embedder::new(options, cache_cap)?) + } + EmbedderOptions::Ollama(options) => { + Self::Ollama(ollama::Embedder::new(options, cache_cap)?) + } EmbedderOptions::UserProvided(options) => { Self::UserProvided(manual::Embedder::new(options)) } - EmbedderOptions::Rest(options) => { - Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?) + EmbedderOptions::Rest(options) => Self::Rest(rest::Embedder::new( + options, + cache_cap, + rest::ConfigurationSource::User, + )?), + EmbedderOptions::Composite(options) => { + Self::Composite(composite::Embedder::new(options, cache_cap)?) } }) } - /// Embed one or multiple texts. - /// - /// Each text can be embedded as one or multiple embeddings. 
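Below is a standalone sketch of the `EmbeddingCache` introduced here: a capacity of 0 disables caching entirely (the inner LRU is `None`) and overly long texts are never cached. It mirrors the diff and needs the `lru` crate:

```rust
use std::num::NonZeroUsize;
use std::sync::Mutex;

struct EmbeddingCache {
    data: Option<Mutex<lru::LruCache<String, Vec<f32>>>>,
}

impl EmbeddingCache {
    const MAX_TEXT_LEN: usize = 2000;

    fn new(cap: usize) -> Self {
        // cap == 0 yields `None`, i.e. a disabled cache.
        Self { data: NonZeroUsize::new(cap).map(lru::LruCache::new).map(Mutex::new) }
    }

    fn get(&self, text: &str) -> Option<Vec<f32>> {
        let data = self.data.as_ref()?;
        if text.len() > Self::MAX_TEXT_LEN {
            return None;
        }
        data.lock().unwrap().get(text).cloned()
    }

    fn put(&self, text: String, embedding: Vec<f32>) {
        let Some(data) = self.data.as_ref() else { return };
        if text.len() > Self::MAX_TEXT_LEN {
            return;
        }
        data.lock().unwrap().put(text, embedding);
    }
}

fn main() {
    let disabled = EmbeddingCache::new(0); // cap 0: every lookup misses
    disabled.put("hello".into(), vec![1.0]);
    assert!(disabled.get("hello").is_none());

    let cache = EmbeddingCache::new(2);
    cache.put("hello".into(), vec![1.0]);
    assert_eq!(cache.get("hello"), Some(vec![1.0]));
}
```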
- pub fn embed( - &self, - texts: Vec, - deadline: Option, - ) -> std::result::Result, EmbedError> { - match self { - Embedder::HuggingFace(embedder) => embedder.embed(texts), - Embedder::OpenAi(embedder) => embedder.embed(&texts, deadline), - Embedder::Ollama(embedder) => embedder.embed(&texts, deadline), - Embedder::UserProvided(embedder) => embedder.embed(&texts), - Embedder::Rest(embedder) => embedder.embed(texts, deadline), - } - } + /// Embed in search context - pub fn embed_one( + #[tracing::instrument(level = "debug", skip_all, target = "search")] + pub fn embed_search( &self, - text: String, + text: &str, deadline: Option, ) -> std::result::Result { - let mut embedding = self.embed(vec![text], deadline)?; - let embedding = embedding.pop().ok_or_else(EmbedError::missing_embedding)?; + if let Some(cache) = self.cache() { + if let Some(embedding) = cache.get(text) { + tracing::trace!(text, "embedding found in cache"); + return Ok(embedding); + } + } + let embedding = match self { + Embedder::HuggingFace(embedder) => embedder.embed_one(text), + Embedder::OpenAi(embedder) => { + embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding) + } + Embedder::Ollama(embedder) => { + embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding) + } + Embedder::UserProvided(embedder) => embedder.embed_one(text), + Embedder::Rest(embedder) => embedder + .embed_ref(&[text], deadline)? + .pop() + .ok_or_else(EmbedError::missing_embedding), + Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline), + }?; + + if let Some(cache) = self.cache() { + cache.put(text.to_owned(), embedding.clone()); + } + Ok(embedding) } /// Embed multiple chunks of texts. /// /// Each chunk is composed of one or multiple texts. - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> std::result::Result>, EmbedError> { match self { - Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), - Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks, threads), - Embedder::Ollama(embedder) => embedder.embed_chunks(text_chunks, threads), - Embedder::UserProvided(embedder) => embedder.embed_chunks(text_chunks), - Embedder::Rest(embedder) => embedder.embed_chunks(text_chunks, threads), + Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), + Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads), + Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads), + Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks), + Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads), + Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads), } } - pub fn embed_chunks_ref( + /// Non-owning variant of [`Self::embed_index`]. 
+ pub fn embed_index_ref( &self, texts: &[&str], threads: &ThreadPoolNoAbort, ) -> std::result::Result, EmbedError> { match self { - Embedder::HuggingFace(embedder) => embedder.embed_chunks_ref(texts), - Embedder::OpenAi(embedder) => embedder.embed_chunks_ref(texts, threads), - Embedder::Ollama(embedder) => embedder.embed_chunks_ref(texts, threads), - Embedder::UserProvided(embedder) => embedder.embed_chunks_ref(texts), - Embedder::Rest(embedder) => embedder.embed_chunks_ref(texts, threads), + Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), + Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads), + Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads), + Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts), + Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads), + Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads), } } @@ -659,6 +782,7 @@ impl Embedder { Embedder::Ollama(embedder) => embedder.chunk_count_hint(), Embedder::UserProvided(_) => 100, Embedder::Rest(embedder) => embedder.chunk_count_hint(), + Embedder::Composite(embedder) => embedder.index.chunk_count_hint(), } } @@ -670,6 +794,7 @@ impl Embedder { Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), Embedder::UserProvided(_) => 1, Embedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::Composite(embedder) => embedder.index.prompt_count_in_chunk_hint(), } } @@ -681,6 +806,7 @@ impl Embedder { Embedder::Ollama(embedder) => embedder.dimensions(), Embedder::UserProvided(embedder) => embedder.dimensions(), Embedder::Rest(embedder) => embedder.dimensions(), + Embedder::Composite(embedder) => embedder.dimensions(), } } @@ -692,6 +818,7 @@ impl Embedder { Embedder::Ollama(embedder) => embedder.distribution(), Embedder::UserProvided(embedder) => embedder.distribution(), Embedder::Rest(embedder) => embedder.distribution(), + Embedder::Composite(embedder) => embedder.distribution(), } } @@ -702,6 +829,18 @@ impl Embedder { | Embedder::Ollama(_) | Embedder::Rest(_) => true, Embedder::UserProvided(_) => false, + Embedder::Composite(embedder) => embedder.index.uses_document_template(), + } + } + + fn cache(&self) -> Option<&EmbeddingCache> { + match self { + Embedder::HuggingFace(embedder) => Some(embedder.cache()), + Embedder::OpenAi(embedder) => Some(embedder.cache()), + Embedder::UserProvided(_) => None, + Embedder::Ollama(embedder) => Some(embedder.cache()), + Embedder::Rest(embedder) => Some(embedder.cache()), + Embedder::Composite(embedder) => embedder.search.cache(), } } } @@ -710,18 +849,20 @@ impl Embedder { /// /// The intended use is to make the similarity score more comparable to the regular ranking score. /// This allows to correct effects where results are too "packed" around a certain value. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Serialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Serialize, ToSchema)] #[serde(from = "DistributionShiftSerializable")] #[serde(into = "DistributionShiftSerializable")] pub struct DistributionShift { /// Value where the results are "packed". /// /// Similarity scores are translated so that they are packed around 0.5 instead + #[schema(value_type = f32)] pub current_mean: OrderedFloat, /// standard deviation of a similarity score. /// /// Set below 0.4 to make the results less packed around the mean, and above 0.4 to make them more packed. 
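To summarize the composite routing added across these methods: search-time calls go to the `search` sub-embedder (whose cache backs `embed_search`), while indexing-time calls go to the `index` sub-embedder. A toy sketch with stand-in types, not the real `composite::Embedder`:

```rust
struct SubEmbedder {
    name: &'static str,
}

impl SubEmbedder {
    fn embed_one(&self, text: &str) -> Vec<f32> {
        println!("{} embeds {text:?}", self.name);
        vec![0.0; 3]
    }
}

struct CompositeEmbedder {
    search: SubEmbedder,
    index: SubEmbedder,
}

impl CompositeEmbedder {
    // Queries always go through the search-time sub-embedder.
    fn embed_search(&self, text: &str) -> Vec<f32> {
        self.search.embed_one(text)
    }

    // Documents always go through the indexing-time sub-embedder.
    fn embed_index_ref(&self, texts: &[&str]) -> Vec<Vec<f32>> {
        texts.iter().map(|t| self.index.embed_one(t)).collect()
    }
}

fn main() {
    let embedder = CompositeEmbedder {
        search: SubEmbedder { name: "hugging-face (search)" },
        index: SubEmbedder { name: "rest (indexing)" },
    };
    embedder.embed_search("query");
    embedder.embed_index_ref(&["doc 1", "doc 2"]);
}
```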
+ #[schema(value_type = f32)] pub current_sigma: OrderedFloat, } diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index 7ee775cbf..8beae6205 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -5,7 +5,7 @@ use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::DistributionShift; +use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM}; use crate::error::FaultSource; use crate::vector::Embedding; use crate::ThreadPoolNoAbort; @@ -38,26 +38,47 @@ impl EmbedderOptions { dimensions, } } -} -impl Embedder { - pub fn new(options: EmbedderOptions) -> Result { - let model = options.embedding_model.as_str(); - let rest_embedder = match RestEmbedder::new( - RestEmbedderOptions { - api_key: options.api_key, - dimensions: options.dimensions, - distribution: options.distribution, - url: options.url.unwrap_or_else(get_ollama_path), - request: serde_json::json!({ + fn into_rest_embedder_config(self) -> Result { + let url = self.url.unwrap_or_else(get_ollama_path); + let model = self.embedding_model.as_str(); + + // **warning**: do not swap these two `if`s, as the second one is always true when the first one is. + let (request, response) = if url.ends_with("/api/embeddings") { + ( + serde_json::json!({ "model": model, "prompt": super::rest::REQUEST_PLACEHOLDER, }), - response: serde_json::json!({ + serde_json::json!({ "embedding": super::rest::RESPONSE_PLACEHOLDER, }), - headers: Default::default(), - }, + ) + } else if url.ends_with("/api/embed") { + ( + serde_json::json!({"model": model, "input": [super::rest::REQUEST_PLACEHOLDER, super::rest::REPEAT_PLACEHOLDER]}), + serde_json::json!({"embeddings": [super::rest::RESPONSE_PLACEHOLDER, super::rest::REPEAT_PLACEHOLDER]}), + ) + } else { + return Err(NewEmbedderError::ollama_unsupported_url(url)); + }; + Ok(RestEmbedderOptions { + api_key: self.api_key, + dimensions: self.dimensions, + distribution: self.distribution, + url, + request, + response, + headers: Default::default(), + }) + } +} + +impl Embedder { + pub fn new(options: EmbedderOptions, cache_cap: usize) -> Result { + let rest_embedder = match RestEmbedder::new( + options.into_rest_embedder_config()?, + cache_cap, super::rest::ConfigurationSource::Ollama, ) { Ok(embedder) => embedder, @@ -93,40 +114,58 @@ impl Embedder { } } - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
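The `ollama` source now accepts two endpoints with different payload shapes. A std-only sketch of the URL check that backs the new `OllamaUnsupportedUrl` error (stand-in types, same error wording as the diff):

```rust
#[derive(Debug, PartialEq)]
enum OllamaEndpoint {
    /// Legacy endpoint: one prompt per request, one embedding in the response.
    Embeddings,
    /// Newer endpoint: an array of inputs, an array of embeddings back.
    Embed,
}

fn detect_endpoint(url: &str) -> Result<OllamaEndpoint, String> {
    if url.ends_with("/api/embeddings") {
        Ok(OllamaEndpoint::Embeddings)
    } else if url.ends_with("/api/embed") {
        Ok(OllamaEndpoint::Embed)
    } else {
        Err(format!(
            "unsupported Ollama URL.\n - the URL must end with `/api/embed` or `/api/embeddings`\n - Got `{url}`"
        ))
    }
}

fn main() {
    assert_eq!(detect_endpoint("http://localhost:11434/api/embed"), Ok(OllamaEndpoint::Embed));
    assert_eq!(
        detect_endpoint("http://localhost:11434/api/embeddings"),
        Ok(OllamaEndpoint::Embeddings)
    );
    assert!(detect_endpoint("http://localhost:11434/v1/embeddings").is_err());
}
```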
+ } } - pub(crate) fn embed_chunks_ref( + pub(crate) fn embed_index_ref( &self, texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } pub fn chunk_count_hint(&self) -> usize { @@ -144,6 +183,10 @@ impl Embedder { pub fn distribution(&self) -> Option { self.rest_embedder.distribution() } + + pub(super) fn cache(&self) -> &EmbeddingCache { + self.rest_embedder.cache() + } } fn get_ollama_path() -> String { diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 7262bfef8..df29f6916 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -1,3 +1,4 @@ +use std::fmt; use std::time::Instant; use ordered_float::OrderedFloat; @@ -6,7 +7,7 @@ use rayon::slice::ParallelSlice as _; use super::error::{EmbedError, NewEmbedderError}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::DistributionShift; +use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM}; use crate::error::FaultSource; use crate::vector::error::EmbedErrorKind; use crate::vector::Embedding; @@ -168,7 +169,6 @@ fn infer_api_key() -> String { .unwrap_or_default() } -#[derive(Debug)] pub struct Embedder { tokenizer: tiktoken_rs::CoreBPE, rest_embedder: RestEmbedder, @@ -176,7 +176,7 @@ pub struct Embedder { } impl Embedder { - pub fn new(options: EmbedderOptions) -> Result { + pub fn new(options: EmbedderOptions, cache_cap: usize) -> Result { let mut inferred_api_key = Default::default(); let api_key = options.api_key.as_ref().unwrap_or_else(|| { inferred_api_key = infer_api_key(); @@ -201,6 +201,7 @@ impl Embedder { }), headers: Default::default(), }, + cache_cap, super::rest::ConfigurationSource::OpenAi, )?; @@ -250,40 +251,57 @@ impl Embedder { Ok(all_embeddings) } - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. 
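The same guard is repeated in every `embed_index*` method of the ollama, OpenAI, and REST embedders. A generic sketch of the rule, with `active_operations` as a stand-in for the `ThreadPoolNoAbort` method of that name (requires `rayon`):

```rust
use rayon::prelude::*;

const REQUEST_PARALLELISM: usize = 40;

fn embed_all<F>(texts: &[&str], active_operations: usize, embed_one: F) -> Vec<Vec<f32>>
where
    F: Fn(&str) -> Vec<f32> + Sync + Send,
{
    if active_operations >= REQUEST_PARALLELISM {
        // Sequential fallback: no new rayon jobs are spawned, which avoids
        // exhausting LMDB read transactions and deep stacks.
        texts.iter().map(|text| embed_one(text)).collect()
    } else {
        // Parallel path (the real code additionally goes through `threads.install`).
        texts.par_iter().map(|text| embed_one(text)).collect()
    }
}

fn main() {
    let fake_embed = |text: &str| vec![text.len() as f32];
    let texts = ["first document", "second document"];
    assert_eq!(embed_all(&texts, 0, fake_embed).len(), 2);
    assert_eq!(embed_all(&texts, REQUEST_PARALLELISM, fake_embed).len(), 2);
}
```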
+ if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } - pub(crate) fn embed_chunks_ref( + pub(crate) fn embed_index_ref( &self, texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } pub fn chunk_count_hint(&self) -> usize { @@ -301,4 +319,18 @@ impl Embedder { pub fn distribution(&self) -> Option { self.options.distribution() } + + pub(super) fn cache(&self) -> &EmbeddingCache { + self.rest_embedder.cache() + } +} + +impl fmt::Debug for Embedder { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Embedder") + .field("tokenizer", &"CoreBPE") + .field("rest_embedder", &self.rest_embedder) + .field("options", &self.options) + .finish() + } } diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index da41d1771..5fcb2912b 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -10,8 +10,6 @@ use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::{DocumentId, FieldId, InternalError, UserError}; -pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; - #[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum RawVectors<'doc> { diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 98be311d4..b87ac9f77 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -9,7 +9,9 @@ use serde::{Deserialize, Serialize}; use super::error::EmbedErrorKind; use super::json_template::ValueTemplate; -use super::{DistributionShift, EmbedError, Embedding, NewEmbedderError, REQUEST_PARALLELISM}; +use super::{ + DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM, +}; use crate::error::FaultSource; use crate::ThreadPoolNoAbort; @@ -75,6 +77,7 @@ pub struct Embedder { data: EmbedderData, dimensions: usize, distribution: Option, + cache: EmbeddingCache, } /// All data needed to perform requests and parse responses @@ 
-123,6 +126,7 @@ enum InputType { impl Embedder { pub fn new( options: EmbedderOptions, + cache_cap: usize, configuration_source: ConfigurationSource, ) -> Result { let bearer = options.api_key.as_deref().map(|api_key| format!("Bearer {api_key}")); @@ -130,6 +134,7 @@ impl Embedder { let client = ureq::AgentBuilder::new() .max_idle_connections(REQUEST_PARALLELISM * 2) .max_idle_connections_per_host(REQUEST_PARALLELISM * 2) + .timeout(std::time::Duration::from_secs(30)) .build(); let request = Request::new(options.request)?; @@ -151,7 +156,12 @@ impl Embedder { infer_dimensions(&data)? }; - Ok(Self { data, dimensions, distribution: options.distribution }) + Ok(Self { + data, + dimensions, + distribution: options.distribution, + cache: EmbeddingCache::new(cache_cap), + }) } pub fn embed( @@ -175,7 +185,7 @@ impl Embedder { pub fn embed_tokens( &self, - tokens: &[usize], + tokens: &[u32], deadline: Option, ) -> Result { let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline)?; @@ -183,40 +193,58 @@ impl Embedder { Ok(embeddings.pop().unwrap()) } - pub fn embed_chunks( + pub fn embed_index( &self, text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>, EmbedError> { - threads - .install(move || { - text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect() - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + text_chunks.into_iter().map(move |chunk| self.embed(chunk, None)).collect() + } else { + threads + .install(move || { + text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect() + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? + } } - pub(crate) fn embed_chunks_ref( + pub(crate) fn embed_index_ref( &self, texts: &[&str], threads: &ThreadPoolNoAbort, ) -> Result, EmbedError> { - threads - .install(move || { - let embeddings: Result>, _> = texts - .par_chunks(self.prompt_count_in_chunk_hint()) - .map(move |chunk| self.embed_ref(chunk, None)) - .collect(); + // This condition helps reduce the number of active rayon jobs + // so that we avoid consuming all the LMDB rtxns and avoid stack overflows. + if threads.active_operations() >= REQUEST_PARALLELISM { + let embeddings: Result>, _> = texts + .chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed_ref(chunk, None)) + .collect(); - let embeddings = embeddings?; - Ok(embeddings.into_iter().flatten().collect()) - }) - .map_err(|error| EmbedError { - kind: EmbedErrorKind::PanicInThreadPool(error), - fault: FaultSource::Bug, - })? + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + } else { + threads + .install(move || { + let embeddings: Result>, _> = texts + .par_chunks(self.prompt_count_in_chunk_hint()) + .map(move |chunk| self.embed_ref(chunk, None)) + .collect(); + + let embeddings = embeddings?; + Ok(embeddings.into_iter().flatten().collect()) + }) + .map_err(|error| EmbedError { + kind: EmbedErrorKind::PanicInThreadPool(error), + fault: FaultSource::Bug, + })? 
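The REST embedder's HTTP agent now also carries a 30-second timeout on top of the idle-connection limits. A standalone sketch of that agent construction (ureq 2.x, as used by the crate):

```rust
const REQUEST_PARALLELISM: usize = 40;

fn build_agent() -> ureq::Agent {
    ureq::AgentBuilder::new()
        .max_idle_connections(REQUEST_PARALLELISM * 2)
        .max_idle_connections_per_host(REQUEST_PARALLELISM * 2)
        // New in this diff: bound how long a single embedding request may take.
        .timeout(std::time::Duration::from_secs(30))
        .build()
}

fn main() {
    let _agent = build_agent();
}
```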
+ } } pub fn chunk_count_hint(&self) -> usize { @@ -237,6 +265,10 @@ impl Embedder { pub fn distribution(&self) -> Option { self.distribution } + + pub(super) fn cache(&self) -> &EmbeddingCache { + &self.cache + } } fn infer_dimensions(data: &EmbedderData) -> Result { diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index d1cf364a2..3948ad4d8 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -4,75 +4,542 @@ use std::num::NonZeroUsize; use deserr::Deserr; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; -use super::{ollama, openai, DistributionShift}; +use super::composite::SubEmbedderOptions; +use super::hf::OverridePooling; +use super::{ollama, openai, DistributionShift, EmbedderOptions}; use crate::prompt::{default_max_bytes, PromptData}; use crate::update::Setting; use crate::vector::EmbeddingConfig; use crate::UserError; -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub struct EmbeddingSettings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// The source used to provide the embeddings. + /// + /// Which embedder parameters are available and mandatory is determined by the value of this setting. + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings. + /// + /// # Defaults + /// + /// - Defaults to `openAi` pub source: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// The name of the model to use. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `ollama` + /// + /// # Availability + /// + /// - This parameter is available for sources `openAi`, `huggingFace`, `ollama` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings. + /// + /// # Defaults + /// + /// - For source `openAi`, defaults to `text-embedding-3-small` + /// - For source `huggingFace`, defaults to `BAAI/bge-base-en-v1.5` pub model: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// The revision (commit SHA1) of the model to use. + /// + /// If unspecified, Meilisearch picks the latest revision of the model. + /// + /// # Availability + /// + /// - This parameter is available for source `huggingFace` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + /// + /// # Defaults + /// + /// - When `model` is set to default, defaults to `617ca489d9e86b49b8167676d8220688b99db36e` + /// - Otherwise, defaults to `null` pub revision: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// The pooling method to use. 
+ /// + /// # Availability + /// + /// - This parameter is available for source `huggingFace` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + /// + /// # Defaults + /// + /// - Defaults to `useModel` + /// + /// # Compatibility Note + /// + /// - Embedders created before this parameter was available default to `forceMean` to preserve the existing behavior. + pub pooling: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The API key to pass to the remote embedder while making requests. + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama`, `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + /// + /// # Defaults + /// + /// - For source `openAi`, the key is read from `OPENAI_API_KEY`, then `MEILI_OPENAI_API_KEY`. + /// - For other sources, no bearer token is sent if this parameter is not set. + /// + /// # Note + /// + /// - This setting is partially hidden when returned by the settings pub api_key: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// The expected dimensions of the embeddings produced by this embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `userProvided` + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama`, `rest`, `userProvided` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When the source is `openAi`, changing the value of this parameter always regenerates embeddings + /// - 🌱 For other sources, changing the value of this parameter never regenerates embeddings + /// + /// # Defaults + /// + /// - For source `openAi`, the dimensions is the maximum allowed by the model. + /// - For sources `ollama` and `rest`, the dimensions are inferred by embedding a sample text. pub dimensions: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// Whether to binary quantize the embeddings of this embedder. + /// + /// Binary quantized embeddings are smaller than regular embeddings, which improves + /// disk usage and retrieval speed, at the cost of relevancy. + /// + /// # Availability + /// + /// - This parameter is available for all embedders + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When set to `true`, embeddings are not regenerated, but they are binary quantized, which takes time. + /// + /// # Defaults + /// + /// - Defaults to `false` + /// + /// # Note + /// + /// As binary quantization is a destructive operation, it is not possible to disable again this setting after + /// first enabling it. If you are unsure of whether the performance-relevancy tradeoff is right for you, + /// we recommend to use this parameter on a test index first. pub binary_quantized: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// A liquid template used to render documents to a text that can be embedded. + /// + /// Meillisearch interpolates the template for each document and sends the resulting text to the embedder. + /// The embedder then generates document vectors based on this text. 
+ /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `huggingFace`, `ollama` and `rest + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When modified, embeddings are regenerated for documents whose rendering through the template produces a different text. pub document_template: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// Rendered texts are truncated to this size. + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `huggingFace`, `ollama` and `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When increased, embeddings are regenerated for documents whose rendering through the template produces a different text. + /// - 🌱 When decreased, embeddings are never regenerated + /// + /// # Default + /// + /// - Defaults to 400 pub document_template_max_bytes: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// URL to reach the remote embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama` and `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 When modified for source `openAi`, embeddings are never regenerated + /// - šŸ—ļø When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// Template request to send to the remote embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings pub request: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + /// Template response indicating how to find the embeddings in the response from the remote embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings pub response: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option>)] + /// Additional headers to send to the remote embedder. + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings pub headers: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] + #[schema(value_type = Option)] + pub search_embedder: Setting, + + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + pub indexing_embedder: Setting, + + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// Affine transformation applied to the semantic score to make it more comparable to the ranking score. 
+ /// + /// # Availability + /// + /// - This parameter is available for all embedders + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings pub distribution: Setting, } -pub fn check_unset( - key: &Setting, - field: &'static str, - source: EmbedderSource, - embedder_name: &str, -) -> Result<(), UserError> { - if matches!(key, Setting::NotSet) { - Ok(()) - } else { - Err(UserError::InvalidFieldForSource { - embedder_name: embedder_name.to_owned(), - source_: source, - field, - allowed_fields_for_source: EmbeddingSettings::allowed_fields_for_source(source), - allowed_sources_for_field: EmbeddingSettings::allowed_sources_for_field(field), - }) - } +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] +#[serde(deny_unknown_fields, rename_all = "camelCase")] +#[deserr(rename_all = camelCase, deny_unknown_fields)] +pub struct SubEmbeddingSettings { + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The source used to provide the embeddings. + /// + /// Which embedder parameters are available and mandatory is determined by the value of this setting. + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings. + /// + /// # Defaults + /// + /// - Defaults to `openAi` + pub source: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The name of the model to use. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `ollama` + /// + /// # Availability + /// + /// - This parameter is available for sources `openAi`, `huggingFace`, `ollama` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings. + /// + /// # Defaults + /// + /// - For source `openAi`, defaults to `text-embedding-3-small` + /// - For source `huggingFace`, defaults to `BAAI/bge-base-en-v1.5` + pub model: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The revision (commit SHA1) of the model to use. + /// + /// If unspecified, Meilisearch picks the latest revision of the model. + /// + /// # Availability + /// + /// - This parameter is available for source `huggingFace` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + /// + /// # Defaults + /// + /// - When `model` is set to default, defaults to `617ca489d9e86b49b8167676d8220688b99db36e` + /// - Otherwise, defaults to `null` + pub revision: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The pooling method to use. + /// + /// # Availability + /// + /// - This parameter is available for source `huggingFace` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + /// + /// # Defaults + /// + /// - Defaults to `useModel` + /// + /// # Compatibility Note + /// + /// - Embedders created before this parameter was available default to `forceMean` to preserve the existing behavior. + pub pooling: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The API key to pass to the remote embedder while making requests. 
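For orientation, a hypothetical settings payload combining the features documented here: a `pooling` override on a Hugging Face sub-embedder plus the new `searchEmbedder`/`indexingEmbedder` pair. The `"composite"` source name and the concrete model, URL, and templates are assumptions for illustration; only the field names come from these structs (requires `serde_json`):

```rust
fn main() {
    let embedder_settings = serde_json::json!({
        "source": "composite",                    // assumed variant name
        "searchEmbedder": {
            "source": "huggingFace",
            "model": "BAAI/bge-base-en-v1.5",
            "pooling": "useModel"
        },
        "indexingEmbedder": {
            "source": "rest",
            "url": "http://localhost:8080/embed", // hypothetical endpoint
            "request": { "input": "{{text}}" },
            "response": { "embedding": "{{embedding}}" },
            "documentTemplate": "{{doc.title}}"
        }
    });
    println!("{}", serde_json::to_string_pretty(&embedder_settings).unwrap());
}
```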
+ /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama`, `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + /// + /// # Defaults + /// + /// - For source `openAi`, the key is read from `OPENAI_API_KEY`, then `MEILI_OPENAI_API_KEY`. + /// - For other sources, no bearer token is sent if this parameter is not set. + /// + /// # Note + /// + /// - This setting is partially hidden when returned by the settings + pub api_key: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// The expected dimensions of the embeddings produced by this embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `userProvided` + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama`, `rest`, `userProvided` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When the source is `openAi`, changing the value of this parameter always regenerates embeddings + /// - 🌱 For other sources, changing the value of this parameter never regenerates embeddings + /// + /// # Defaults + /// + /// - For source `openAi`, the dimensions are the maximum allowed by the model. + /// - For sources `ollama` and `rest`, the dimensions are inferred by embedding a sample text. + pub dimensions: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// A liquid template used to render documents to a text that can be embedded. + /// + /// Meilisearch interpolates the template for each document and sends the resulting text to the embedder. + /// The embedder then generates document vectors based on this text. + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `huggingFace`, `ollama` and `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When modified, embeddings are regenerated for documents whose rendering through the template produces a different text. + pub document_template: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// Rendered texts are truncated to this size. + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `huggingFace`, `ollama` and `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø When increased, embeddings are regenerated for documents whose rendering through the template produces a different text. + /// - 🌱 When decreased, embeddings are never regenerated + /// + /// # Default + /// + /// - Defaults to 400 + pub document_template_max_bytes: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// URL to reach the remote embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `openAi`, `ollama` and `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 When modified for source `openAi`, embeddings are never regenerated + /// - šŸ—ļø When modified for sources `ollama` and `rest`, embeddings are always regenerated + pub url: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// Template request to send to the remote embedder.
+ /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + pub request: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option)] + /// Template response indicating how to find the embeddings in the response from the remote embedder. + /// + /// # Mandatory + /// + /// - This parameter is mandatory for source `rest` + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - šŸ—ļø Changing the value of this parameter always regenerates embeddings + pub response: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + /// Additional headers to send to the remote embedder. + /// + /// # Availability + /// + /// - This parameter is available for source `rest` + /// + /// # šŸ”„ Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + pub headers: Setting>, + + // The following fields are provided for the sake of improving error handling + // They should always be set to `NotSet`, otherwise an error will be returned + #[serde(default, skip_serializing)] + #[deserr(default)] + #[schema(ignore)] + pub distribution: Setting, + + #[serde(default, skip_serializing)] + #[deserr(default)] + #[schema(ignore)] + pub binary_quantized: Setting, + + #[serde(default, skip_serializing)] + #[deserr(default)] + #[schema(ignore)] + pub search_embedder: Setting, + + #[serde(default, skip_serializing)] + #[deserr(default)] + #[schema(ignore)] + pub indexing_embedder: Setting, } /// Indicates what action should take place during a reindexing operation for an embedder @@ -150,12 +617,15 @@ impl SettingsDiff { mut source, mut model, mut revision, + mut pooling, mut api_key, mut dimensions, mut document_template, mut url, mut request, mut response, + mut search_embedder, + mut indexing_embedder, mut distribution, mut headers, mut document_template_max_bytes, @@ -166,12 +636,15 @@ impl SettingsDiff { source: new_source, model: new_model, revision: new_revision, + pooling: new_pooling, api_key: new_api_key, dimensions: new_dimensions, document_template: new_document_template, url: new_url, request: new_request, response: new_response, + search_embedder: new_search_embedder, + indexing_embedder: new_indexing_embedder, distribution: new_distribution, headers: new_headers, document_template_max_bytes: new_document_template_max_bytes, @@ -188,100 +661,59 @@ impl SettingsDiff { let mut reindex_action = None; - // **Warning**: do not use short-circuiting || here, we want all these operations applied - if source.apply(new_source) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - // when the source changes, we need to reapply the default settings for the new source - apply_default_for_source( - &source, - &mut model, - &mut revision, - &mut dimensions, - &mut url, - &mut request, - &mut response, - &mut document_template, - &mut document_template_max_bytes, - &mut headers, - ) - } - if model.apply(new_model) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if revision.apply(new_revision) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if 
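// Illustrative sketch, not part of the actual change: one way the new `searchEmbedder` /
// `indexingEmbedder` fields above might be combined into a `composite` embedder configuration.
// Field names follow the camelCase renames in this diff; the embedder name, the liquid template,
// and the choice of model are hypothetical examples (the model, pooling, and max-bytes values are
// the defaults documented above). Per the `field_status` table later in this diff,
// `documentTemplate`/`documentTemplateMaxBytes` are rejected inside `searchEmbedder`, and
// `distribution`/`binaryQuantized` may only appear at the top level. Built with `serde_json`
// purely for illustration.
fn main() {
    let embedders = serde_json::json!({
        "default": {                                    // hypothetical embedder name
            "source": "composite",
            "searchEmbedder": {
                "source": "huggingFace",
                "model": "BAAI/bge-base-en-v1.5",       // documented default model
                "pooling": "useModel"                   // documented default pooling
            },
            "indexingEmbedder": {
                "source": "huggingFace",
                "model": "BAAI/bge-base-en-v1.5",
                "documentTemplate": "{{ doc.title }}",  // hypothetical liquid template
                "documentTemplateMaxBytes": 400         // documented default
            }
        }
    });
    println!("{}", serde_json::to_string_pretty(&embedders).unwrap());
}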
dimensions.apply(new_dimensions) { - match source { - // regenerate on dimensions change in OpenAI since truncation is supported - Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { - ReindexAction::push_action( - &mut reindex_action, - ReindexAction::FullReindex, - ); - } - // for all other embedders, the parameter is a hint that should not be able to change the result - // and so won't cause a reindex by itself. - _ => {} - } - } + Self::apply_and_diff( + &mut reindex_action, + &mut source, + &mut model, + &mut revision, + &mut pooling, + &mut api_key, + &mut dimensions, + &mut document_template, + &mut document_template_max_bytes, + &mut url, + &mut request, + &mut response, + &mut headers, + new_source, + new_model, + new_revision, + new_pooling, + new_api_key, + new_dimensions, + new_document_template, + new_document_template_max_bytes, + new_url, + new_request, + new_response, + new_headers, + ); + let binary_quantize_changed = binary_quantize.apply(new_binary_quantize); - if url.apply(new_url) { - match source { - // do not regenerate on an url change in OpenAI - Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {} - _ => { - ReindexAction::push_action( - &mut reindex_action, - ReindexAction::FullReindex, - ); - } - } - } - if request.apply(new_request) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if response.apply(new_response) { - ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); - } - if document_template.apply(new_document_template) { - ReindexAction::push_action( - &mut reindex_action, - ReindexAction::RegeneratePrompts, - ); - } - if document_template_max_bytes.apply(new_document_template_max_bytes) { - let previous_document_template_max_bytes = - document_template_max_bytes.set().unwrap_or(default_max_bytes().get()); - let new_document_template_max_bytes = - new_document_template_max_bytes.set().unwrap_or(default_max_bytes().get()); - - // only reindex if the size increased. 
Reasoning: - // - size decrease is a performance optimization, so we don't reindex and we keep the more accurate vectors - // - size increase is an accuracy optimization, so we want to reindex - if new_document_template_max_bytes > previous_document_template_max_bytes { - ReindexAction::push_action( - &mut reindex_action, - ReindexAction::RegeneratePrompts, - ) - } - } + // changes to the *search* embedder never triggers any reindexing + search_embedder.apply(new_search_embedder); + indexing_embedder = Self::from_sub_settings( + indexing_embedder, + new_indexing_embedder, + &mut reindex_action, + )?; distribution.apply(new_distribution); - api_key.apply(new_api_key); - headers.apply(new_headers); let updated_settings = EmbeddingSettings { source, model, revision, + pooling, api_key, dimensions, document_template, url, request, response, + search_embedder, + indexing_embedder, distribution, headers, document_template_max_bytes, @@ -307,6 +739,223 @@ impl SettingsDiff { }; Ok(ret) } + + fn from_sub_settings( + sub_embedder: Setting, + new_sub_embedder: Setting, + reindex_action: &mut Option, + ) -> Result, UserError> { + let ret = match new_sub_embedder { + Setting::Set(new_sub_embedder) => { + let Setting::Set(SubEmbeddingSettings { + mut source, + mut model, + mut revision, + mut pooling, + mut api_key, + mut dimensions, + mut document_template, + mut document_template_max_bytes, + mut url, + mut request, + mut response, + mut headers, + // phony settings + mut distribution, + mut binary_quantized, + mut search_embedder, + mut indexing_embedder, + }) = sub_embedder + else { + // return the new_indexing_embedder if the indexing_embedder was not set + // this should happen only when changing the source, so the decision to reindex is already taken. + return Ok(Setting::Set(new_sub_embedder)); + }; + + let SubEmbeddingSettings { + source: new_source, + model: new_model, + revision: new_revision, + pooling: new_pooling, + api_key: new_api_key, + dimensions: new_dimensions, + document_template: new_document_template, + document_template_max_bytes: new_document_template_max_bytes, + url: new_url, + request: new_request, + response: new_response, + headers: new_headers, + distribution: new_distribution, + binary_quantized: new_binary_quantized, + search_embedder: new_search_embedder, + indexing_embedder: new_indexing_embedder, + } = new_sub_embedder; + + Self::apply_and_diff( + reindex_action, + &mut source, + &mut model, + &mut revision, + &mut pooling, + &mut api_key, + &mut dimensions, + &mut document_template, + &mut document_template_max_bytes, + &mut url, + &mut request, + &mut response, + &mut headers, + new_source, + new_model, + new_revision, + new_pooling, + new_api_key, + new_dimensions, + new_document_template, + new_document_template_max_bytes, + new_url, + new_request, + new_response, + new_headers, + ); + + // update phony settings, it is always an error to have them set. 
+ distribution.apply(new_distribution); + binary_quantized.apply(new_binary_quantized); + search_embedder.apply(new_search_embedder); + indexing_embedder.apply(new_indexing_embedder); + + let updated_settings = SubEmbeddingSettings { + source, + model, + revision, + pooling, + api_key, + dimensions, + document_template, + url, + request, + response, + headers, + document_template_max_bytes, + distribution, + binary_quantized, + search_embedder, + indexing_embedder, + }; + Setting::Set(updated_settings) + } + // handled during validation of the settings + Setting::Reset | Setting::NotSet => sub_embedder, + }; + Ok(ret) + } + + #[allow(clippy::too_many_arguments)] + fn apply_and_diff( + reindex_action: &mut Option, + source: &mut Setting, + model: &mut Setting, + revision: &mut Setting, + pooling: &mut Setting, + api_key: &mut Setting, + dimensions: &mut Setting, + document_template: &mut Setting, + document_template_max_bytes: &mut Setting, + url: &mut Setting, + request: &mut Setting, + response: &mut Setting, + headers: &mut Setting>, + new_source: Setting, + new_model: Setting, + new_revision: Setting, + new_pooling: Setting, + new_api_key: Setting, + new_dimensions: Setting, + new_document_template: Setting, + new_document_template_max_bytes: Setting, + new_url: Setting, + new_request: Setting, + new_response: Setting, + new_headers: Setting>, + ) { + // **Warning**: do not use short-circuiting || here, we want all these operations applied + if source.apply(new_source) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + // when the source changes, we need to reapply the default settings for the new source + apply_default_for_source( + &*source, + model, + revision, + pooling, + dimensions, + url, + request, + response, + document_template, + document_template_max_bytes, + headers, + // send dummy values, the source cannot recursively be composite + &mut Setting::NotSet, + &mut Setting::NotSet, + ) + } + if model.apply(new_model) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + if revision.apply(new_revision) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + if pooling.apply(new_pooling) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + if dimensions.apply(new_dimensions) { + match *source { + // regenerate on dimensions change in OpenAI since truncation is supported + Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + // for all other embedders, the parameter is a hint that should not be able to change the result + // and so won't cause a reindex by itself. 
+ _ => {} + } + } + if url.apply(new_url) { + match *source { + // do not regenerate on an url change in OpenAI + Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {} + _ => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + } + } + if request.apply(new_request) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + if response.apply(new_response) { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + } + if document_template.apply(new_document_template) { + ReindexAction::push_action(reindex_action, ReindexAction::RegeneratePrompts); + } + + if document_template_max_bytes.apply(new_document_template_max_bytes) { + let previous_document_template_max_bytes = + document_template_max_bytes.set().unwrap_or(default_max_bytes().get()); + let new_document_template_max_bytes = + new_document_template_max_bytes.set().unwrap_or(default_max_bytes().get()); + + // only reindex if the size increased. Reasoning: + // - size decrease is a performance optimization, so we don't reindex and we keep the more accurate vectors + // - size increase is an accuracy optimization, so we want to reindex + if new_document_template_max_bytes > previous_document_template_max_bytes { + ReindexAction::push_action(reindex_action, ReindexAction::RegeneratePrompts) + } + } + + api_key.apply(new_api_key); + headers.apply(new_headers); + } } impl ReindexAction { @@ -324,6 +973,7 @@ fn apply_default_for_source( source: &Setting, model: &mut Setting, revision: &mut Setting, + pooling: &mut Setting, dimensions: &mut Setting, url: &mut Setting, request: &mut Setting, @@ -331,47 +981,62 @@ fn apply_default_for_source( document_template: &mut Setting, document_template_max_bytes: &mut Setting, headers: &mut Setting>, + search_embedder: &mut Setting, + indexing_embedder: &mut Setting, ) { match source { Setting::Set(EmbedderSource::HuggingFace) => { *model = Setting::Reset; *revision = Setting::Reset; + *pooling = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; + *search_embedder = Setting::NotSet; + *indexing_embedder = Setting::NotSet; } Setting::Set(EmbedderSource::Ollama) => { *model = Setting::Reset; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; + *search_embedder = Setting::NotSet; + *indexing_embedder = Setting::NotSet; } Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { *model = Setting::Reset; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; + *search_embedder = Setting::NotSet; + *indexing_embedder = Setting::NotSet; } Setting::Set(EmbedderSource::Rest) => { *model = Setting::NotSet; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; *request = Setting::Reset; *response = Setting::Reset; *headers = Setting::Reset; + *search_embedder = Setting::NotSet; + *indexing_embedder = Setting::NotSet; } Setting::Set(EmbedderSource::UserProvided) => { *model = Setting::NotSet; *revision = Setting::NotSet; + *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; *request = Setting::NotSet; @@ -379,141 +1044,374 @@ fn apply_default_for_source( 
*document_template = Setting::NotSet; *document_template_max_bytes = Setting::NotSet; *headers = Setting::NotSet; + *search_embedder = Setting::NotSet; + *indexing_embedder = Setting::NotSet; + } + Setting::Set(EmbedderSource::Composite) => { + *model = Setting::NotSet; + *revision = Setting::NotSet; + *pooling = Setting::NotSet; + *dimensions = Setting::NotSet; + *url = Setting::NotSet; + *request = Setting::NotSet; + *response = Setting::NotSet; + *document_template = Setting::NotSet; + *document_template_max_bytes = Setting::NotSet; + *headers = Setting::NotSet; + *search_embedder = Setting::Reset; + *indexing_embedder = Setting::Reset; } Setting::NotSet => {} } } -pub fn check_set( - key: &Setting, - field: &'static str, - source: EmbedderSource, - embedder_name: &str, -) -> Result<(), UserError> { - if matches!(key, Setting::Set(_)) { - Ok(()) - } else { - Err(UserError::MissingFieldForSource { - field, - source_: source, - embedder_name: embedder_name.to_owned(), - }) +pub(crate) enum FieldStatus { + Mandatory, + Allowed, + Disallowed, +} + +#[derive(Debug, Clone, Copy)] +pub enum NestingContext { + NotNested, + Search, + Indexing, +} + +impl NestingContext { + pub fn embedder_name_with_context(&self, embedder_name: &str) -> String { + match self { + NestingContext::NotNested => embedder_name.to_string(), + NestingContext::Search => format!("{embedder_name}.searchEmbedder"), + NestingContext::Indexing => format!("{embedder_name}.indexingEmbedder",), + } + } + + pub fn in_context(&self) -> &'static str { + match self { + NestingContext::NotNested => "", + NestingContext::Search => " for the search embedder", + NestingContext::Indexing => " for the indexing embedder", + } + } + + pub fn nesting_embedders(&self) -> &'static str { + match self { + NestingContext::NotNested => "", + NestingContext::Search => { + "\n - note: nesting embedders in `searchEmbedder` is not allowed" + } + NestingContext::Indexing => { + "\n - note: nesting embedders in `indexingEmbedder` is not allowed" + } + } + } +} + +#[derive(Debug, Clone, Copy, enum_iterator::Sequence)] +pub enum MetaEmbeddingSetting { + Source, + Model, + Revision, + Pooling, + ApiKey, + Dimensions, + DocumentTemplate, + DocumentTemplateMaxBytes, + Url, + Request, + Response, + Headers, + SearchEmbedder, + IndexingEmbedder, + Distribution, + BinaryQuantized, +} + +impl MetaEmbeddingSetting { + pub(crate) fn name(&self) -> &'static str { + use MetaEmbeddingSetting::*; + match self { + Source => "source", + Model => "model", + Revision => "revision", + Pooling => "pooling", + ApiKey => "apiKey", + Dimensions => "dimensions", + DocumentTemplate => "documentTemplate", + DocumentTemplateMaxBytes => "documentTemplateMaxBytes", + Url => "url", + Request => "request", + Response => "response", + Headers => "headers", + SearchEmbedder => "searchEmbedder", + IndexingEmbedder => "indexingEmbedder", + Distribution => "distribution", + BinaryQuantized => "binaryQuantized", + } } } impl EmbeddingSettings { - pub const SOURCE: &'static str = "source"; - pub const MODEL: &'static str = "model"; - pub const REVISION: &'static str = "revision"; - pub const API_KEY: &'static str = "apiKey"; - pub const DIMENSIONS: &'static str = "dimensions"; - pub const DOCUMENT_TEMPLATE: &'static str = "documentTemplate"; - pub const DOCUMENT_TEMPLATE_MAX_BYTES: &'static str = "documentTemplateMaxBytes"; + #[allow(clippy::too_many_arguments)] + pub(crate) fn check_settings( + embedder_name: &str, + source: EmbedderSource, + context: NestingContext, + model: &Setting, + 
revision: &Setting, + pooling: &Setting, + dimensions: &Setting, + api_key: &Setting, + url: &Setting, + request: &Setting, + response: &Setting, + document_template: &Setting, + document_template_max_bytes: &Setting, + headers: &Setting>, + search_embedder: &Setting, + indexing_embedder: &Setting, + binary_quantized: &Setting, + distribution: &Setting, + ) -> Result<(), UserError> { + Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Model, context, model)?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Revision, + context, + revision, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Pooling, + context, + pooling, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Dimensions, + context, + dimensions, + )?; + Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?; + Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Request, + context, + request, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Response, + context, + response, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::DocumentTemplate, + context, + document_template, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::DocumentTemplateMaxBytes, + context, + document_template_max_bytes, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Headers, + context, + headers, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::SearchEmbedder, + context, + search_embedder, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::IndexingEmbedder, + context, + indexing_embedder, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::BinaryQuantized, + context, + binary_quantized, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::Distribution, + context, + distribution, + ) + } - pub const URL: &'static str = "url"; - pub const REQUEST: &'static str = "request"; - pub const RESPONSE: &'static str = "response"; - pub const HEADERS: &'static str = "headers"; + pub(crate) fn allowed_sources_for_field( + field: MetaEmbeddingSetting, + context: NestingContext, + ) -> Vec { + enum_iterator::all() + .filter(|source| { + !matches!(Self::field_status(*source, field, context), FieldStatus::Disallowed) + }) + .collect() + } - pub const DISTRIBUTION: &'static str = "distribution"; + pub(crate) fn allowed_fields_for_source( + source: EmbedderSource, + context: NestingContext, + ) -> Vec<&'static str> { + enum_iterator::all() + .filter(|field| { + !matches!(Self::field_status(source, *field, context), FieldStatus::Disallowed) + }) + .map(|field| field.name()) + .collect() + } - pub const BINARY_QUANTIZED: &'static str = "binaryQuantized"; - - pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { - match field { - Self::SOURCE => &[ - EmbedderSource::HuggingFace, - EmbedderSource::OpenAi, - EmbedderSource::UserProvided, - EmbedderSource::Rest, - EmbedderSource::Ollama, - ], - Self::MODEL => { - &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi, EmbedderSource::Ollama] - } - Self::REVISION => &[EmbedderSource::HuggingFace], - Self::API_KEY => { - &[EmbedderSource::OpenAi, EmbedderSource::Ollama, EmbedderSource::Rest] - } - Self::DIMENSIONS => &[ - 
EmbedderSource::OpenAi, - EmbedderSource::UserProvided, - EmbedderSource::Ollama, - EmbedderSource::Rest, - ], - Self::DOCUMENT_TEMPLATE => &[ - EmbedderSource::HuggingFace, - EmbedderSource::OpenAi, - EmbedderSource::Ollama, - EmbedderSource::Rest, - ], - Self::URL => &[EmbedderSource::Ollama, EmbedderSource::Rest, EmbedderSource::OpenAi], - Self::REQUEST => &[EmbedderSource::Rest], - Self::RESPONSE => &[EmbedderSource::Rest], - Self::HEADERS => &[EmbedderSource::Rest], - Self::DISTRIBUTION => &[ - EmbedderSource::HuggingFace, - EmbedderSource::Ollama, - EmbedderSource::OpenAi, - EmbedderSource::Rest, - EmbedderSource::UserProvided, - ], - Self::BINARY_QUANTIZED => &[ - EmbedderSource::HuggingFace, - EmbedderSource::Ollama, - EmbedderSource::OpenAi, - EmbedderSource::Rest, - EmbedderSource::UserProvided, - ], - _other => unreachable!("unknown field"), + fn check_setting( + embedder_name: &str, + source: EmbedderSource, + field: MetaEmbeddingSetting, + context: NestingContext, + setting: &Setting, + ) -> Result<(), UserError> { + match (Self::field_status(source, field, context), setting) { + (FieldStatus::Mandatory, Setting::Set(_)) + | (FieldStatus::Allowed, _) + | (FieldStatus::Disallowed, Setting::NotSet) => Ok(()), + (FieldStatus::Disallowed, _) => Err(UserError::InvalidFieldForSource { + embedder_name: context.embedder_name_with_context(embedder_name), + source_: source, + context, + field, + }), + (FieldStatus::Mandatory, _) => Err(UserError::MissingFieldForSource { + field: field.name(), + source_: source, + embedder_name: embedder_name.to_owned(), + }), } } - pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] { - match source { - EmbedderSource::OpenAi => &[ - Self::SOURCE, - Self::MODEL, - Self::API_KEY, - Self::DOCUMENT_TEMPLATE, - Self::DIMENSIONS, - Self::DISTRIBUTION, - Self::URL, - Self::BINARY_QUANTIZED, - ], - EmbedderSource::HuggingFace => &[ - Self::SOURCE, - Self::MODEL, - Self::REVISION, - Self::DOCUMENT_TEMPLATE, - Self::DISTRIBUTION, - Self::BINARY_QUANTIZED, - ], - EmbedderSource::Ollama => &[ - Self::SOURCE, - Self::MODEL, - Self::DOCUMENT_TEMPLATE, - Self::URL, - Self::API_KEY, - Self::DIMENSIONS, - Self::DISTRIBUTION, - Self::BINARY_QUANTIZED, - ], - EmbedderSource::UserProvided => { - &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED] + pub(crate) fn field_status( + source: EmbedderSource, + field: MetaEmbeddingSetting, + context: NestingContext, + ) -> FieldStatus { + use EmbedderSource::*; + use MetaEmbeddingSetting::*; + use NestingContext::*; + match (source, field, context) { + (_, Distribution | BinaryQuantized, NotNested) => FieldStatus::Allowed, + (_, Distribution | BinaryQuantized, _) => FieldStatus::Disallowed, + (_, DocumentTemplate | DocumentTemplateMaxBytes, Search) => FieldStatus::Disallowed, + ( + OpenAi, + Source + | Model + | ApiKey + | DocumentTemplate + | DocumentTemplateMaxBytes + | Dimensions + | Url, + _, + ) => FieldStatus::Allowed, + ( + OpenAi, + Revision | Pooling | Request | Response | Headers | SearchEmbedder + | IndexingEmbedder, + _, + ) => FieldStatus::Disallowed, + ( + HuggingFace, + Source | Model | Revision | Pooling | DocumentTemplate | DocumentTemplateMaxBytes, + _, + ) => FieldStatus::Allowed, + ( + HuggingFace, + ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder + | IndexingEmbedder, + _, + ) => FieldStatus::Disallowed, + (Ollama, Model, _) => FieldStatus::Mandatory, + ( + Ollama, + Source | DocumentTemplate | DocumentTemplateMaxBytes | Url | 
ApiKey | Dimensions, + _, + ) => FieldStatus::Allowed, + ( + Ollama, + Revision | Pooling | Request | Response | Headers | SearchEmbedder + | IndexingEmbedder, + _, + ) => FieldStatus::Disallowed, + (UserProvided, Dimensions, _) => FieldStatus::Mandatory, + (UserProvided, Source, _) => FieldStatus::Allowed, + ( + UserProvided, + Model + | Revision + | Pooling + | ApiKey + | DocumentTemplate + | DocumentTemplateMaxBytes + | Url + | Request + | Response + | Headers + | SearchEmbedder + | IndexingEmbedder, + _, + ) => FieldStatus::Disallowed, + (Rest, Url | Request | Response, _) => FieldStatus::Mandatory, + ( + Rest, + Source + | ApiKey + | Dimensions + | DocumentTemplate + | DocumentTemplateMaxBytes + | Headers, + _, + ) => FieldStatus::Allowed, + (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => { + FieldStatus::Disallowed } - EmbedderSource::Rest => &[ - Self::SOURCE, - Self::API_KEY, - Self::DIMENSIONS, - Self::DOCUMENT_TEMPLATE, - Self::URL, - Self::REQUEST, - Self::RESPONSE, - Self::HEADERS, - Self::DISTRIBUTION, - Self::BINARY_QUANTIZED, - ], + (Composite, SearchEmbedder | IndexingEmbedder, _) => FieldStatus::Mandatory, + (Composite, Source, _) => FieldStatus::Allowed, + ( + Composite, + Model + | Revision + | Pooling + | ApiKey + | Dimensions + | DocumentTemplate + | DocumentTemplateMaxBytes + | Url + | Request + | Response + | Headers, + _, + ) => FieldStatus::Disallowed, } } @@ -537,9 +1435,45 @@ impl EmbeddingSettings { *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned()) } } + + pub(crate) fn check_nested_source( + embedder_name: &str, + source: EmbedderSource, + context: NestingContext, + ) -> Result<(), UserError> { + match (context, source) { + (NestingContext::NotNested, _) => Ok(()), + ( + NestingContext::Search | NestingContext::Indexing, + EmbedderSource::Composite | EmbedderSource::UserProvided, + ) => Err(UserError::InvalidSourceForNested { + embedder_name: context.embedder_name_with_context(embedder_name), + source_: source, + }), + ( + NestingContext::Search | NestingContext::Indexing, + EmbedderSource::OpenAi + | EmbedderSource::HuggingFace + | EmbedderSource::Ollama + | EmbedderSource::Rest, + ) => Ok(()), + } + } } -#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] +#[derive( + Debug, + Clone, + Copy, + Default, + Serialize, + Deserialize, + PartialEq, + Eq, + Deserr, + ToSchema, + enum_iterator::Sequence, +)] #[serde(deny_unknown_fields, rename_all = "camelCase")] #[deserr(rename_all = camelCase, deny_unknown_fields)] pub enum EmbedderSource { @@ -549,6 +1483,7 @@ pub enum EmbedderSource { Ollama, UserProvided, Rest, + Composite, } impl std::fmt::Display for EmbedderSource { @@ -559,119 +1494,311 @@ impl std::fmt::Display for EmbedderSource { EmbedderSource::UserProvided => "userProvided", EmbedderSource::Ollama => "ollama", EmbedderSource::Rest => "rest", + EmbedderSource::Composite => "composite", }; f.write_str(s) } } +impl EmbeddingSettings { + fn from_hugging_face( + super::hf::EmbedderOptions { + model, + revision, + distribution, + pooling, + }: super::hf::EmbedderOptions, + document_template: Setting, + document_template_max_bytes: Setting, + quantized: Option, + ) -> Self { + Self { + source: Setting::Set(EmbedderSource::HuggingFace), + model: Setting::Set(model), + revision: Setting::some_or_not_set(revision), + pooling: Setting::Set(pooling), + api_key: Setting::NotSet, + dimensions: Setting::NotSet, + document_template, + document_template_max_bytes, + url: 
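// Illustrative sketch, not part of the actual change: the `FieldStatus` / `check_setting` pair
// above replaces the removed `check_set` / `check_unset` helpers with one three-valued rule:
// a `Mandatory` field must be set, a `Disallowed` field must stay unset, and an `Allowed` field
// may be either. This is a simplified stand-in that uses an `Option` instead of the crate's
// three-state `Setting` and a `bool` instead of the `UserError` carrying the embedder name and
// nesting context.
enum FieldStatus {
    Mandatory,
    Allowed,
    Disallowed,
}

fn setting_is_valid<T>(status: FieldStatus, value: Option<&T>) -> bool {
    match (status, value) {
        (FieldStatus::Mandatory, Some(_)) => true,
        (FieldStatus::Mandatory, None) => false,
        (FieldStatus::Allowed, _) => true,
        (FieldStatus::Disallowed, None) => true,
        (FieldStatus::Disallowed, Some(_)) => false,
    }
}

fn main() {
    // e.g. `request` is mandatory for a `rest` source but disallowed for `huggingFace`
    assert!(setting_is_valid(FieldStatus::Mandatory, Some(&"a request template")));
    assert!(!setting_is_valid(FieldStatus::Disallowed, Some(&"a request template")));
    assert!(setting_is_valid::<String>(FieldStatus::Allowed, None));
}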
Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, + headers: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, + distribution: Setting::some_or_not_set(distribution), + binary_quantized: Setting::some_or_not_set(quantized), + } + } + + fn from_openai( + super::openai::EmbedderOptions { + url, + api_key, + embedding_model, + dimensions, + distribution, + }: super::openai::EmbedderOptions, + document_template: Setting, + document_template_max_bytes: Setting, + quantized: Option, + ) -> Self { + Self { + source: Setting::Set(EmbedderSource::OpenAi), + model: Setting::Set(embedding_model.name().to_owned()), + revision: Setting::NotSet, + pooling: Setting::NotSet, + api_key: Setting::some_or_not_set(api_key), + dimensions: Setting::some_or_not_set(dimensions), + document_template, + document_template_max_bytes, + url: Setting::some_or_not_set(url), + request: Setting::NotSet, + response: Setting::NotSet, + headers: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, + distribution: Setting::some_or_not_set(distribution), + binary_quantized: Setting::some_or_not_set(quantized), + } + } + + fn from_ollama( + super::ollama::EmbedderOptions { + embedding_model, + url, + api_key, + distribution, + dimensions, + }: super::ollama::EmbedderOptions, + document_template: Setting, + document_template_max_bytes: Setting, + quantized: Option, + ) -> Self { + Self { + source: Setting::Set(EmbedderSource::Ollama), + model: Setting::Set(embedding_model), + revision: Setting::NotSet, + pooling: Setting::NotSet, + api_key: Setting::some_or_not_set(api_key), + dimensions: Setting::some_or_not_set(dimensions), + document_template, + document_template_max_bytes, + url: Setting::some_or_not_set(url), + request: Setting::NotSet, + response: Setting::NotSet, + headers: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, + distribution: Setting::some_or_not_set(distribution), + binary_quantized: Setting::some_or_not_set(quantized), + } + } + + fn from_user_provided( + super::manual::EmbedderOptions { dimensions, distribution }: super::manual::EmbedderOptions, + quantized: Option, + ) -> Self { + Self { + source: Setting::Set(EmbedderSource::UserProvided), + model: Setting::NotSet, + revision: Setting::NotSet, + pooling: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::Set(dimensions), + document_template: Setting::NotSet, + document_template_max_bytes: Setting::NotSet, + url: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, + headers: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, + distribution: Setting::some_or_not_set(distribution), + binary_quantized: Setting::some_or_not_set(quantized), + } + } + + fn from_rest( + super::rest::EmbedderOptions { + api_key, + dimensions, + url, + request, + response, + distribution, + headers, + }: super::rest::EmbedderOptions, + document_template: Setting, + document_template_max_bytes: Setting, + quantized: Option, + ) -> Self { + Self { + source: Setting::Set(EmbedderSource::Rest), + model: Setting::NotSet, + revision: Setting::NotSet, + pooling: Setting::NotSet, + api_key: Setting::some_or_not_set(api_key), + dimensions: Setting::some_or_not_set(dimensions), + document_template, + document_template_max_bytes, + url: Setting::Set(url), + request: Setting::Set(request), + response: Setting::Set(response), + distribution: 
Setting::some_or_not_set(distribution), + headers: Setting::Set(headers), + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, + binary_quantized: Setting::some_or_not_set(quantized), + } + } +} + impl From for EmbeddingSettings { fn from(value: EmbeddingConfig) -> Self { let EmbeddingConfig { embedder_options, prompt, quantized } = value; let document_template_max_bytes = Setting::Set(prompt.max_bytes.unwrap_or(default_max_bytes()).get()); match embedder_options { - super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions { - model, - revision, - distribution, - }) => Self { - source: Setting::Set(EmbedderSource::HuggingFace), - model: Setting::Set(model), - revision: Setting::some_or_not_set(revision), - api_key: Setting::NotSet, - dimensions: Setting::NotSet, - document_template: Setting::Set(prompt.template), + super::EmbedderOptions::HuggingFace(options) => Self::from_hugging_face( + options, + Setting::Set(prompt.template), document_template_max_bytes, - url: Setting::NotSet, - request: Setting::NotSet, - response: Setting::NotSet, - headers: Setting::NotSet, - distribution: Setting::some_or_not_set(distribution), - binary_quantized: Setting::some_or_not_set(quantized), - }, - super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions { - url, - api_key, - embedding_model, - dimensions, - distribution, - }) => Self { - source: Setting::Set(EmbedderSource::OpenAi), - model: Setting::Set(embedding_model.name().to_owned()), - revision: Setting::NotSet, - api_key: Setting::some_or_not_set(api_key), - dimensions: Setting::some_or_not_set(dimensions), - document_template: Setting::Set(prompt.template), + quantized, + ), + super::EmbedderOptions::OpenAi(options) => Self::from_openai( + options, + Setting::Set(prompt.template), document_template_max_bytes, - url: Setting::some_or_not_set(url), - request: Setting::NotSet, - response: Setting::NotSet, - headers: Setting::NotSet, - distribution: Setting::some_or_not_set(distribution), - binary_quantized: Setting::some_or_not_set(quantized), - }, - super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions { - embedding_model, - url, - api_key, - distribution, - dimensions, - }) => Self { - source: Setting::Set(EmbedderSource::Ollama), - model: Setting::Set(embedding_model), - revision: Setting::NotSet, - api_key: Setting::some_or_not_set(api_key), - dimensions: Setting::some_or_not_set(dimensions), - document_template: Setting::Set(prompt.template), + quantized, + ), + super::EmbedderOptions::Ollama(options) => Self::from_ollama( + options, + Setting::Set(prompt.template), document_template_max_bytes, - url: Setting::some_or_not_set(url), - request: Setting::NotSet, - response: Setting::NotSet, - headers: Setting::NotSet, - distribution: Setting::some_or_not_set(distribution), - binary_quantized: Setting::some_or_not_set(quantized), - }, - super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { - dimensions, - distribution, + quantized, + ), + super::EmbedderOptions::UserProvided(options) => { + Self::from_user_provided(options, quantized) + } + super::EmbedderOptions::Rest(options) => Self::from_rest( + options, + Setting::Set(prompt.template), + document_template_max_bytes, + quantized, + ), + super::EmbedderOptions::Composite(super::composite::EmbedderOptions { + search, + index, }) => Self { - source: Setting::Set(EmbedderSource::UserProvided), + source: Setting::Set(EmbedderSource::Composite), model: Setting::NotSet, revision: Setting::NotSet, + pooling: Setting::NotSet, api_key: 
Setting::NotSet, - dimensions: Setting::Set(dimensions), + dimensions: Setting::NotSet, + binary_quantized: Setting::some_or_not_set(quantized), document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, - distribution: Setting::some_or_not_set(distribution), - binary_quantized: Setting::some_or_not_set(quantized), + distribution: Setting::some_or_not_set(search.distribution()), + search_embedder: Setting::Set(SubEmbeddingSettings::from_options( + search, + Setting::NotSet, + Setting::NotSet, + )), + indexing_embedder: Setting::Set(SubEmbeddingSettings::from_options( + index, + Setting::Set(prompt.template), + document_template_max_bytes, + )), }, - super::EmbedderOptions::Rest(super::rest::EmbedderOptions { - api_key, - dimensions, - url, - request, - response, - distribution, - headers, - }) => Self { - source: Setting::Set(EmbedderSource::Rest), - model: Setting::NotSet, - revision: Setting::NotSet, - api_key: Setting::some_or_not_set(api_key), - dimensions: Setting::some_or_not_set(dimensions), - document_template: Setting::Set(prompt.template), + } + } +} + +impl SubEmbeddingSettings { + fn from_options( + options: SubEmbedderOptions, + document_template: Setting, + document_template_max_bytes: Setting, + ) -> Self { + let settings = match options { + SubEmbedderOptions::HuggingFace(embedder_options) => { + EmbeddingSettings::from_hugging_face( + embedder_options, + document_template, + document_template_max_bytes, + None, + ) + } + SubEmbedderOptions::OpenAi(embedder_options) => EmbeddingSettings::from_openai( + embedder_options, + document_template, document_template_max_bytes, - url: Setting::Set(url), - request: Setting::Set(request), - response: Setting::Set(response), - distribution: Setting::some_or_not_set(distribution), - headers: Setting::Set(headers), - binary_quantized: Setting::some_or_not_set(quantized), - }, + None, + ), + SubEmbedderOptions::Ollama(embedder_options) => EmbeddingSettings::from_ollama( + embedder_options, + document_template, + document_template_max_bytes, + None, + ), + SubEmbedderOptions::UserProvided(embedder_options) => { + EmbeddingSettings::from_user_provided(embedder_options, None) + } + SubEmbedderOptions::Rest(embedder_options) => EmbeddingSettings::from_rest( + embedder_options, + document_template, + document_template_max_bytes, + None, + ), + }; + settings.into() + } +} + +impl From for SubEmbeddingSettings { + fn from(value: EmbeddingSettings) -> Self { + let EmbeddingSettings { + source, + model, + revision, + pooling, + api_key, + dimensions, + document_template, + document_template_max_bytes, + url, + request, + response, + headers, + binary_quantized: _, + search_embedder: _, + indexing_embedder: _, + distribution: _, + } = value; + Self { + source, + model, + revision, + pooling, + api_key, + dimensions, + document_template, + document_template_max_bytes, + url, + request, + response, + headers, + distribution: Setting::NotSet, + binary_quantized: Setting::NotSet, + search_embedder: Setting::NotSet, + indexing_embedder: Setting::NotSet, } } } @@ -683,6 +1810,7 @@ impl From for EmbeddingConfig { source, model, revision, + pooling, api_key, dimensions, document_template, @@ -693,85 +1821,26 @@ impl From for EmbeddingConfig { distribution, headers, binary_quantized, + search_embedder, + mut indexing_embedder, } = value; this.quantized = binary_quantized.set(); - - if let Some(source) = source.set() { - match source { 
- EmbedderSource::OpenAi => { - let mut options = super::openai::EmbedderOptions::with_default_model(None); - if let Some(model) = model.set() { - if let Some(model) = super::openai::EmbeddingModel::from_name(&model) { - options.embedding_model = model; - } - } - if let Some(url) = url.set() { - options.url = Some(url); - } - if let Some(api_key) = api_key.set() { - options.api_key = Some(api_key); - } - if let Some(dimensions) = dimensions.set() { - options.dimensions = Some(dimensions); - } - options.distribution = distribution.set(); - this.embedder_options = super::EmbedderOptions::OpenAi(options); - } - EmbedderSource::Ollama => { - let mut options: ollama::EmbedderOptions = - super::ollama::EmbedderOptions::with_default_model( - api_key.set(), - url.set(), - dimensions.set(), - ); - if let Some(model) = model.set() { - options.embedding_model = model; - } - - options.distribution = distribution.set(); - this.embedder_options = super::EmbedderOptions::Ollama(options); - } - EmbedderSource::HuggingFace => { - let mut options = super::hf::EmbedderOptions::default(); - if let Some(model) = model.set() { - options.model = model; - // Reset the revision if we are setting the model. - // This allows the following: - // "huggingFace": {} -> default model with default revision - // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision - // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision - options.revision = None; - } - if let Some(revision) = revision.set() { - options.revision = Some(revision); - } - options.distribution = distribution.set(); - this.embedder_options = super::EmbedderOptions::HuggingFace(options); - } - EmbedderSource::UserProvided => { - this.embedder_options = - super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { - dimensions: dimensions.set().unwrap(), - distribution: distribution.set(), - }); - } - EmbedderSource::Rest => { - this.embedder_options = - super::EmbedderOptions::Rest(super::rest::EmbedderOptions { - api_key: api_key.set(), - dimensions: dimensions.set(), - url: url.set().unwrap(), - request: request.set().unwrap(), - response: response.set().unwrap(), - distribution: distribution.set(), - headers: headers.set().unwrap_or_default(), - }) - } + if let Some((template, document_template_max_bytes)) = + match (document_template, &mut indexing_embedder) { + (Setting::Set(template), _) => Some((template, document_template_max_bytes)), + // retrieve the prompt from the indexing embedder in case of a composite embedder + ( + _, + Setting::Set(SubEmbeddingSettings { + document_template: Setting::Set(document_template), + document_template_max_bytes, + .. 
+ }), + ) => Some((std::mem::take(document_template), *document_template_max_bytes)), + _ => None, } - } - - if let Setting::Set(template) = document_template { + { let max_bytes = document_template_max_bytes .set() .and_then(NonZeroUsize::new) @@ -780,6 +1849,208 @@ impl From for EmbeddingConfig { this.prompt = PromptData { template, max_bytes: Some(max_bytes) } } + if let Some(source) = source.set() { + this.embedder_options = match source { + EmbedderSource::OpenAi => { + SubEmbedderOptions::openai(model, url, api_key, dimensions, distribution).into() + } + EmbedderSource::Ollama => { + SubEmbedderOptions::ollama(model, url, api_key, dimensions, distribution).into() + } + EmbedderSource::HuggingFace => { + SubEmbedderOptions::hugging_face(model, revision, pooling, distribution).into() + } + EmbedderSource::UserProvided => { + SubEmbedderOptions::user_provided(dimensions.set().unwrap(), distribution) + .into() + } + EmbedderSource::Rest => SubEmbedderOptions::rest( + url.set().unwrap(), + api_key, + request.set().unwrap(), + response.set().unwrap(), + headers, + dimensions, + distribution, + ) + .into(), + EmbedderSource::Composite => { + super::EmbedderOptions::Composite(super::composite::EmbedderOptions { + // it is important to give the distribution to the search here, as this is from where we'll retrieve it + search: SubEmbedderOptions::from_settings( + search_embedder.set().unwrap(), + distribution, + ), + index: SubEmbedderOptions::from_settings( + indexing_embedder.set().unwrap(), + Setting::NotSet, + ), + }) + } + }; + } + this } } + +impl SubEmbedderOptions { + fn from_settings( + settings: SubEmbeddingSettings, + distribution: Setting, + ) -> Self { + let SubEmbeddingSettings { + source, + model, + revision, + pooling, + api_key, + dimensions, + // retrieved by the EmbeddingConfig + document_template: _, + document_template_max_bytes: _, + url, + request, + response, + headers, + // phony parameters + distribution: _, + binary_quantized: _, + search_embedder: _, + indexing_embedder: _, + } = settings; + + match source.set().unwrap() { + EmbedderSource::OpenAi => Self::openai(model, url, api_key, dimensions, distribution), + EmbedderSource::HuggingFace => { + Self::hugging_face(model, revision, pooling, distribution) + } + EmbedderSource::Ollama => Self::ollama(model, url, api_key, dimensions, distribution), + EmbedderSource::UserProvided => { + Self::user_provided(dimensions.set().unwrap(), distribution) + } + EmbedderSource::Rest => Self::rest( + url.set().unwrap(), + api_key, + request.set().unwrap(), + response.set().unwrap(), + headers, + dimensions, + distribution, + ), + EmbedderSource::Composite => panic!("nested composite embedders"), + } + } + + fn openai( + model: Setting, + url: Setting, + api_key: Setting, + dimensions: Setting, + distribution: Setting, + ) -> Self { + let mut options = super::openai::EmbedderOptions::with_default_model(None); + if let Some(model) = model.set() { + if let Some(model) = super::openai::EmbeddingModel::from_name(&model) { + options.embedding_model = model; + } + } + if let Some(url) = url.set() { + options.url = Some(url); + } + if let Some(api_key) = api_key.set() { + options.api_key = Some(api_key); + } + if let Some(dimensions) = dimensions.set() { + options.dimensions = Some(dimensions); + } + options.distribution = distribution.set(); + SubEmbedderOptions::OpenAi(options) + } + fn hugging_face( + model: Setting, + revision: Setting, + pooling: Setting, + distribution: Setting, + ) -> Self { + let mut options = 
super::hf::EmbedderOptions::default(); + if let Some(model) = model.set() { + options.model = model; + // Reset the revision if we are setting the model. + // This allows the following: + // "huggingFace": {} -> default model with default revision + // "huggingFace": { "model": "name-of-the-default-model" } -> default model without a revision + // "huggingFace": { "model": "some-other-model" } -> most importantly, other model without a revision + options.revision = None; + } + if let Some(revision) = revision.set() { + options.revision = Some(revision); + } + if let Some(pooling) = pooling.set() { + options.pooling = pooling; + } + options.distribution = distribution.set(); + SubEmbedderOptions::HuggingFace(options) + } + fn user_provided(dimensions: usize, distribution: Setting) -> Self { + Self::UserProvided(super::manual::EmbedderOptions { + dimensions, + distribution: distribution.set(), + }) + } + fn rest( + url: String, + api_key: Setting, + request: serde_json::Value, + response: serde_json::Value, + headers: Setting>, + dimensions: Setting, + distribution: Setting, + ) -> Self { + Self::Rest(super::rest::EmbedderOptions { + api_key: api_key.set(), + dimensions: dimensions.set(), + url, + request, + response, + distribution: distribution.set(), + headers: headers.set().unwrap_or_default(), + }) + } + fn ollama( + model: Setting, + url: Setting, + api_key: Setting, + dimensions: Setting, + distribution: Setting, + ) -> Self { + let mut options: ollama::EmbedderOptions = + super::ollama::EmbedderOptions::with_default_model( + api_key.set(), + url.set(), + dimensions.set(), + ); + if let Some(model) = model.set() { + options.embedding_model = model; + } + + options.distribution = distribution.set(); + SubEmbedderOptions::Ollama(options) + } +} + +impl From for EmbedderOptions { + fn from(value: SubEmbedderOptions) -> Self { + match value { + SubEmbedderOptions::HuggingFace(embedder_options) => { + Self::HuggingFace(embedder_options) + } + SubEmbedderOptions::OpenAi(embedder_options) => Self::OpenAi(embedder_options), + SubEmbedderOptions::Ollama(embedder_options) => Self::Ollama(embedder_options), + SubEmbedderOptions::UserProvided(embedder_options) => { + Self::UserProvided(embedder_options) + } + SubEmbedderOptions::Rest(embedder_options) => Self::Rest(embedder_options), + } + } +} diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 61d0697ff..8934cbea4 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -1,29 +1,30 @@ use big_s::S; use bumpalo::Bump; use heed::EnvOpenOptions; -use maplit::hashset; use milli::documents::mmap_from_objects; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{FacetDistribution, Index, Object, OrderBy}; +use milli::{FacetDistribution, FilterableAttributesRule, Index, Object, OrderBy}; use serde_json::{from_value, json}; #[test] fn test_facet_distribution_with_no_facet_values() { let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); let 
config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_filterable_fields(hashset! { - S("genres"), - S("tags"), - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("genres")), + FilterableAttributesRule::Field(S("tags")), + ]); builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); @@ -35,7 +36,7 @@ fn test_facet_distribution_with_no_facet_values() { let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let doc1: Object = from_value( json!({ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] }), @@ -46,7 +47,7 @@ fn test_facet_distribution_with_no_facet_values() { let documents = mmap_from_objects(vec![doc1, doc2]); // index documents - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -57,13 +58,14 @@ fn test_facet_distribution_with_no_facet_values() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -71,7 +73,7 @@ fn test_facet_distribution_with_no_facet_values() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 1287b59d5..906956716 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -7,10 +7,13 @@ use bumpalo::Bump; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{btreemap, hashset}; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; -use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy}; +use milli::{ + AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, +}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -31,9 +34,10 @@ pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); @@ -41,14 +45,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.to_vec()); - builder.set_filterable_fields(hashset! 
{ - S("tag"), - S("asc_desc_rank"), - S("_geo"), - S("opt1"), - S("opt1.opt2"), - S("tag_in") - }); + builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("tag")), + FilterableAttributesRule::Field(S("asc_desc_rank")), + FilterableAttributesRule::Field(S("_geo")), + FilterableAttributesRule::Field(S("opt1")), + FilterableAttributesRule::Field(S("opt1.opt2")), + FilterableAttributesRule::Field(S("tag_in")), + ]); builder.set_sortable_fields(hashset! { S("tag"), S("asc_desc_rank"), @@ -71,7 +75,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); file.write_all(CONTENT.as_bytes()).unwrap(); @@ -79,7 +83,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let payload = unsafe { memmap2::Mmap::map(&file).unwrap() }; // index documents - indexer.add_documents(&payload).unwrap(); + indexer.replace_documents(&payload).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, operation_stats, primary_key) = indexer @@ -90,7 +94,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); @@ -101,6 +105,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -108,7 +113,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); @@ -237,11 +242,11 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); } } else if matches!(filter, "opt1 IS NULL" | "NOT opt1 IS NOT NULL") { - id = document.opt1.as_ref().map_or(false, |v| v.is_null()).then(|| document.id.clone()); + id = document.opt1.as_ref().is_some_and(|v| v.is_null()).then(|| document.id.clone()); } else if matches!(filter, "NOT opt1 IS NULL" | "opt1 IS NOT NULL") { - id = document.opt1.as_ref().map_or(true, |v| !v.is_null()).then(|| document.id.clone()); + id = document.opt1.as_ref().is_none_or(|v| !v.is_null()).then(|| document.id.clone()); } else if matches!(filter, "opt1.opt2 IS NULL") { - if document.opt1opt2.as_ref().map_or(false, |v| v.is_null()) { + if document.opt1opt2.as_ref().is_some_and(|v| v.is_null()) { id = Some(document.id.clone()); } else if let Some(opt1) = &document.opt1 { if !opt1.is_null() { @@ -249,15 +254,11 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } } } else if matches!(filter, "opt1 IS EMPTY" | "NOT opt1 IS NOT EMPTY") { - id = document.opt1.as_ref().map_or(false, is_empty_value).then(|| document.id.clone()); + id = document.opt1.as_ref().is_some_and(is_empty_value).then(|| document.id.clone()); } else if matches!(filter, "NOT opt1 IS EMPTY" | "opt1 IS NOT EMPTY") { - id = document - .opt1 - .as_ref() - .map_or(true, |v| !is_empty_value(v)) - .then(|| document.id.clone()); + id = document.opt1.as_ref().is_none_or(|v| !is_empty_value(v)).then(|| document.id.clone()); } else if matches!(filter, "opt1.opt2 IS EMPTY") { - if 
document.opt1opt2.as_ref().map_or(false, is_empty_value) { + if document.opt1opt2.as_ref().is_some_and(is_empty_value) { id = Some(document.id.clone()); } } else if matches!( diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 3e56eeff0..1acc89484 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -5,8 +5,9 @@ use bumpalo::Bump; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; use rand::Rng; @@ -261,9 +262,10 @@ fn criteria_mixup() { #[test] fn criteria_ascdesc() { let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(12 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); @@ -287,7 +289,7 @@ fn criteria_ascdesc() { let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { @@ -317,7 +319,7 @@ fn criteria_ascdesc() { file.sync_all().unwrap(); let payload = unsafe { memmap2::Mmap::map(&file).unwrap() }; - indexer.add_documents(&payload).unwrap(); + indexer.replace_documents(&payload).unwrap(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, @@ -326,13 +328,14 @@ fn criteria_ascdesc() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -340,7 +343,7 @@ fn criteria_ascdesc() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 7ac9a1e4b..3c0717063 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -3,8 +3,9 @@ use std::collections::BTreeSet; use bumpalo::Bump; use heed::EnvOpenOptions; use milli::documents::mmap_from_objects; +use milli::progress::Progress; use milli::update::new::indexer; -use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; +use milli::update::{IndexerConfig, Settings}; use milli::vector::EmbeddingConfigs; use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy}; use serde_json::from_value; @@ -107,9 +108,10 @@ fn test_typo_tolerance_two_typo() { #[test] fn test_typo_disabled_on_word() { let tmp = tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); + let options = EnvOpenOptions::new(); + let mut options = options.read_txn_without_tls(); options.map_size(4096 * 100); - let index = Index::new(options, tmp.path()).unwrap(); + let index = Index::new(options, 
tmp.path(), true).unwrap(); let doc1: Object = from_value(json!({ "id": 1usize, "data": "zealand" })).unwrap(); let doc2: Object = from_value(json!({ "id": 2usize, "data": "zearand" })).unwrap(); @@ -122,9 +124,9 @@ fn test_typo_disabled_on_word() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); let embedders = EmbeddingConfigs::default(); - let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); + let mut indexer = indexer::DocumentOperation::new(); - indexer.add_documents(&documents).unwrap(); + indexer.replace_documents(&documents).unwrap(); let indexer_alloc = Bump::new(); let (document_changes, _operation_stats, primary_key) = indexer @@ -135,13 +137,14 @@ fn test_typo_disabled_on_word() { None, &mut new_fields_ids_map, &|| false, - &|_progress| (), + Progress::default(), ) .unwrap(); indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -149,7 +152,7 @@ fn test_typo_disabled_on_word() { &document_changes, embedders, &|| false, - &|_| (), + &Progress::default(), ) .unwrap(); diff --git a/crates/tracing-trace/Cargo.toml b/crates/tracing-trace/Cargo.toml index 9afada3c2..a8a2d5d8b 100644 --- a/crates/tracing-trace/Cargo.toml +++ b/crates/tracing-trace/Cargo.toml @@ -8,17 +8,17 @@ edition = "2021" [dependencies] color-spantrace = "0.2.1" fxprof-processed-profile = "0.7.0" -serde = { version = "1.0.204", features = ["derive"] } -serde_json = "1.0.120" -tracing = "0.1.40" -tracing-error = "0.2.0" -tracing-subscriber = "0.3.18" -byte-unit = { version = "5.1.4", default-features = false, features = [ +serde = { version = "1.0.217", features = ["derive"] } +serde_json = "1.0.135" +tracing = "0.1.41" +tracing-error = "0.2.1" +tracing-subscriber = "0.3.19" +byte-unit = { version = "5.1.6", default-features = false, features = [ "std", "byte", "serde", ] } -tokio = { version = "1.38.0", features = ["sync"] } +tokio = { version = "1.42.0", features = ["sync"] } [target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies] -libproc = "0.14.8" +libproc = "0.14.10" diff --git a/crates/tracing-trace/src/main.rs b/crates/tracing-trace/src/main.rs index c2e4f08a7..4a3d26923 100644 --- a/crates/tracing-trace/src/main.rs +++ b/crates/tracing-trace/src/main.rs @@ -66,7 +66,7 @@ use tracing_error::ExtractSpanTrace as _; use tracing_subscriber::layer::SubscriberExt as _; use tracing_trace::processor; -fn on_panic(info: &std::panic::PanicInfo) { +fn on_panic(info: &std::panic::PanicHookInfo) { let info = info.to_string(); let trace = SpanTrace::capture(); tracing::error!(%info, %trace); diff --git a/crates/tracing-trace/src/processor/firefox_profiler.rs b/crates/tracing-trace/src/processor/firefox_profiler.rs index 9cb9540bb..e1000e04b 100644 --- a/crates/tracing-trace/src/processor/firefox_profiler.rs +++ b/crates/tracing-trace/src/processor/firefox_profiler.rs @@ -282,7 +282,7 @@ struct SpanMarker<'a> { memory_delta: Option, } -impl<'a> ProfilerMarker for SpanMarker<'a> { +impl ProfilerMarker for SpanMarker<'_> { const MARKER_TYPE_NAME: &'static str = "span"; fn schema() -> MarkerSchema { @@ -369,7 +369,7 @@ struct EventMarker<'a> { memory_delta: Option, } -impl<'a> ProfilerMarker for EventMarker<'a> { +impl ProfilerMarker for EventMarker<'_> { const MARKER_TYPE_NAME: &'static str = "tracing-event"; fn schema() -> MarkerSchema { diff --git a/crates/xtask/Cargo.toml 
b/crates/xtask/Cargo.toml index 79bed3d2e..496e1d362 100644 --- a/crates/xtask/Cargo.toml +++ b/crates/xtask/Cargo.toml @@ -11,34 +11,34 @@ license.workspace = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -anyhow = "1.0.86" +anyhow = "1.0.95" build-info = { version = "1.7.0", path = "../build-info" } -cargo_metadata = "0.18.1" -clap = { version = "4.5.9", features = ["derive"] } -futures-core = "0.3.30" -futures-util = "0.3.30" -reqwest = { version = "0.12.5", features = [ +cargo_metadata = "0.19.1" +clap = { version = "4.5.24", features = ["derive"] } +futures-core = "0.3.31" +futures-util = "0.3.31" +reqwest = { version = "0.12.12", features = [ "stream", "json", "rustls-tls", ], default-features = false } -serde = { version = "1.0.204", features = ["derive"] } -serde_json = "1.0.120" +serde = { version = "1.0.217", features = ["derive"] } +serde_json = "1.0.135" sha2 = "0.10.8" -sysinfo = "0.30.13" -time = { version = "0.3.36", features = [ +sysinfo = "0.33.1" +time = { version = "0.3.37", features = [ "serde", "serde-human-readable", "macros", ] } -tokio = { version = "1.38.0", features = [ +tokio = { version = "1.42.0", features = [ "rt", "net", "time", "process", "signal", ] } -tracing = "0.1.40" -tracing-subscriber = "0.3.18" +tracing = "0.1.41" +tracing-subscriber = "0.3.19" tracing-trace = { version = "0.1.0", path = "../tracing-trace" } -uuid = { version = "1.10.0", features = ["v7", "serde"] } +uuid = { version = "1.11.0", features = ["v7", "serde"] } diff --git a/crates/xtask/src/bench/env_info.rs b/crates/xtask/src/bench/env_info.rs index 08dacf915..877e7c2ff 100644 --- a/crates/xtask/src/bench/env_info.rs +++ b/crates/xtask/src/bench/env_info.rs @@ -27,8 +27,7 @@ impl Environment { let unknown_string = String::from("Unknown"); let mut system = System::new(); - system.refresh_cpu(); - system.refresh_cpu_frequency(); + system.refresh_cpu_all(); system.refresh_memory(); let (cpu, frequency) = match system.cpus().first() { @@ -50,9 +49,7 @@ impl Environment { if let Some(os) = System::os_version() { software.push(VersionInfo { name: os, version: String::from("kernel-release") }); } - if let Some(arch) = System::cpu_arch() { - software.push(VersionInfo { name: arch, version: String::from("arch") }); - } + software.push(VersionInfo { name: System::cpu_arch(), version: String::from("arch") }); Self { hostname: System::host_name(), diff --git a/crates/xtask/src/bench/meili_process.rs b/crates/xtask/src/bench/meili_process.rs index 99f6f4ea6..2aff679fc 100644 --- a/crates/xtask/src/bench/meili_process.rs +++ b/crates/xtask/src/bench/meili_process.rs @@ -1,23 +1,56 @@ use std::collections::BTreeMap; +use std::time::Duration; use anyhow::{bail, Context as _}; +use tokio::process::Command; +use tokio::time; use super::assets::Asset; use super::client::Client; use super::workload::Workload; pub async fn kill(mut meilisearch: tokio::process::Child) { - if let Err(error) = meilisearch.kill().await { - tracing::warn!( - error = &error as &dyn std::error::Error, - "while terminating Meilisearch server" - ) + let Some(id) = meilisearch.id() else { return }; + + match Command::new("kill").args(["--signal=TERM", &id.to_string()]).spawn() { + Ok(mut cmd) => { + let Err(error) = cmd.wait().await else { return }; + tracing::warn!( + error = &error as &dyn std::error::Error, + "while awaiting the Meilisearch server kill" + ); + } + Err(error) => { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating 
Meilisearch server with a kill -s TERM" + ); + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + return; + } + }; + + match time::timeout(Duration::from_secs(5), meilisearch.wait()).await { + Ok(_) => (), + Err(_) => { + if let Err(error) = meilisearch.kill().await { + tracing::warn!( + error = &error as &dyn std::error::Error, + "while terminating Meilisearch server" + ) + } + } } } #[tracing::instrument] pub async fn build() -> anyhow::Result<()> { - let mut command = tokio::process::Command::new("cargo"); + let mut command = Command::new("cargo"); command.arg("build").arg("--release").arg("-p").arg("meilisearch"); command.kill_on_drop(true); @@ -37,17 +70,8 @@ pub async fn start( master_key: Option<&str>, workload: &Workload, asset_folder: &str, + mut command: Command, ) -> anyhow::Result { - let mut command = tokio::process::Command::new("cargo"); - command - .arg("run") - .arg("--release") - .arg("-p") - .arg("meilisearch") - .arg("--bin") - .arg("meilisearch") - .arg("--"); - command.arg("--db-path").arg("./_xtask_benchmark.ms"); if let Some(master_key) = master_key { command.arg("--master-key").arg(master_key); @@ -86,7 +110,7 @@ async fn wait_for_health( return Ok(()); } - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + time::sleep(Duration::from_millis(500)).await; // check whether the Meilisearch instance exited early (cut the wait) if let Some(exit_code) = meilisearch.try_wait().context("cannot check Meilisearch server process status")? diff --git a/crates/xtask/src/bench/mod.rs b/crates/xtask/src/bench/mod.rs index fdb2c4963..1416c21d9 100644 --- a/crates/xtask/src/bench/mod.rs +++ b/crates/xtask/src/bench/mod.rs @@ -82,6 +82,16 @@ pub struct BenchDeriveArgs { /// Reason for the benchmark invocation #[arg(short, long)] reason: Option, + + /// The maximum time in seconds we allow for fetching the task queue before timing out. + #[arg(long, default_value_t = 60)] + tasks_queue_timeout_secs: u64, + + /// The path to the binary to run. + /// + /// If unspecified, runs `cargo run` after building Meilisearch with `cargo build`. 
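The reworked `kill` in `crates/xtask/src/bench/meili_process.rs` above now asks the server to exit with SIGTERM, waits a bounded grace period, and only then falls back to a hard kill. Below is a minimal sketch of that shutdown pattern, assuming a tokio runtime; the `shutdown` helper name is illustrative, not from the patch, and like the patch it shells out to the `kill` command instead of sending the signal directly.

use std::time::Duration;

use tokio::process::{Child, Command};
use tokio::time;

/// Hypothetical helper mirroring the graceful-termination pattern used by the bench runner.
async fn shutdown(mut child: Child) {
    // Ask the process to terminate via `kill --signal=TERM <pid>`.
    if let Some(pid) = child.id() {
        if let Ok(mut term) = Command::new("kill").args(["--signal=TERM", &pid.to_string()]).spawn() {
            let _ = term.wait().await;
        }
    }

    // Give it a few seconds to exit cleanly, then force-kill it.
    if time::timeout(Duration::from_secs(5), child.wait()).await.is_err() {
        let _ = child.kill().await;
    }
}

The five-second grace period matches the `time::timeout(Duration::from_secs(5), meilisearch.wait())` call in the patch; only when it elapses does the runner fall back to the hard `kill()`.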
+ #[arg(long)] + binary_path: Option, } pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { @@ -127,7 +137,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { let meili_client = Client::new( Some("http://127.0.0.1:7700".into()), args.master_key.as_deref(), - Some(std::time::Duration::from_secs(60)), + Some(std::time::Duration::from_secs(args.tasks_queue_timeout_secs)), )?; // enter runtime @@ -135,7 +145,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { rt.block_on(async { dashboard_client.send_machine_info(&env).await?; - let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap(); + let commit_message = build_info.commit_msg.unwrap_or_default().split('\n').next().unwrap(); let max_workloads = args.workload_file.len(); let reason: Option<&str> = args.reason.as_deref(); let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?; @@ -166,6 +176,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> { args.master_key.as_deref(), workload, &args, + args.binary_path.as_deref(), ) .await?; diff --git a/crates/xtask/src/bench/workload.rs b/crates/xtask/src/bench/workload.rs index 19c8bfae8..39119428f 100644 --- a/crates/xtask/src/bench/workload.rs +++ b/crates/xtask/src/bench/workload.rs @@ -1,6 +1,7 @@ use std::collections::BTreeMap; use std::fs::File; use std::io::{Seek as _, Write as _}; +use std::path::Path; use anyhow::{bail, Context as _}; use futures_util::TryStreamExt as _; @@ -85,13 +86,13 @@ pub async fn execute( master_key: Option<&str>, workload: Workload, args: &BenchDeriveArgs, + binary_path: Option<&Path>, ) -> anyhow::Result<()> { assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?; let workload_uuid = dashboard_client.create_workload(invocation_uuid, &workload).await?; let mut tasks = Vec::new(); - for i in 0..workload.run_count { tasks.push( execute_run( @@ -102,6 +103,7 @@ pub async fn execute( master_key, &workload, args, + binary_path, i, ) .await?, @@ -109,7 +111,6 @@ pub async fn execute( } let mut reports = Vec::with_capacity(workload.run_count as usize); - for task in tasks { reports.push( task.await @@ -133,13 +134,31 @@ async fn execute_run( master_key: Option<&str>, workload: &Workload, args: &BenchDeriveArgs, + binary_path: Option<&Path>, run_number: u16, ) -> anyhow::Result>> { meili_process::delete_db(); - meili_process::build().await?; + let run_command = match binary_path { + Some(binary_path) => tokio::process::Command::new(binary_path), + None => { + meili_process::build().await?; + let mut command = tokio::process::Command::new("cargo"); + command + .arg("run") + .arg("--release") + .arg("-p") + .arg("meilisearch") + .arg("--bin") + .arg("meilisearch") + .arg("--"); + command + } + }; + let meilisearch = - meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?; + meili_process::start(meili_client, master_key, workload, &args.asset_folder, run_command) + .await?; let processor = run_commands( dashboard_client, diff --git a/crates/xtask/src/main.rs b/crates/xtask/src/main.rs index b81424666..f260bd404 100644 --- a/crates/xtask/src/main.rs +++ b/crates/xtask/src/main.rs @@ -6,8 +6,8 @@ use xtask::bench::BenchDeriveArgs; /// List features available in the workspace #[derive(Parser, Debug)] struct ListFeaturesDeriveArgs { - /// Feature to exclude from the list. 
Repeat the argument to exclude multiple features - #[arg(short, long)] + /// Feature to exclude from the list. Use a comma to separate multiple features. + #[arg(short, long, value_delimiter = ',')] exclude_feature: Vec, } @@ -16,6 +16,7 @@ struct ListFeaturesDeriveArgs { #[command(author, version, about, long_about)] #[command(name = "cargo xtask")] #[command(bin_name = "cargo xtask")] +#[allow(clippy::large_enum_variant)] // please, that's enough... enum Command { ListFeatures(ListFeaturesDeriveArgs), Bench(BenchDeriveArgs), diff --git a/download-latest.sh b/download-latest.sh index c533d6616..b74722586 100644 --- a/download-latest.sh +++ b/download-latest.sh @@ -33,10 +33,12 @@ get_latest() { exit 1 fi - if [ -z "$GITHUB_PAT" ]; then - curl -s "$latest_release" > "$temp_file" || return 1 - else + if [ -n "$GITHUB_TOKEN" ]; then + curl -H "Authorization: Bearer $GITHUB_TOKEN" -s "$latest_release" > "$temp_file" || return 1 + elif [ -n "$GITHUB_PAT" ]; then curl -H "Authorization: token $GITHUB_PAT" -s "$latest_release" > "$temp_file" || return 1 + else + curl -s "$latest_release" > "$temp_file" || return 1 fi latest="$(cat "$temp_file" | grep '"tag_name":' | cut -d ':' -f2 | tr -d '"' | tr -d ',' | tr -d ' ')" diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs deleted file mode 100644 index caccb404b..000000000 --- a/milli/src/vector/mod.rs +++ /dev/null @@ -1,441 +0,0 @@ -use std::collections::HashMap; -use std::sync::Arc; - -use deserr::{DeserializeError, Deserr}; -use ordered_float::OrderedFloat; -use serde::{Deserialize, Serialize}; - -use self::error::{EmbedError, NewEmbedderError}; -use crate::prompt::{Prompt, PromptData}; -use crate::ThreadPoolNoAbort; - -pub mod error; -pub mod hf; -pub mod json_template; -pub mod manual; -pub mod openai; -pub mod parsed_vectors; -pub mod settings; - -pub mod ollama; -pub mod rest; - -pub use self::error::Error; - -pub type Embedding = Vec; - -pub const REQUEST_PARALLELISM: usize = 40; - -/// One or multiple embeddings stored consecutively in a flat vector. -pub struct Embeddings { - data: Vec, - dimension: usize, -} - -impl Embeddings { - /// Declares an empty vector of embeddings of the specified dimensions. - pub fn new(dimension: usize) -> Self { - Self { data: Default::default(), dimension } - } - - /// Declares a vector of embeddings containing a single element. - /// - /// The dimension is inferred from the length of the passed embedding. - pub fn from_single_embedding(embedding: Vec) -> Self { - Self { dimension: embedding.len(), data: embedding } - } - - /// Declares a vector of embeddings from its components. - /// - /// `data.len()` must be a multiple of `dimension`, otherwise an error is returned. - pub fn from_inner(data: Vec, dimension: usize) -> Result> { - let mut this = Self::new(dimension); - this.append(data)?; - Ok(this) - } - - /// Returns the number of embeddings in this vector of embeddings. - pub fn embedding_count(&self) -> usize { - self.data.len() / self.dimension - } - - /// Dimension of a single embedding. - pub fn dimension(&self) -> usize { - self.dimension - } - - /// Deconstructs self into the inner flat vector. - pub fn into_inner(self) -> Vec { - self.data - } - - /// A reference to the inner flat vector. - pub fn as_inner(&self) -> &[F] { - &self.data - } - - /// Iterates over the embeddings contained in the flat vector. - pub fn iter(&self) -> impl Iterator + '_ { - self.data.as_slice().chunks_exact(self.dimension) - } - - /// Push an embedding at the end of the embeddings. 
- /// - /// If `embedding.len() != self.dimension`, then the push operation fails. - pub fn push(&mut self, mut embedding: Vec) -> Result<(), Vec> { - if embedding.len() != self.dimension { - return Err(embedding); - } - self.data.append(&mut embedding); - Ok(()) - } - - /// Append a flat vector of embeddings a the end of the embeddings. - /// - /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. - pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { - if embeddings.len() % self.dimension != 0 { - return Err(embeddings); - } - self.data.append(&mut embeddings); - Ok(()) - } -} - -/// An embedder can be used to transform text into embeddings. -#[derive(Debug)] -pub enum Embedder { - /// An embedder based on running local models, fetched from the Hugging Face Hub. - HuggingFace(hf::Embedder), - /// An embedder based on making embedding queries against the OpenAI API. - OpenAi(openai::Embedder), - /// An embedder based on the user providing the embeddings in the documents and queries. - UserProvided(manual::Embedder), - /// An embedder based on making embedding queries against an embedding server. - Ollama(ollama::Embedder), - /// An embedder based on making embedding queries against a generic JSON/REST embedding server. - Rest(rest::Embedder), -} - -/// Configuration for an embedder. -#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingConfig { - /// Options of the embedder, specific to each kind of embedder - pub embedder_options: EmbedderOptions, - /// Document template - pub prompt: PromptData, - // TODO: add metrics and anything needed -} - -/// Map of embedder configurations. -/// -/// Each configuration is mapped to a name. -#[derive(Clone, Default)] -pub struct EmbeddingConfigs(HashMap, Arc)>); - -impl EmbeddingConfigs { - /// Create the map from its internal component.s - pub fn new(data: HashMap, Arc)>) -> Self { - Self(data) - } - - /// Get an embedder configuration and template from its name. - pub fn get(&self, name: &str) -> Option<(Arc, Arc)> { - self.0.get(name).cloned() - } - - /// Get the default embedder configuration, if any. - pub fn get_default(&self) -> Option<(Arc, Arc)> { - self.get(self.get_default_embedder_name()) - } - - pub fn inner_as_ref(&self) -> &HashMap, Arc)> { - &self.0 - } - - pub fn into_inner(self) -> HashMap, Arc)> { - self.0 - } - - /// Get the name of the default embedder configuration. - /// - /// The default embedder is determined as follows: - /// - /// - If there is only one embedder, it is always the default. - /// - If there are multiple embedders and one of them is called `default`, then that one is the default embedder. - /// - In all other cases, there is no default embedder. - pub fn get_default_embedder_name(&self) -> &str { - let mut it = self.0.keys(); - let first_name = it.next(); - let second_name = it.next(); - match (first_name, second_name) { - (None, _) => "default", - (Some(first), None) => first, - (Some(_), Some(_)) => "default", - } - } -} - -impl IntoIterator for EmbeddingConfigs { - type Item = (String, (Arc, Arc)); - - type IntoIter = std::collections::hash_map::IntoIter, Arc)>; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -/// Options of an embedder, specific to each kind of embedder. 
-#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -pub enum EmbedderOptions { - HuggingFace(hf::EmbedderOptions), - OpenAi(openai::EmbedderOptions), - Ollama(ollama::EmbedderOptions), - UserProvided(manual::EmbedderOptions), - Rest(rest::EmbedderOptions), -} - -impl Default for EmbedderOptions { - fn default() -> Self { - Self::HuggingFace(Default::default()) - } -} - -impl Embedder { - /// Spawns a new embedder built from its options. - pub fn new(options: EmbedderOptions) -> std::result::Result { - Ok(match options { - EmbedderOptions::HuggingFace(options) => Self::HuggingFace(hf::Embedder::new(options)?), - EmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?), - EmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?), - EmbedderOptions::UserProvided(options) => { - Self::UserProvided(manual::Embedder::new(options)) - } - EmbedderOptions::Rest(options) => { - Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?) - } - }) - } - - /// Embed one or multiple texts. - /// - /// Each text can be embedded as one or multiple embeddings. - pub fn embed( - &self, - texts: Vec, - ) -> std::result::Result>, EmbedError> { - match self { - Embedder::HuggingFace(embedder) => embedder.embed(texts), - Embedder::OpenAi(embedder) => embedder.embed(texts), - Embedder::Ollama(embedder) => embedder.embed(texts), - Embedder::UserProvided(embedder) => embedder.embed(texts), - Embedder::Rest(embedder) => embedder.embed(texts), - } - } - - pub fn embed_one(&self, text: String) -> std::result::Result { - let mut embeddings = self.embed(vec![text])?; - let embeddings = embeddings.pop().ok_or_else(EmbedError::missing_embedding)?; - Ok(if embeddings.iter().nth(1).is_some() { - tracing::warn!("Ignoring embeddings past the first one in long search query"); - embeddings.iter().next().unwrap().to_vec() - } else { - embeddings.into_inner() - }) - } - - /// Embed multiple chunks of texts. - /// - /// Each chunk is composed of one or multiple texts. 
- pub fn embed_chunks( - &self, - text_chunks: Vec>, - threads: &ThreadPoolNoAbort, - ) -> std::result::Result>>, EmbedError> { - match self { - Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), - Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks, threads), - Embedder::Ollama(embedder) => embedder.embed_chunks(text_chunks, threads), - Embedder::UserProvided(embedder) => embedder.embed_chunks(text_chunks), - Embedder::Rest(embedder) => embedder.embed_chunks(text_chunks, threads), - } - } - - /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] - pub fn chunk_count_hint(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), - Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), - Embedder::Ollama(embedder) => embedder.chunk_count_hint(), - Embedder::UserProvided(_) => 1, - Embedder::Rest(embedder) => embedder.chunk_count_hint(), - } - } - - /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`] - pub fn prompt_count_in_chunk_hint(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::UserProvided(_) => 1, - Embedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), - } - } - - /// Indicates the dimensions of a single embedding produced by the embedder. - pub fn dimensions(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.dimensions(), - Embedder::OpenAi(embedder) => embedder.dimensions(), - Embedder::Ollama(embedder) => embedder.dimensions(), - Embedder::UserProvided(embedder) => embedder.dimensions(), - Embedder::Rest(embedder) => embedder.dimensions(), - } - } - - /// An optional distribution used to apply an affine transformation to the similarity score of a document. - pub fn distribution(&self) -> Option { - match self { - Embedder::HuggingFace(embedder) => embedder.distribution(), - Embedder::OpenAi(embedder) => embedder.distribution(), - Embedder::Ollama(embedder) => embedder.distribution(), - Embedder::UserProvided(embedder) => embedder.distribution(), - Embedder::Rest(embedder) => embedder.distribution(), - } - } -} - -/// Describes the mean and sigma of distribution of embedding similarity in the embedding space. -/// -/// The intended use is to make the similarity score more comparable to the regular ranking score. -/// This allows to correct effects where results are too "packed" around a certain value. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Serialize)] -#[serde(from = "DistributionShiftSerializable")] -#[serde(into = "DistributionShiftSerializable")] -pub struct DistributionShift { - /// Value where the results are "packed". - /// - /// Similarity scores are translated so that they are packed around 0.5 instead - pub current_mean: OrderedFloat, - - /// standard deviation of a similarity score. - /// - /// Set below 0.4 to make the results less packed around the mean, and above 0.4 to make them more packed. - pub current_sigma: OrderedFloat, -} - -impl Deserr for DistributionShift -where - E: DeserializeError, -{ - fn deserialize_from_value( - value: deserr::Value, - location: deserr::ValuePointerRef<'_>, - ) -> Result { - let value = DistributionShiftSerializable::deserialize_from_value(value, location)?; - if value.mean < 0. || value.mean > 1. 
{ - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "the distribution mean must be in the range [0, 1], got {}", - value.mean - ), - }, - location, - ))); - } - if value.sigma <= 0. || value.sigma > 1. { - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "the distribution sigma must be in the range ]0, 1], got {}", - value.sigma - ), - }, - location, - ))); - } - - Ok(value.into()) - } -} - -#[derive(Serialize, Deserialize, Deserr)] -#[serde(deny_unknown_fields)] -#[deserr(deny_unknown_fields)] -struct DistributionShiftSerializable { - mean: f32, - sigma: f32, -} - -impl From for DistributionShiftSerializable { - fn from( - DistributionShift { - current_mean: OrderedFloat(current_mean), - current_sigma: OrderedFloat(current_sigma), - }: DistributionShift, - ) -> Self { - Self { mean: current_mean, sigma: current_sigma } - } -} - -impl From for DistributionShift { - fn from(DistributionShiftSerializable { mean, sigma }: DistributionShiftSerializable) -> Self { - Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) } - } -} - -impl DistributionShift { - /// `None` if sigma <= 0. - pub fn new(mean: f32, sigma: f32) -> Option { - if sigma <= 0.0 { - None - } else { - Some(Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) }) - } - } - - pub fn shift(&self, score: f32) -> f32 { - let current_mean = self.current_mean.0; - let current_sigma = self.current_sigma.0; - // - // We're somewhat abusively mapping the distribution of distances to a gaussian. - // The parameters we're given is the mean and sigma of the native result distribution. - // We're using them to retarget the distribution to a gaussian centered on 0.5 with a sigma of 0.4. - - let target_mean = 0.5; - let target_sigma = 0.4; - - // a^2 sig1^2 = sig2^2 => a^2 = sig2^2 / sig1^2 => a = sig2 / sig1, assuming a, sig1, and sig2 positive. - let factor = target_sigma / current_sigma; - // a*mu1 + b = mu2 => b = mu2 - a*mu1 - let offset = target_mean - (factor * current_mean); - - let mut score = factor * score + offset; - - // clamp the final score in the ]0, 1] interval. - if score <= 0.0 { - score = f32::EPSILON; - } - if score > 1.0 { - score = 1.0; - } - - score - } -} - -/// Whether CUDA is supported in this version of Meilisearch. 
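The `DistributionShift::shift` method removed above remaps raw similarity scores onto a target gaussian centered on 0.5 with a sigma of 0.4, then clamps the result into ]0, 1]. A condensed, self-contained sketch of that affine remapping follows; the `shift_score` name and the worked numbers are illustrative, not from the patch.

/// Hypothetical standalone version of the score-shifting math being removed here.
fn shift_score(score: f32, current_mean: f32, current_sigma: f32) -> f32 {
    let (target_mean, target_sigma) = (0.5_f32, 0.4_f32);
    // a * sigma1 = sigma2  =>  a = sigma2 / sigma1
    let factor = target_sigma / current_sigma;
    // a * mu1 + b = mu2  =>  b = mu2 - a * mu1
    let offset = target_mean - factor * current_mean;
    // Clamp into ]0, 1], as the removed code does (with an epsilon floor).
    (factor * score + offset).clamp(f32::EPSILON, 1.0)
}

// With scores packed around current_mean = 0.7 and current_sigma = 0.1,
// a raw 0.7 maps to 0.5 and a raw 0.8 maps to 0.9, spreading results out around 0.5.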
-pub const fn is_cuda_enabled() -> bool { - cfg!(feature = "cuda") -} - -pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator { - let embedder_id = (embedder_id as u16) << 8; - - (0..=u8::MAX).map(move |k| embedder_id | (k as u16)) -} diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 24473f6c8..d5cb5073f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.79.0" +channel = "1.85.1" components = ["clippy"] diff --git a/workloads/embeddings-movies-subset-hf.json b/workloads/embeddings-movies-subset-hf.json index d7672cf73..4f6c5be35 100644 --- a/workloads/embeddings-movies-subset-hf.json +++ b/workloads/embeddings-movies-subset-hf.json @@ -12,16 +12,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/embeddings-settings-add.json b/workloads/embeddings-settings-add.json index 6ad50769a..67f9709db 100644 --- a/workloads/embeddings-settings-add.json +++ b/workloads/embeddings-settings-add.json @@ -12,16 +12,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH", diff --git a/workloads/hackernews-add-new-documents.json b/workloads/hackernews-add-new-documents.json new file mode 100644 index 000000000..0470a0792 --- /dev/null +++ b/workloads/hackernews-add-new-documents.json @@ -0,0 +1,105 @@ +{ + "name": "hackernews.add_new_documents", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + 
"body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-05.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} diff --git a/workloads/hackernews-modify-facet-numbers.json b/workloads/hackernews-modify-facet-numbers.json new file mode 100644 index 000000000..5c6acf626 --- /dev/null +++ b/workloads/hackernews-modify-facet-numbers.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_numbers", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-modified-number-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson", + "sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + 
}, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-modified-number-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} + \ No newline at end of file diff --git a/workloads/hackernews-modify-facet-strings.json b/workloads/hackernews-modify-facet-strings.json new file mode 100644 index 000000000..b5d4235a0 --- /dev/null +++ b/workloads/hackernews-modify-facet-strings.json @@ -0,0 +1,111 @@ +{ + "name": "hackernews.modify_facet_strings", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-modified-string-filters.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson", + "sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-modified-string-filters.ndjson" + }, + "synchronous": "WaitForTask" + } + 
] +} + \ No newline at end of file diff --git a/workloads/hackernews-modify-searchables.json b/workloads/hackernews-modify-searchables.json new file mode 100644 index 000000000..248026f19 --- /dev/null +++ b/workloads/hackernews-modify-searchables.json @@ -0,0 +1,123 @@ +{ + "name": "hackernews.modify_searchables", + "run_count": 3, + "extra_cli_args": [], + "assets": { + "hackernews-01.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01.ndjson", + "sha256": "cd3627b86c064d865b6754848ed0e73ef1d8142752a25e5f0765c3a1296dd3ae" + }, + "hackernews-02.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02.ndjson", + "sha256": "5d533b83bcf992201dace88b4d0c0be8b4df5225c6c4b763582d986844bcc23b" + }, + "hackernews-03.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/03.ndjson", + "sha256": "f5f351a0d04a8a83643ace12cafa2b7ec8ca8cb7d46fd268e5126492a6c66f2a" + }, + "hackernews-04.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/04.ndjson", + "sha256": "ac1915ee7ce53a6718548c255a6cc59969784b2570745dc5b739f714beda291a" + }, + "hackernews-05.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/05.ndjson", + "sha256": "be31d5632602f798e62d1c10c83bdfda2b4deaa068477eacde05fdd247572b82" + }, + "hackernews-01-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-searchables.ndjson", + "sha256": "e5c08710c6af70031ac7212e0ba242c72ef29c8d4e1fce66c789544641452a7c" + }, + "hackernews-02-modified-searchables.ndjson": { + "local_location": null, + "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-searchables.ndjson", + "sha256": "098b029851117087b1e26ccb7ac408eda9bba54c3008213a2880d6fab607346e" + } + }, + "precommands": [ + { + "route": "indexes/movies/settings", + "method": "PATCH", + "body": { + "inline": { + "displayedAttributes": [ + "title", + "by", + "score", + "time", + "text" + ], + "searchableAttributes": [ + "title", + "text" + ], + "filterableAttributes": [ + "by", + "kids", + "parent" + ], + "sortableAttributes": [ + "score", + "time" + ] + } + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-02.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-03.ndjson" + }, + "synchronous": "WaitForResponse" + }, + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-04.ndjson" + }, + "synchronous": "WaitForTask" + } + ], + "commands": [ + { + "route": "indexes/movies/documents", + "method": "POST", + "body": { + "asset": "hackernews-01-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + }, + { + "route": "indexes/movies/documents", 
+ "method": "POST", + "body": { + "asset": "hackernews-02-modified-searchables.ndjson" + }, + "synchronous": "WaitForTask" + } + ] +} diff --git a/workloads/search/embeddings-movies-subset-hf.json b/workloads/search/embeddings-movies-subset-hf.json index 36f45cfb9..720d41790 100644 --- a/workloads/search/embeddings-movies-subset-hf.json +++ b/workloads/search/embeddings-movies-subset-hf.json @@ -13,16 +13,6 @@ } }, "precommands": [ - { - "route": "experimental-features", - "method": "PATCH", - "body": { - "inline": { - "vectorStore": true - } - }, - "synchronous": "DontWait" - }, { "route": "indexes/movies/settings", "method": "PATCH",