Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-12-11 07:05:43 +00:00)

Compare commits: prototype- ... diff-index (159 commits)
| SHA1 |
|---|
| 1d433abf06 |
| d8d45845b2 |
| 2b7c5829d5 |
| 9b4627cc8b |
| cd40f6b28c |
| 4941543dfe |
| 87f8cb8b62 |
| 81fe3eaa09 |
| 066221fd2b |
| b8fed737ef |
| c63ff5298b |
| d50408d670 |
| e0dc413521 |
| 0f6a0b1ab8 |
| 061f490204 |
| 5c43ff72c1 |
| c445e9daec |
| 178a9802fa |
| 7d546b9c22 |
| c829feb40b |
| b88fd7994c |
| 096d7705c7 |
| e8f8730467 |
| 26ef0b3a07 |
| 20394fda04 |
| 27161bcd05 |
| 04fd44b5e2 |
| 9078e60024 |
| 8fb96b8274 |
| 50ba751244 |
| f36c36e368 |
| c2dcd66d32 |
| d4594306d3 |
| 93d0680903 |
| 01101d55ac |
| c0fd3dffb8 |
| c42fd5375f |
| b418c3a756 |
| 1cde455758 |
| ca19bae72f |
| 705878ff59 |
| 92c280d1c8 |
| 181e7a1e53 |
| 2e5abb4d2c |
| 44aaf5d9e3 |
| ff0ababf65 |
| c5336af1c5 |
| 1567758a56 |
| 37953afe1a |
| de3f992ae4 |
| 98f0618065 |
| 86b314626d |
| bb79bdb3f8 |
| d429e7da99 |
| 584b772248 |
| 1806c04a9a |
| 3485e8f1c4 |
| fe697a6685 |
| eb4135f8ae |
| ec4844c3a6 |
| 77c3787b78 |
| 4f902490b9 |
| 1faee92748 |
| 5831466525 |
| 3cdb3e4eaf |
| 26f34ec7a2 |
| 07d36180ad |
| 4c641b79a2 |
| 76c05d1b20 |
| ef31ab52a4 |
| 34fac115d5 |
| 791c5cd874 |
| 5bea1092fb |
| 056b2c387d |
| a09686fcbd |
| b4c44603db |
| 393be40179 |
| 2c1d60f79b |
| 487d493f49 |
| 08af69a33b |
| 9258e5b5bf |
| ddd34a488a |
| 526c2b3602 |
| e8c9367686 |
| 9636c5f558 |
| b310830b5d |
| 462b4654c4 |
| abfa7ded25 |
| f2837aaec2 |
| 11df155598 |
| 651657c03e |
| b9ad59c969 |
| 66aa682e23 |
| 256cf33bca |
| 9945cbf9db |
| 03d0f628bd |
| ea78060916 |
| b42d48187a |
| 679c0b0f97 |
| e02d0064bd |
| 7ef3572f11 |
| 93285041a9 |
| dc3d9c90d9 |
| 287cf25d39 |
| 66aa6d5871 |
| 8ac5b765bc |
| cea93e9a37 |
| 085aad0a94 |
| e9b62aacb3 |
| 456960d2c7 |
| 3dda176723 |
| af0f6f0bf0 |
| ccf3ba3f32 |
| 65528a3e06 |
| 6db80b0836 |
| cdb4b3e024 |
| 8c0ebd1331 |
| 5130e06b41 |
| 08e27ef73f |
| 914b125c5f |
| e59d7f238c |
| 717b069907 |
| 7ea154673a |
| b947f3bb9d |
| 4c35817c5f |
| c53841e166 |
| fd81945597 |
| 794e491152 |
| cab27c2ab4 |
| 624fa9052f |
| 359ede4862 |
| 60c11dbdbd |
| dacee40ebc |
| 6089083a8e |
| cc2c19d4c3 |
| a5c56fac8a |
| e4e49e63d0 |
| 00bd7bd19a |
| ef3d098b4d |
| 8084cf29f3 |
| 5a7c1bde84 |
| 6b2d671be7 |
| 43c13faeda |
| 29adfc2f68 |
| 064ee95b1c |
| 604d533b31 |
| 44c1900f36 |
| 04671d0751 |
| 4f4c669d50 |
| 8dc5acf998 |
| fc2590fc9d |
| 35758db9ec |
| 4988199bb9 |
| 83991ee770 |
| 9d061cec26 |
| fe819a9d80 |
| e338ceb97f |
| 75c87d5391 |
| dd57873f8e |
2 .github/workflows/benchmarks-manual.yml (vendored)

@@ -74,4 +74,4 @@ jobs:
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
echo " - Run the following command: ./benchmaks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
4 .github/workflows/dependency-issue.yml (vendored)

@@ -2,8 +2,8 @@ name: Create issue to upgrade dependencies
on:
schedule:
# Run the first of the month, every 3 month
- cron: '0 0 1 */3 *'
# Run the first of the month, every 6 month
- cron: '0 0 1 */6 *'
workflow_dispatch:

jobs:
1 .github/workflows/publish-apt-brew-pkg.yml (vendored)

@@ -53,5 +53,6 @@ jobs:
uses: mislav/bump-homebrew-formula-action@v2
with:
formula-name: meilisearch
formula-path: Formula/m/meilisearch.rb
env:
COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
8 .github/workflows/publish-docker-images.yml (vendored)

@@ -57,10 +57,10 @@ jobs:
echo "date=$commit_date" >> $GITHUB_OUTPUT

- name: Set up QEMU
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@v2

@@ -70,7 +70,7 @@ jobs:
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
uses: docker/metadata-action@v5
with:
images: getmeili/meilisearch
# Prevent `latest` to be updated for each new tag pushed.

@@ -83,7 +83,7 @@ jobs:
type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}

- name: Build and push
uses: docker/build-push-action@v4
uses: docker/build-push-action@v5
with:
push: true
platforms: linux/amd64,linux/arm64
278 .github/workflows/sdks-tests.yml (vendored)
@@ -14,6 +14,7 @@ on:
|
||||
env:
|
||||
MEILI_MASTER_KEY: 'masterKey'
|
||||
MEILI_NO_ANALYTICS: 'true'
|
||||
DISABLE_COVERAGE: 'true'
|
||||
|
||||
jobs:
|
||||
define-docker-image:
|
||||
@@ -30,6 +31,117 @@ jobs:
|
||||
if [[ $event == 'workflow_dispatch' ]]; then
|
||||
echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
- name: Docker image is ${{ steps.define-image.outputs.docker-image }}
|
||||
run: echo "Docker image is ${{ steps.define-image.outputs.docker-image }}"
|
||||
|
||||
##########
|
||||
## SDKs ##
|
||||
##########
|
||||
|
||||
meilisearch-dotnet-tests:
|
||||
needs: define-docker-image
|
||||
name: .NET SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
MEILISEARCH_VERSION: ${{ needs.define-docker-image.outputs.docker-image }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-dotnet
|
||||
- name: Setup .NET Core
|
||||
uses: actions/setup-dotnet@v3
|
||||
with:
|
||||
dotnet-version: "6.0.x"
|
||||
- name: Install dependencies
|
||||
run: dotnet restore
|
||||
- name: Build
|
||||
run: dotnet build --configuration Release --no-restore
|
||||
- name: Meilisearch (latest version) setup with Docker
|
||||
run: docker compose up -d
|
||||
- name: Run tests
|
||||
run: dotnet test --no-restore --verbosity normal
|
||||
|
||||
meilisearch-dart-tests:
|
||||
needs: define-docker-image
|
||||
name: Dart SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-dart
|
||||
- uses: dart-lang/setup-dart@v1
|
||||
with:
|
||||
sdk: 3.1.1
|
||||
- name: Install dependencies
|
||||
run: dart pub get
|
||||
- name: Run integration tests
|
||||
run: dart test --concurrency=4
|
||||
|
||||
meilisearch-go-tests:
|
||||
needs: define-docker-image
|
||||
name: Go SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: stable
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-go
|
||||
- name: Get dependencies
|
||||
run: |
|
||||
go get -v -t -d ./...
|
||||
if [ -f Gopkg.toml ]; then
|
||||
curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
|
||||
dep ensure
|
||||
fi
|
||||
- name: Run integration tests
|
||||
run: go test -v ./...
|
||||
|
||||
meilisearch-java-tests:
|
||||
needs: define-docker-image
|
||||
name: Java SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-java
|
||||
- name: Set up Java
|
||||
uses: actions/setup-java@v3
|
||||
with:
|
||||
java-version: 8
|
||||
distribution: 'zulu'
|
||||
cache: gradle
|
||||
- name: Grant execute permission for gradlew
|
||||
run: chmod +x gradlew
|
||||
- name: Build and run unit and integration tests
|
||||
run: ./gradlew build integrationTest
|
||||
|
||||
meilisearch-js-tests:
|
||||
needs: define-docker-image
|
||||
@@ -66,33 +178,6 @@ jobs:
|
||||
- name: Run Browser env
|
||||
run: yarn test:env:browser
|
||||
|
||||
instant-meilisearch-tests:
|
||||
needs: define-docker-image
|
||||
name: instant-meilisearch tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/instant-meilisearch
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
cache: yarn
|
||||
- name: Install dependencies
|
||||
run: yarn install
|
||||
- name: Run tests
|
||||
run: yarn test
|
||||
- name: Build all the playgrounds and the packages
|
||||
run: yarn build
|
||||
|
||||
meilisearch-php-tests:
|
||||
needs: define-docker-image
|
||||
name: PHP SDK tests
|
||||
@@ -111,8 +196,6 @@ jobs:
|
||||
repository: meilisearch/meilisearch-php
|
||||
- name: Install PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
coverage: none
|
||||
- name: Validate composer.json and composer.lock
|
||||
run: composer validate
|
||||
- name: Install dependencies
|
||||
@@ -149,36 +232,6 @@ jobs:
|
||||
- name: Test with pytest
|
||||
run: pipenv run pytest
|
||||
|
||||
meilisearch-go-tests:
|
||||
needs: define-docker-image
|
||||
name: Go SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- name: Set up Go
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: stable
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-go
|
||||
- name: Get dependencies
|
||||
run: |
|
||||
go get -v -t -d ./...
|
||||
if [ -f Gopkg.toml ]; then
|
||||
curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
|
||||
dep ensure
|
||||
fi
|
||||
- name: Run integration tests
|
||||
run: go test -v ./...
|
||||
|
||||
meilisearch-ruby-tests:
|
||||
needs: define-docker-image
|
||||
name: Ruby SDK tests
|
||||
@@ -224,3 +277,110 @@ jobs:
|
||||
run: cargo build --verbose
|
||||
- name: Run tests
|
||||
run: cargo test --verbose
|
||||
|
||||
meilisearch-swift-tests:
|
||||
needs: define-docker-image
|
||||
name: Swift SDK tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-swift
|
||||
- name: Run tests
|
||||
run: swift test
|
||||
|
||||
########################
|
||||
## FRONT-END PLUGINS ##
|
||||
########################
|
||||
|
||||
meilisearch-js-plugins-tests:
|
||||
needs: define-docker-image
|
||||
name: meilisearch-js-plugins tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js-plugins
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
cache: yarn
|
||||
- name: Install dependencies
|
||||
run: yarn install
|
||||
- name: Run tests
|
||||
run: yarn test
|
||||
- name: Build all the playgrounds and the packages
|
||||
run: yarn build
|
||||
|
||||
########################
|
||||
## BACK-END PLUGINS ###
|
||||
########################
|
||||
|
||||
meilisearch-rails-tests:
|
||||
needs: define-docker-image
|
||||
name: meilisearch-rails tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-rails
|
||||
- name: Set up Ruby 3
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: 3
|
||||
bundler-cache: true
|
||||
- name: Run tests
|
||||
run: bundle exec rspec
|
||||
|
||||
meilisearch-symfony-tests:
|
||||
needs: define-docker-image
|
||||
name: meilisearch-symfony tests
|
||||
runs-on: ubuntu-latest
|
||||
services:
|
||||
meilisearch:
|
||||
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
|
||||
env:
|
||||
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
|
||||
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
|
||||
ports:
|
||||
- '7700:7700'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
repository: meilisearch/meilisearch-symfony
|
||||
- name: Install PHP
|
||||
uses: shivammathur/setup-php@v2
|
||||
with:
|
||||
tools: composer:v2, flex
|
||||
- name: Validate composer.json and composer.lock
|
||||
run: composer validate
|
||||
- name: Install dependencies
|
||||
run: composer install --prefer-dist --no-progress --quiet
|
||||
- name: Remove doctrine/annotations
|
||||
run: composer remove --dev doctrine/annotations
|
||||
- name: Run test suite
|
||||
run: composer test:unit
|
||||
|
||||
25 .github/workflows/test-suite.yml (vendored)
@@ -37,13 +37,13 @@ jobs:
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Setup test with Rust nightly
|
||||
if: github.event_name == 'schedule'
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.5.1
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@@ -65,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.5.1
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@@ -78,12 +78,12 @@ jobs:
|
||||
args: --locked --release --all
|
||||
|
||||
test-all-features:
|
||||
name: Tests all features on cron schedule only
|
||||
name: Tests all features
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
|
||||
image: ubuntu:18.04
|
||||
if: github.event_name == 'schedule'
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install needed dependencies
|
||||
@@ -110,7 +110,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: ubuntu:18.04
|
||||
if: github.event_name == 'schedule'
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install needed dependencies
|
||||
@@ -123,7 +123,10 @@ jobs:
|
||||
override: true
|
||||
- name: Run cargo tree without default features and check lindera is not present
|
||||
run: |
|
||||
cargo tree -f '{p} {f}' -e normal --no-default-features | grep lindera -vqz
|
||||
if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then
|
||||
echo "lindera has been found in the sources and it shouldn't"
|
||||
exit 1
|
||||
fi
|
||||
- name: Run cargo tree with default features and check lindera is pressent
|
||||
run: |
|
||||
cargo tree -f '{p} {f}' -e normal | grep lindera -qz
|
||||
@@ -146,7 +149,7 @@ jobs:
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.5.1
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
- name: Run tests in debug
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@@ -161,11 +164,11 @@ jobs:
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: 1.69.0
|
||||
toolchain: 1.71.1
|
||||
override: true
|
||||
components: clippy
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.5.1
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
- name: Run cargo clippy
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@@ -184,7 +187,7 @@ jobs:
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.5.1
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
- name: Run cargo fmt
|
||||
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
||||
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
||||
|
||||
84 .github/workflows/trigger-benchmarks-on-message.yml (vendored, Normal file)
@@ -0,0 +1,84 @@
|
||||
name: Benchmarks (PR)
|
||||
on: issue_comment
|
||||
permissions:
|
||||
issues: write
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
|
||||
|
||||
jobs:
|
||||
run-benchmarks-on-comment:
|
||||
name: Run and upload benchmarks
|
||||
runs-on: benchmarks
|
||||
timeout-minutes: 4320 # 72h
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Check for Command
|
||||
id: command
|
||||
uses: xt0rted/slash-command-action@v2
|
||||
with:
|
||||
command: benchmark
|
||||
reaction-type: "eyes"
|
||||
repo-token: ${{ env.GH_TOKEN }}
|
||||
|
||||
# Set variables
|
||||
- name: Set current branch name
|
||||
shell: bash
|
||||
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
|
||||
id: current_branch
|
||||
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
|
||||
shell: bash
|
||||
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
|
||||
id: normalized_current_branch
|
||||
- name: Set shorter commit SHA
|
||||
shell: bash
|
||||
run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT
|
||||
id: commit_sha
|
||||
- name: Set file basename with format "dataset_branch_commitSHA"
|
||||
shell: bash
|
||||
run: echo "basename=$(echo ${{ steps.command.outputs.command-arguments }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT
|
||||
id: file
|
||||
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
|
||||
run: |
|
||||
cd benchmarks
|
||||
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
|
||||
|
||||
# Generate critcmp files
|
||||
- name: Install critcmp
|
||||
uses: taiki-e/install-action@v2
|
||||
with:
|
||||
tool: critcmp
|
||||
- name: Export cripcmp file
|
||||
run: |
|
||||
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
|
||||
|
||||
# Upload benchmarks
|
||||
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
|
||||
uses: BetaHuhn/do-spaces-action@v2
|
||||
with:
|
||||
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
|
||||
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
|
||||
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
|
||||
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
|
||||
source: ${{ steps.file.outputs.basename }}.json
|
||||
out_dir: critcmp_results
|
||||
|
||||
# Compute the diff of the benchmarks and send a message on the GitHub PR
|
||||
- name: Compute and send a message in the PR
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
|
||||
run: |
|
||||
export base=$(git log --pretty=%p -n 1)
|
||||
echo 'Here are your benchmarks diff 👊' >> body.txt
|
||||
echo '```' >> body.txt
|
||||
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
|
||||
echo '```' >> body.txt
|
||||
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
|
||||
679 Cargo.lock (generated)
File diff suppressed because it is too large
@@ -18,7 +18,7 @@ members = [
]

[workspace.package]
version = "1.3.0"
version = "1.4.0"
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "Meilisearch HTTP server"
homepage = "https://meilisearch.com"
@@ -0,0 +1,24 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: spells.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: products.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {
|
||||
"android": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"iphone": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"phone": [
|
||||
"android",
|
||||
"iphone",
|
||||
"smartphone"
|
||||
]
|
||||
},
|
||||
"distinctAttribute": null
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: movies.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness",
|
||||
"release_date:asc"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
||||
@@ -14,6 +14,7 @@ license.workspace = true
[dependencies]
nom = "7.1.3"
nom_locate = "4.1.0"
unescaper = "0.1.2"

[dev-dependencies]
insta = "1.29.0"
@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
MisusedGeoRadius,
MisusedGeoBoundingBox,
InvalidPrimary,
InvalidEscapedNumber,
ExpectedEof,
ExpectedValue(ExpectedValueKind),
MalformedValue,

@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
}
ErrorKind::InvalidEscapedNumber => {
writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
}
ErrorKind::ExpectedEof => {
writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
}
@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
use FilterCondition as Fc;
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
|
||||
@@ -556,14 +558,22 @@ pub mod tests {
|
||||
unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
|
||||
}
|
||||
|
||||
fn p(s: &str) -> impl std::fmt::Display + '_ {
|
||||
Fc::parse(s).unwrap().unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_escaped() {
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
|
||||
// but it also works with other sequencies
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse() {
|
||||
use FilterCondition as Fc;
|
||||
|
||||
fn p(s: &str) -> impl std::fmt::Display + '_ {
|
||||
Fc::parse(s).unwrap().unwrap()
|
||||
}
|
||||
|
||||
// Test equal
|
||||
insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
|
||||
insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
|
||||
|
||||
@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
|
||||
})
|
||||
})?;
|
||||
|
||||
Ok((input, value))
|
||||
match unescaper::unescape(value.value()) {
|
||||
Ok(content) => {
|
||||
if content.len() != value.value().len() {
|
||||
Ok((input, Token::new(value.original_span(), Some(content))))
|
||||
} else {
|
||||
Ok((input, value))
|
||||
}
|
||||
}
|
||||
Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
|
||||
Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
|
||||
value.original_span(),
|
||||
ErrorKind::InvalidEscapedNumber,
|
||||
))),
|
||||
Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
|
||||
value.original_span(),
|
||||
ErrorKind::MalformedValue,
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_value_component(c: char) -> bool {
|
||||
@@ -318,17 +335,17 @@ pub mod test {
|
||||
("\"cha'nnel\"", "cha'nnel", false),
|
||||
("I'm tamo", "I", false),
|
||||
// escaped thing but not quote
|
||||
(r#""\\""#, r#"\\"#, false),
|
||||
(r#""\\\\\\""#, r#"\\\\\\"#, false),
|
||||
(r#""aa\\aa""#, r#"aa\\aa"#, false),
|
||||
(r#""\\""#, r#"\"#, true),
|
||||
(r#""\\\\\\""#, r#"\\\"#, true),
|
||||
(r#""aa\\aa""#, r#"aa\aa"#, true),
|
||||
// with double quote
|
||||
(r#""Hello \"world\"""#, r#"Hello "world""#, true),
|
||||
(r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
|
||||
(r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
|
||||
(r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
|
||||
(r#""\"\"""#, r#""""#, true),
|
||||
// with simple quote
|
||||
(r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
|
||||
(r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
|
||||
(r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
|
||||
(r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
|
||||
(r#"'\'\''"#, r#"''"#, true),
|
||||
];
|
||||
@@ -350,7 +367,14 @@ pub mod test {
|
||||
"Filter `{}` was not supposed to be escaped",
|
||||
input
|
||||
);
|
||||
assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
|
||||
assert_eq!(
|
||||
token.value(),
|
||||
expected,
|
||||
"Filter `{}` failed by giving `{}` instead of `{}`.",
|
||||
input,
|
||||
token.value(),
|
||||
expected
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ license.workspace = true
[dependencies]
arbitrary = { version = "1.3.0", features = ["derive"] }
clap = { version = "4.3.0", features = ["derive"] }
fastrand = "1.9.0"
fastrand = "2.0.0"
milli = { path = "../milli" }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
|
||||
op: IndexOperation,
|
||||
must_create_index: bool,
|
||||
},
|
||||
IndexDocumentDeletionByFilter {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
},
|
||||
IndexCreation {
|
||||
index_uid: String,
|
||||
primary_key: Option<String>,
|
||||
@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
|
||||
documents: Vec<Vec<String>>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
IndexDocumentDeletionByFilter {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
},
|
||||
DocumentClear {
|
||||
index_uid: String,
|
||||
tasks: Vec<Task>,
|
||||
@@ -155,7 +155,6 @@ impl Batch {
|
||||
| Batch::TaskDeletion(task)
|
||||
| Batch::Dump(task)
|
||||
| Batch::IndexCreation { task, .. }
|
||||
| Batch::IndexDocumentDeletionByFilter { task, .. }
|
||||
| Batch::IndexUpdate { task, .. } => vec![task.uid],
|
||||
Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
@@ -167,6 +166,7 @@ impl Batch {
|
||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
|
||||
IndexOperation::SettingsAndDocumentOperation {
|
||||
document_import_tasks: tasks,
|
||||
settings_tasks: other,
|
||||
@@ -194,8 +194,7 @@ impl Batch {
|
||||
IndexOperation { op, .. } => Some(op.index_uid()),
|
||||
IndexCreation { index_uid, .. }
|
||||
| IndexUpdate { index_uid, .. }
|
||||
| IndexDeletion { index_uid, .. }
|
||||
| IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
|
||||
| IndexDeletion { index_uid, .. } => Some(index_uid),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -205,6 +204,7 @@ impl IndexOperation {
|
||||
match self {
|
||||
IndexOperation::DocumentOperation { index_uid, .. }
|
||||
| IndexOperation::DocumentDeletion { index_uid, .. }
|
||||
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
||||
| IndexOperation::DocumentClear { index_uid, .. }
|
||||
| IndexOperation::Settings { index_uid, .. }
|
||||
| IndexOperation::DocumentClearAndSetting { index_uid, .. }
|
||||
@@ -239,9 +239,12 @@ impl IndexScheduler {
|
||||
let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
|
||||
match &task.kind {
|
||||
KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
|
||||
Ok(Some(Batch::IndexDocumentDeletionByFilter {
|
||||
index_uid: index_uid.clone(),
|
||||
task,
|
||||
Ok(Some(Batch::IndexOperation {
|
||||
op: IndexOperation::IndexDocumentDeletionByFilter {
|
||||
index_uid: index_uid.clone(),
|
||||
task,
|
||||
},
|
||||
must_create_index: false,
|
||||
}))
|
||||
}
|
||||
_ => unreachable!(),
|
||||
@@ -896,51 +899,6 @@ impl IndexScheduler {
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
let (index_uid, filter) =
|
||||
if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
|
||||
&task.kind
|
||||
{
|
||||
(index_uid, filter_expr)
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let index = {
|
||||
let rtxn = self.env.read_txn()?;
|
||||
self.index_mapper.index(&rtxn, index_uid)?
|
||||
};
|
||||
let deleted_documents = delete_document_by_filter(filter, index);
|
||||
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: _,
|
||||
}) = task.details
|
||||
{
|
||||
original_filter
|
||||
} else {
|
||||
// In the case of a `documentDeleteByFilter` the details MUST be set
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match deleted_documents {
|
||||
Ok(deleted_documents) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: Some(deleted_documents),
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: Some(0),
|
||||
});
|
||||
task.error = Some(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(vec![task])
|
||||
}
|
||||
Batch::IndexCreation { index_uid, primary_key, task } => {
|
||||
let wtxn = self.env.write_txn()?;
|
||||
if self.index_mapper.exists(&wtxn, &index_uid)? {
|
||||
@@ -1299,6 +1257,47 @@ impl IndexScheduler {
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
let filter =
|
||||
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
||||
&task.kind
|
||||
{
|
||||
filter_expr
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
|
||||
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: _,
|
||||
}) = task.details
|
||||
{
|
||||
original_filter
|
||||
} else {
|
||||
// In the case of a `documentDeleteByFilter` the details MUST be set
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match deleted_documents {
|
||||
Ok(deleted_documents) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: Some(deleted_documents),
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: Some(0),
|
||||
});
|
||||
task.error = Some(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(vec![task])
|
||||
}
|
||||
IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
@@ -1498,23 +1497,22 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
|
||||
fn delete_document_by_filter<'a>(
|
||||
wtxn: &mut RwTxn<'a, '_>,
|
||||
filter: &serde_json::Value,
|
||||
index: &'a Index,
|
||||
) -> Result<u64> {
|
||||
let filter = Filter::from_json(filter)?;
|
||||
Ok(if let Some(filter) = filter {
|
||||
let mut wtxn = index.write_txn()?;
|
||||
|
||||
let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
|
||||
let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
|
||||
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
|
||||
Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
|
||||
}
|
||||
e => e.into(),
|
||||
})?;
|
||||
let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
|
||||
let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
|
||||
delete_operation.delete_documents(&candidates);
|
||||
let deleted_documents =
|
||||
delete_operation.execute().map(|result| result.deleted_documents)?;
|
||||
wtxn.commit()?;
|
||||
deleted_documents
|
||||
delete_operation.execute().map(|result| result.deleted_documents)?
|
||||
} else {
|
||||
0
|
||||
})
|
||||
|
||||
@@ -790,10 +790,19 @@ impl IndexScheduler {
|
||||
|
||||
let mut res = BTreeMap::new();
|
||||
|
||||
let processing_tasks = { self.processing_tasks.read().unwrap().processing.len() };
|
||||
|
||||
res.insert(
|
||||
"statuses".to_string(),
|
||||
enum_iterator::all::<Status>()
|
||||
.map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
|
||||
.map(|s| {
|
||||
let tasks = self.get_status(&rtxn, s)?.len();
|
||||
match s {
|
||||
Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)),
|
||||
Status::Processing => Ok((s.to_string(), processing_tasks)),
|
||||
s => Ok((s.to_string(), tasks)),
|
||||
}
|
||||
})
|
||||
.collect::<Result<BTreeMap<String, u64>>>()?,
|
||||
);
|
||||
res.insert(
|
||||
@@ -4131,4 +4140,154 @@ mod tests {
|
||||
snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed");
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic_get_stats() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
let kind = index_creation_task("catto", "mouse");
|
||||
let _task = index_scheduler.register(kind).unwrap();
|
||||
let kind = index_creation_task("doggo", "sheep");
|
||||
let _task = index_scheduler.register(kind).unwrap();
|
||||
let kind = index_creation_task("whalo", "fish");
|
||||
let _task = index_scheduler.register(kind).unwrap();
|
||||
|
||||
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
|
||||
{
|
||||
"indexes": {
|
||||
"catto": 1,
|
||||
"doggo": 1,
|
||||
"whalo": 1
|
||||
},
|
||||
"statuses": {
|
||||
"canceled": 0,
|
||||
"enqueued": 3,
|
||||
"failed": 0,
|
||||
"processing": 0,
|
||||
"succeeded": 0
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
"indexSwap": 0,
|
||||
"indexUpdate": 0,
|
||||
"settingsUpdate": 0,
|
||||
"snapshotCreation": 0,
|
||||
"taskCancelation": 0,
|
||||
"taskDeletion": 0
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
handle.advance_till([Start, BatchCreated]);
|
||||
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
|
||||
{
|
||||
"indexes": {
|
||||
"catto": 1,
|
||||
"doggo": 1,
|
||||
"whalo": 1
|
||||
},
|
||||
"statuses": {
|
||||
"canceled": 0,
|
||||
"enqueued": 2,
|
||||
"failed": 0,
|
||||
"processing": 1,
|
||||
"succeeded": 0
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
"indexSwap": 0,
|
||||
"indexUpdate": 0,
|
||||
"settingsUpdate": 0,
|
||||
"snapshotCreation": 0,
|
||||
"taskCancelation": 0,
|
||||
"taskDeletion": 0
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
handle.advance_till([
|
||||
InsideProcessBatch,
|
||||
InsideProcessBatch,
|
||||
ProcessBatchSucceeded,
|
||||
AfterProcessing,
|
||||
Start,
|
||||
BatchCreated,
|
||||
]);
|
||||
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
|
||||
{
|
||||
"indexes": {
|
||||
"catto": 1,
|
||||
"doggo": 1,
|
||||
"whalo": 1
|
||||
},
|
||||
"statuses": {
|
||||
"canceled": 0,
|
||||
"enqueued": 1,
|
||||
"failed": 0,
|
||||
"processing": 1,
|
||||
"succeeded": 1
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
"indexSwap": 0,
|
||||
"indexUpdate": 0,
|
||||
"settingsUpdate": 0,
|
||||
"snapshotCreation": 0,
|
||||
"taskCancelation": 0,
|
||||
"taskDeletion": 0
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// now we make one more batch, the started_at field of the new tasks will be past `second_start_time`
|
||||
handle.advance_till([
|
||||
InsideProcessBatch,
|
||||
InsideProcessBatch,
|
||||
ProcessBatchSucceeded,
|
||||
AfterProcessing,
|
||||
Start,
|
||||
BatchCreated,
|
||||
]);
|
||||
snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
|
||||
{
|
||||
"indexes": {
|
||||
"catto": 1,
|
||||
"doggo": 1,
|
||||
"whalo": 1
|
||||
},
|
||||
"statuses": {
|
||||
"canceled": 0,
|
||||
"enqueued": 0,
|
||||
"failed": 0,
|
||||
"processing": 1,
|
||||
"succeeded": 2
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 0,
|
||||
"documentDeletion": 0,
|
||||
"dumpCreation": 0,
|
||||
"indexCreation": 3,
|
||||
"indexDeletion": 0,
|
||||
"indexSwap": 0,
|
||||
"indexUpdate": 0,
|
||||
"settingsUpdate": 0,
|
||||
"snapshotCreation": 0,
|
||||
"taskCancelation": 0,
|
||||
"taskDeletion": 0
|
||||
}
|
||||
}
|
||||
"###);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -167,7 +167,9 @@ macro_rules! snapshot {
|
||||
let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, Some(&snap_name));
|
||||
settings.bind(|| {
|
||||
let snap = format!("{}", $value);
|
||||
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
|
||||
insta::allow_duplicates! {
|
||||
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
|
||||
}
|
||||
});
|
||||
};
|
||||
($value:expr, @$inline:literal) => {
|
||||
@@ -176,7 +178,9 @@ macro_rules! snapshot {
|
||||
let (settings, _, _) = $crate::default_snapshot_settings_for_test("", Some("_dummy_argument"));
|
||||
settings.bind(|| {
|
||||
let snap = format!("{}", $value);
|
||||
meili_snap::insta::assert_snapshot!(snap, @$inline);
|
||||
insta::allow_duplicates! {
|
||||
meili_snap::insta::assert_snapshot!(snap, @$inline);
|
||||
}
|
||||
});
|
||||
};
|
||||
($value:expr) => {
|
||||
@@ -194,7 +198,9 @@ macro_rules! snapshot {
|
||||
let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, None);
|
||||
settings.bind(|| {
|
||||
let snap = format!("{}", $value);
|
||||
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
|
||||
insta::allow_duplicates! {
|
||||
meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
|
||||
}
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
@@ -129,6 +129,9 @@ impl HeedAuthStore {
Action::DumpsAll => {
actions.insert(Action::DumpsCreate);
}
Action::SnapshotsAll => {
actions.insert(Action::SnapshotsCreate);
}
Action::TasksAll => {
actions.extend([Action::TasksGet, Action::TasksDelete, Action::TasksCancel]);
}
@@ -15,13 +15,13 @@ actix-web = { version = "4.3.1", default-features = false }
anyhow = "1.0.70"
convert_case = "0.6.0"
csv = "1.2.1"
deserr = "0.5.0"
deserr = { version = "0.6.0", features = ["actix-web"]}
either = { version = "1.8.1", features = ["serde"] }
enum-iterator = "1.4.0"
file-store = { path = "../file-store" }
flate2 = "1.0.25"
fst = "0.4.7"
memmap2 = "0.5.10"
memmap2 = "0.7.1"
milli = { path = "../milli" }
roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
@@ -1,4 +1,3 @@
use std::borrow::Borrow;
use std::fmt::{self, Debug, Display};
use std::fs::File;
use std::io::{self, Seek, Write};

@@ -42,7 +41,7 @@ impl Display for DocumentFormatError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Io(e) => write!(f, "{e}"),
Self::MalformedPayload(me, b) => match me.borrow() {
Self::MalformedPayload(me, b) => match me {
Error::Json(se) => {
let mut message = match se.classify() {
Category::Data => {
@@ -257,6 +257,12 @@ pub enum Action {
#[serde(rename = "dumps.create")]
#[deserr(rename = "dumps.create")]
DumpsCreate,
#[serde(rename = "snapshots.*")]
#[deserr(rename = "snapshots.*")]
SnapshotsAll,
#[serde(rename = "snapshots.create")]
#[deserr(rename = "snapshots.create")]
SnapshotsCreate,
#[serde(rename = "version")]
#[deserr(rename = "version")]
Version,

@@ -309,6 +315,7 @@ impl Action {
METRICS_GET => Some(Self::MetricsGet),
DUMPS_ALL => Some(Self::DumpsAll),
DUMPS_CREATE => Some(Self::DumpsCreate),
SNAPSHOTS_CREATE => Some(Self::SnapshotsCreate),
VERSION => Some(Self::Version),
KEYS_CREATE => Some(Self::KeysAdd),
KEYS_GET => Some(Self::KeysGet),

@@ -353,6 +360,7 @@ pub mod actions {
pub const METRICS_GET: u8 = MetricsGet.repr();
pub const DUMPS_ALL: u8 = DumpsAll.repr();
pub const DUMPS_CREATE: u8 = DumpsCreate.repr();
pub const SNAPSHOTS_CREATE: u8 = SnapshotsCreate.repr();
pub const VERSION: u8 = Version.repr();
pub const KEYS_CREATE: u8 = KeysAdd.repr();
pub const KEYS_GET: u8 = KeysGet.repr();
@@ -39,7 +39,7 @@ byte-unit = { version = "4.0.19", default-features = false, features = [
bytes = "1.4.0"
clap = { version = "4.2.1", features = ["derive", "env"] }
crossbeam-channel = "0.5.8"
deserr = "0.5.0"
deserr = { version = "0.6.0", features = ["actix-web"]}
dump = { path = "../dump" }
either = "1.8.1"
env_logger = "0.10.0"

@@ -50,9 +50,9 @@ futures = "0.3.28"
futures-util = "0.3.28"
http = "0.2.9"
index-scheduler = { path = "../index-scheduler" }
indexmap = { version = "1.9.3", features = ["serde-1"] }
indexmap = { version = "2.0.0", features = ["serde"] }
is-terminal = "0.4.8"
itertools = "0.10.5"
itertools = "0.11.0"
jsonwebtoken = "8.3.0"
lazy_static = "1.4.0"
log = "0.4.17"

@@ -87,7 +87,7 @@ sha2 = "0.10.6"
siphasher = "0.3.10"
slice-group-by = "0.3.0"
static-files = { version = "0.2.3", optional = true }
sysinfo = "0.28.4"
sysinfo = "0.29.7"
tar = "0.4.38"
tempfile = "3.5.0"
thiserror = "1.0.40"
@@ -20,7 +20,7 @@ pub struct SearchAggregator;
#[allow(dead_code)]
impl SearchAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self::default()
Self
}

pub fn succeed(&mut self, _: &dyn Any) {}

@@ -32,7 +32,7 @@ pub struct MultiSearchAggregator;
#[allow(dead_code)]
impl MultiSearchAggregator {
pub fn from_queries(_: &dyn Any, _: &dyn Any) -> Self {
Self::default()
Self
}

pub fn succeed(&mut self) {}

@@ -44,7 +44,7 @@ pub struct FacetSearchAggregator;
#[allow(dead_code)]
impl FacetSearchAggregator {
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
Self::default()
Self
}

pub fn succeed(&mut self, _: &dyn Any) {}
@@ -1,6 +1,5 @@
mod mock_analytics;
// if we are in release mode and the feature analytics was enabled
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
mod segment_analytics;

use std::fs;

@@ -17,26 +16,25 @@ use serde_json::Value;
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;

// if we are in debug mode OR the analytics feature is disabled
// if the analytics feature is disabled
// the `SegmentAnalytics` point to the mock instead of the real analytics
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type SegmentAnalytics = mock_analytics::MockAnalytics;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type SearchAggregator = mock_analytics::SearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
#[cfg(any(debug_assertions, not(feature = "analytics")))]
#[cfg(not(feature = "analytics"))]
pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;

// if we are in release mode and the feature analytics was enabled
// we use the real analytics
#[cfg(all(not(debug_assertions), feature = "analytics"))]
// if the feature analytics is enabled we use the real analytics
#[cfg(feature = "analytics")]
pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type SearchAggregator = segment_analytics::SearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;

/// The Meilisearch config dir:
File diff suppressed because it is too large
@@ -28,7 +28,7 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH";
const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR";
const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
const MEILI_ENV: &str = "MEILI_ENV";
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";

@@ -159,7 +159,7 @@ pub struct Opt {
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
/// at any time.
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
#[serde(default)] // we can't send true
#[clap(long, env = MEILI_NO_ANALYTICS)]
pub no_analytics: bool,

@@ -390,7 +390,7 @@ impl Opt {
ignore_missing_dump: _,
ignore_dump_if_db_exists: _,
config_file_path: _,
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
no_analytics,
experimental_enable_metrics: enable_metrics_route,
experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,

@@ -401,7 +401,7 @@ impl Opt {
export_to_env_if_not_present(MEILI_MASTER_KEY, master_key);
}
export_to_env_if_not_present(MEILI_ENV, env);
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[cfg(feature = "analytics")]
{
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
}
@@ -24,6 +24,7 @@ pub mod features;
pub mod indexes;
mod metrics;
mod multi_search;
mod snapshot;
mod swap_indexes;
pub mod tasks;

@@ -32,6 +33,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
.service(web::resource("/health").route(web::get().to(get_health)))
.service(web::scope("/keys").configure(api_key::configure))
.service(web::scope("/dumps").configure(dump::configure))
.service(web::scope("/snapshots").configure(snapshot::configure))
.service(web::resource("/stats").route(web::get().to(get_stats)))
.service(web::resource("/version").route(web::get().to(get_version)))
.service(web::scope("/indexes").configure(indexes::configure))
32 meilisearch/src/routes/snapshot.rs (Normal file)

@@ -0,0 +1,32 @@
use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use index_scheduler::IndexScheduler;
use log::debug;
use meilisearch_types::error::ResponseError;
use meilisearch_types::tasks::KindWithContent;
use serde_json::json;

use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::SummarizedTaskView;

pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
}

pub async fn create_snapshot(
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));

let task = KindWithContent::SnapshotCreation;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();

debug!("returns: {:?}", task);
Ok(HttpResponse::Accepted().json(task))
}
@@ -60,8 +60,7 @@ pub async fn swap_indexes(
}

let task = KindWithContent::IndexSwap { swaps };

let task = index_scheduler.register(task)?;
let task: SummarizedTaskView = task.into();
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
Ok(HttpResponse::Accepted().json(task))
}
@@ -680,6 +680,7 @@ fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option
.map_err(InternalError::SerdeJson)?;
Ok(vectors
.into_iter()
.flatten()
.map(|v| OrderedFloat(dot_product_similarity(query, &v)))
.max()
.map(OrderedFloat::into_inner))
@@ -1,8 +1,7 @@
|
||||
use std::{thread, time};
|
||||
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn add_valid_api_key() {
|
||||
@@ -162,7 +161,7 @@ async fn add_valid_api_key_null_description() {
|
||||
server.use_api_key("MASTER_KEY");
|
||||
|
||||
let content = json!({
|
||||
"description": Value::Null,
|
||||
"description": json!(null),
|
||||
"indexes": ["products"],
|
||||
"actions": ["documents.add"],
|
||||
"expiresAt": "2050-11-13T00:00:00"
|
||||
@@ -365,7 +364,7 @@ async fn error_add_api_key_invalid_index_uids() {
|
||||
server.use_api_key("MASTER_KEY");
|
||||
|
||||
let content = json!({
|
||||
"description": Value::Null,
|
||||
"description": json!(null),
|
||||
"indexes": ["invalid index # / \\name with spaces"],
|
||||
"actions": [
|
||||
"documents.add"
|
||||
@@ -422,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() {
|
||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###"
|
||||
{
|
||||
"message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
|
||||
"message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
|
||||
"code": "invalid_api_key_actions",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"
|
||||
@@ -507,7 +506,7 @@ async fn error_add_api_key_invalid_parameters_uid() {
|
||||
async fn error_add_api_key_parameters_uid_already_exist() {
|
||||
let mut server = Server::new_auth().await;
|
||||
server.use_api_key("MASTER_KEY");
|
||||
let content = json!({
|
||||
let content: Value = json!({
|
||||
"uid": "4bc0887a-0e41-4f3b-935d-0c451dcee9c8",
|
||||
"indexes": ["products"],
|
||||
"actions": ["search"],
|
||||
@@ -1146,7 +1145,7 @@ async fn patch_api_key_description() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
|
||||
// Remove the description
|
||||
let content = json!({ "description": serde_json::Value::Null });
|
||||
let content = json!({ "description": null });
|
||||
|
||||
let (response, code) = server.patch_api_key(&uid, content).await;
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]", ".uid" => "[ignored]", ".key" => "[ignored]" }), @r###"
|
||||
|
||||
@@ -3,10 +3,10 @@ use std::collections::{HashMap, HashSet};
use ::time::format_description::well_known::Rfc3339;
use maplit::{hashmap, hashset};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use time::{Duration, OffsetDateTime};

use crate::common::Server;
use crate::common::{Server, Value};
use crate::json;

pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
    Lazy::new(|| {
@@ -54,6 +54,7 @@ pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'
        ("GET", "/indexes/products/stats") => hashset!{"stats.get", "stats.*", "*"},
        ("GET", "/stats") => hashset!{"stats.get", "stats.*", "*"},
        ("POST", "/dumps") => hashset!{"dumps.create", "dumps.*", "*"},
        ("POST", "/snapshots") => hashset!{"snapshots.create", "snapshots.*", "*"},
        ("GET", "/version") => hashset!{"version", "*"},
        ("GET", "/metrics") => hashset!{"metrics.get", "metrics.*", "*"},
        ("PATCH", "/keys/mykey/") => hashset!{"keys.update", "*"},
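Because `snapshots.create` is now a recognized action, a key can be scoped to snapshot creation alone. The sketch below is hypothetical: it reuses the `Server::new_auth` and `use_api_key` helpers visible in these tests, but the `add_api_key` helper and the 201 status are assumptions not shown in this excerpt.

#[actix_rt::test]
async fn create_snapshot_only_key() {
    let mut server = Server::new_auth().await;
    server.use_api_key("MASTER_KEY");

    // Grant only the new snapshot action to this key.
    let content = json!({
        "description": "snapshot-only key",
        "indexes": ["*"],
        "actions": ["snapshots.create"],
        "expiresAt": null
    });
    let (response, code) = server.add_api_key(content).await;
    assert_eq!(code, 201, "{}", response);
}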
@@ -1,8 +1,8 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn create_api_key_bad_description() {
|
||||
@@ -90,7 +90,7 @@ async fn create_api_key_bad_actions() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
|
||||
"message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`",
|
||||
"code": "invalid_api_key_actions",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_api_key_actions"
|
||||
|
||||
@@ -7,9 +7,9 @@ mod tenant_token;
|
||||
mod tenant_token_multi_search;
|
||||
|
||||
use actix_web::http::StatusCode;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
impl Server {
|
||||
pub fn use_api_key(&mut self, api_key: impl AsRef<str>) {
|
||||
|
||||
@@ -3,11 +3,11 @@ use std::collections::HashMap;
|
||||
use ::time::format_description::well_known::Rfc3339;
|
||||
use maplit::hashmap;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
use super::authorization::{ALL_ACTIONS, AUTHORIZATIONS};
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
fn generate_tenant_token(
|
||||
parent_uid: impl AsRef<str>,
|
||||
@@ -233,31 +233,31 @@ async fn search_authorized_simple_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sales"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sa*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
];
|
||||
|
||||
@@ -386,7 +386,7 @@ async fn error_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -398,7 +398,7 @@ async fn error_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -428,15 +428,15 @@ async fn error_search_forbidden_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"products": null}),
|
||||
"exp" => json!(null)
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["products"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null)
|
||||
},
|
||||
// expired token
|
||||
hashmap! {
|
||||
@@ -444,7 +444,7 @@ async fn error_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -456,7 +456,7 @@ async fn error_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
|
||||
@@ -3,11 +3,11 @@ use std::collections::HashMap;
|
||||
use ::time::format_description::well_known::Rfc3339;
|
||||
use maplit::hashmap;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
use super::authorization::ALL_ACTIONS;
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
fn generate_tenant_token(
|
||||
parent_uid: impl AsRef<str>,
|
||||
@@ -512,31 +512,31 @@ async fn single_search_authorized_simple_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sales"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sa*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
];
|
||||
|
||||
@@ -564,31 +564,31 @@ async fn multi_search_authorized_simple_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": {}, "products": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null, "products": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"sales": null, "products": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sales", "products"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sa*", "pro*"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
];
|
||||
|
||||
@@ -823,7 +823,7 @@ async fn error_single_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -835,7 +835,7 @@ async fn error_single_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -864,7 +864,7 @@ async fn error_multi_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -876,7 +876,7 @@ async fn error_multi_search_token_forbidden_parent_key() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null, "products": Value::Null}),
|
||||
"searchRules" => json!({"sales": null, "products": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() + Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -919,15 +919,15 @@ async fn error_single_search_forbidden_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"products": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["products"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
// expired token
|
||||
hashmap! {
|
||||
@@ -935,7 +935,7 @@ async fn error_single_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -947,7 +947,7 @@ async fn error_single_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -978,15 +978,15 @@ async fn error_multi_search_forbidden_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"products": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"products": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["products"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": {}}),
|
||||
@@ -998,15 +998,15 @@ async fn error_multi_search_forbidden_token() {
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": {}}),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null}),
|
||||
"exp" => Value::Null
|
||||
"searchRules" => json!({"sales": null}),
|
||||
"exp" => json!(null),
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!(["sales"]),
|
||||
"exp" => Value::Null
|
||||
"exp" => json!(null),
|
||||
},
|
||||
// expired token
|
||||
hashmap! {
|
||||
@@ -1014,7 +1014,7 @@ async fn error_multi_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"*": Value::Null}),
|
||||
"searchRules" => json!({"*": null}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
@@ -1026,7 +1026,7 @@ async fn error_multi_search_forbidden_token() {
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
"searchRules" => json!({"sales": Value::Null, "products": {}}),
|
||||
"searchRules" => json!({"sales": null, "products": {}}),
|
||||
"exp" => json!((OffsetDateTime::now_utc() - Duration::hours(1)).unix_timestamp())
|
||||
},
|
||||
hashmap! {
|
||||
|
||||
@@ -3,12 +3,13 @@ use std::panic::{catch_unwind, resume_unwind, UnwindSafe};
|
||||
use std::time::Duration;
|
||||
|
||||
use actix_web::http::StatusCode;
|
||||
use serde_json::{json, Value};
|
||||
use tokio::time::sleep;
|
||||
use urlencoding::encode as urlencode;
|
||||
|
||||
use super::encoder::Encoder;
|
||||
use super::service::Service;
|
||||
use super::Value;
|
||||
use crate::json;
|
||||
|
||||
pub struct Index<'a> {
|
||||
pub uid: String,
|
||||
@@ -242,7 +243,9 @@ impl Index<'_> {
|
||||
|
||||
pub async fn delete_batch(&self, ids: Vec<u64>) -> (Value, StatusCode) {
|
||||
let url = format!("/indexes/{}/documents/delete-batch", urlencode(self.uid.as_ref()));
|
||||
self.service.post_encoded(url, serde_json::to_value(&ids).unwrap(), self.encoder).await
|
||||
self.service
|
||||
.post_encoded(url, serde_json::to_value(&ids).unwrap().into(), self.encoder)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn delete_batch_raw(&self, body: Value) -> (Value, StatusCode) {
|
||||
|
||||
@@ -3,9 +3,83 @@ pub mod index;
pub mod server;
pub mod service;

use std::fmt::{self, Display};

pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
use meili_snap::json_string;
use serde::{Deserialize, Serialize};
pub use server::{default_settings, Server};

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct Value(pub serde_json::Value);

impl Value {
    pub fn uid(&self) -> u64 {
        if let Some(uid) = self["uid"].as_u64() {
            uid
        } else if let Some(uid) = self["taskUid"].as_u64() {
            uid
        } else {
            panic!("Didn't find any task id in: {self}");
        }
    }
}

impl From<serde_json::Value> for Value {
    fn from(value: serde_json::Value) -> Self {
        Value(value)
    }
}

impl std::ops::Deref for Value {
    type Target = serde_json::Value;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl PartialEq<serde_json::Value> for Value {
    fn eq(&self, other: &serde_json::Value) -> bool {
        &self.0 == other
    }
}

impl PartialEq<Value> for serde_json::Value {
    fn eq(&self, other: &Value) -> bool {
        self == &other.0
    }
}

impl PartialEq<&str> for Value {
    fn eq(&self, other: &&str) -> bool {
        self.0.eq(other)
    }
}

impl Display for Value {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{}",
            json_string!(self, { ".enqueuedAt" => "[date]", ".processedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
        )
    }
}

impl From<Vec<Value>> for Value {
    fn from(value: Vec<Value>) -> Self {
        Self(value.into_iter().map(|value| value.0).collect::<serde_json::Value>())
    }
}

#[macro_export]
macro_rules! json {
    ($($json:tt)+) => {
        $crate::common::Value(serde_json::json!($($json)+))
    };
}

/// Performs a search test on both post and get routes
#[macro_export]
macro_rules! test_post_get_search {
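To make the intent of the wrapper above concrete, here is a small illustrative usage written against the `Value`, `Deref`, `PartialEq`, and `json!` items it defines; the literal task payload is made up.

use crate::common::Value;
use crate::json;

// Illustrative only.
fn value_wrapper_demo() {
    // The test-local json! macro builds a common::Value instead of a serde_json::Value.
    let task: Value = json!({ "taskUid": 3, "status": "enqueued" });

    // Deref exposes serde_json indexing and accessors directly.
    assert_eq!(task["status"], "enqueued");

    // uid() falls back from "uid" to "taskUid".
    assert_eq!(task.uid(), 3);

    // PartialEq works in both directions against serde_json::Value.
    assert_eq!(task, serde_json::json!({ "taskUid": 3, "status": "enqueued" }));
}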
@@ -11,13 +11,14 @@ use clap::Parser;
use meilisearch::option::{IndexerOpts, MaxMemory, Opt};
use meilisearch::{analytics, create_app, setup_meilisearch};
use once_cell::sync::Lazy;
use serde_json::{json, Value};
use tempfile::TempDir;
use tokio::time::sleep;

use super::index::Index;
use super::service::Service;
use crate::common::encoder::Encoder;
use crate::common::Value;
use crate::json;

pub struct Server {
    pub service: Service,
@@ -156,6 +157,10 @@ impl Server {
        self.service.post("/dumps", json!(null)).await
    }

    pub async fn create_snapshot(&self) -> (Value, StatusCode) {
        self.service.post("/snapshots", json!(null)).await
    }

    pub async fn index_swap(&self, value: Value) -> (Value, StatusCode) {
        self.service.post("/swap-indexes", value).await
    }
@@ -204,7 +209,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
        db_path: dir.as_ref().join("db"),
        dump_dir: dir.as_ref().join("dumps"),
        env: "development".to_owned(),
        #[cfg(all(not(debug_assertions), feature = "analytics"))]
        #[cfg(feature = "analytics")]
        no_analytics: true,
        max_index_size: Byte::from_unit(100.0, ByteUnit::MiB).unwrap(),
        max_task_db_size: Byte::from_unit(1.0, ByteUnit::GiB).unwrap(),
@@ -7,9 +7,9 @@ use actix_web::test::TestRequest;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch::{analytics, create_app, Opt};
|
||||
use meilisearch_auth::AuthController;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::Value;
|
||||
|
||||
pub struct Service {
|
||||
pub index_scheduler: Arc<IndexScheduler>,
|
||||
|
||||
@@ -3,9 +3,8 @@
|
||||
mod common;
|
||||
|
||||
use actix_web::test;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
|
||||
enum HttpVerb {
|
||||
Put,
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use actix_web::test;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::{json, Value};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::common::{GetAllDocumentsOptions, Server, Value};
|
||||
use crate::json;
|
||||
|
||||
/// This is the basic usage of our API and every other tests uses the content-type application/json
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn delete_one_document_unexisting_index() {
|
||||
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
|
||||
)
|
||||
.await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
let (stats, _) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 4,
|
||||
"isIndexing": false,
|
||||
"fieldDistribution": {
|
||||
"color": 3,
|
||||
"id": 4
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) =
|
||||
index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
@@ -188,6 +201,18 @@ async fn delete_document_by_filter() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (stats, _) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"fieldDistribution": {
|
||||
"color": 1,
|
||||
"id": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(documents), @r###"
|
||||
@@ -241,6 +266,18 @@ async fn delete_document_by_filter() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (stats, _) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"fieldDistribution": {
|
||||
"color": 1,
|
||||
"id": 1
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(documents), @r###"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
use urlencoding::encode;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn get_all_documents_bad_offset() {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use actix_web::test;
|
||||
use http::header::ACCEPT_ENCODING;
|
||||
use meili_snap::*;
|
||||
use serde_json::{json, Value};
|
||||
use urlencoding::encode as urlencode;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server};
|
||||
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value};
|
||||
use crate::json;
|
||||
|
||||
// TODO: partial test since we are testing error, amd error is not yet fully implemented in
|
||||
// transplant
|
||||
@@ -40,7 +40,7 @@ async fn get_document() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
index.create(None).await;
|
||||
let documents = serde_json::json!([
|
||||
let documents = json!([
|
||||
{
|
||||
"id": 0,
|
||||
"nested": { "content": "foobar" },
|
||||
@@ -53,7 +53,7 @@ async fn get_document() {
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(
|
||||
response,
|
||||
serde_json::json!({
|
||||
json!({
|
||||
"id": 0,
|
||||
"nested": { "content": "foobar" },
|
||||
})
|
||||
@@ -64,7 +64,7 @@ async fn get_document() {
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(
|
||||
response,
|
||||
serde_json::json!({
|
||||
json!({
|
||||
"id": 0,
|
||||
})
|
||||
);
|
||||
@@ -75,7 +75,7 @@ async fn get_document() {
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(
|
||||
response,
|
||||
serde_json::json!({
|
||||
json!({
|
||||
"nested": { "content": "foobar" },
|
||||
})
|
||||
);
|
||||
@@ -122,7 +122,7 @@ async fn get_all_documents_no_options() {
|
||||
assert_eq!(code, 200);
|
||||
let arr = response["results"].as_array().unwrap();
|
||||
assert_eq!(arr.len(), 20);
|
||||
let first = serde_json::json!({
|
||||
let first = json!({
|
||||
"id":0,
|
||||
"isActive":false,
|
||||
"balance":"$2,668.55",
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use serde_json::json;
|
||||
use meili_snap::snapshot;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn error_document_update_create_index_bad_uid() {
|
||||
@@ -84,7 +85,13 @@ async fn update_document() {
|
||||
|
||||
let (response, code) = index.get_document(1, None).await;
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(response.to_string(), r##"{"doc_id":1,"content":"foo","other":"bar"}"##);
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"doc_id": 1,
|
||||
"content": "foo",
|
||||
"other": "bar"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
@@ -122,7 +129,13 @@ async fn update_document_gzip_encoded() {
|
||||
|
||||
let (response, code) = index.get_document(1, None).await;
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(response.to_string(), r##"{"doc_id":1,"content":"foo","other":"bar"}"##);
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"doc_id": 1,
|
||||
"content": "foo",
|
||||
"other": "bar"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -2,10 +2,10 @@ mod data;
|
||||
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
use serde_json::json;
|
||||
|
||||
use self::data::GetDump;
|
||||
use crate::common::{default_settings, GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
|
||||
// all the following test are ignored on windows. See #2364
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
/// Feature name to test against.
|
||||
/// This will have to be changed by a different one when that feature is stabilized.
|
||||
|
||||
@@ -2,10 +2,10 @@ use actix_web::http::header::ContentType;
|
||||
use actix_web::test;
|
||||
use http::header::ACCEPT_ENCODING;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn create_index_no_primary_key() {
|
||||
@@ -21,7 +21,7 @@ async fn create_index_no_primary_key() {
|
||||
|
||||
assert_eq!(response["status"], "succeeded");
|
||||
assert_eq!(response["type"], "indexCreation");
|
||||
assert_eq!(response["details"]["primaryKey"], Value::Null);
|
||||
assert_eq!(response["details"]["primaryKey"], json!(null));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
@@ -38,7 +38,7 @@ async fn create_index_with_gzip_encoded_request() {
|
||||
|
||||
assert_eq!(response["status"], "succeeded");
|
||||
assert_eq!(response["type"], "indexCreation");
|
||||
assert_eq!(response["details"]["primaryKey"], Value::Null);
|
||||
assert_eq!(response["details"]["primaryKey"], json!(null));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
@@ -86,7 +86,7 @@ async fn create_index_with_zlib_encoded_request() {
|
||||
|
||||
assert_eq!(response["status"], "succeeded");
|
||||
assert_eq!(response["type"], "indexCreation");
|
||||
assert_eq!(response["details"]["primaryKey"], Value::Null);
|
||||
assert_eq!(response["details"]["primaryKey"], json!(null));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
@@ -103,7 +103,7 @@ async fn create_index_with_brotli_encoded_request() {
|
||||
|
||||
assert_eq!(response["status"], "succeeded");
|
||||
assert_eq!(response["type"], "indexCreation");
|
||||
assert_eq!(response["details"]["primaryKey"], Value::Null);
|
||||
assert_eq!(response["details"]["primaryKey"], json!(null));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
@@ -136,7 +136,7 @@ async fn create_index_with_invalid_primary_key() {
|
||||
|
||||
let (response, code) = index.get().await;
|
||||
assert_eq!(code, 200);
|
||||
assert_eq!(response["primaryKey"], Value::Null);
|
||||
assert_eq!(response["primaryKey"], json!(null));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn create_and_delete_index() {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn get_indexes_bad_offset() {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn stats() {
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn update_primary_key() {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
|
||||
use super::DOCUMENTS;
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn search_unexisting_index() {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use meili_snap::snapshot;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use insta::{allow_duplicates, assert_json_snapshot};
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn formatted_contain_wildcard() {
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
@@ -60,3 +61,59 @@ async fn geo_sort_with_geo_strings() {
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn geo_bounding_box_with_string_and_number() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.update_settings_filterable_attributes(json!(["_geo"])).await;
|
||||
index.update_settings_sortable_attributes(json!(["_geo"])).await;
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({
|
||||
"filter": "_geoBoundingBox([89, 179], [-89, -179])",
|
||||
}),
|
||||
|response, code| {
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"id": 1,
|
||||
"name": "Taco Truck",
|
||||
"address": "444 Salsa Street, Burritoville",
|
||||
"type": "Mexican",
|
||||
"rating": 9,
|
||||
"_geo": {
|
||||
"lat": 34.0522,
|
||||
"lng": -118.2437
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "La Bella Italia",
|
||||
"address": "456 Elm Street, Townsville",
|
||||
"type": "Italian",
|
||||
"rating": 9,
|
||||
"_geo": {
|
||||
"lat": "45.4777599",
|
||||
"lng": "9.1967508"
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "",
|
||||
"processingTimeMs": "[time]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 2
|
||||
}
|
||||
"###);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ mod pagination;
|
||||
mod restrict_searchable;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
@@ -1104,3 +1104,59 @@ async fn camelcased_words() {
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search_with_strange_synonyms() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
|
||||
let r = index.wait_task(0).await;
|
||||
meili_snap::snapshot!(r["status"], @r###""succeeded""###);
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
index
|
||||
.search(json!({"q": "How to train"}), |response, code| {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
index
|
||||
.search(json!({"q": "How & train"}), |response, code| {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
index
|
||||
.search(json!({"q": "to"}), |response, code| {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::json;
|
||||
|
||||
use super::{DOCUMENTS, NESTED_DOCUMENTS};
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn search_empty_list() {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
use crate::search::DOCUMENTS;
|
||||
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::index::Index;
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
|
||||
let index = server.index("test");
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn set_and_reset_distinct_attribute() {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn settings_bad_displayed_attributes() {
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|| {
|
||||
let mut map = HashMap::new();
|
||||
map.insert("displayed_attributes", json!(["*"]));
|
||||
map.insert("searchable_attributes", json!(["*"]));
|
||||
map.insert("filterable_attributes", json!([]));
|
||||
map.insert("distinct_attribute", json!(Value::Null));
|
||||
map.insert("distinct_attribute", json!(null));
|
||||
map.insert(
|
||||
"ranking_rules",
|
||||
json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]),
|
||||
@@ -229,7 +229,7 @@ macro_rules! test_setting_routes {
|
||||
.chars()
|
||||
.map(|c| if c == '_' { '-' } else { c })
|
||||
.collect::<String>());
|
||||
let (response, code) = server.service.$write_method(url, serde_json::Value::Null).await;
|
||||
let (response, code) = server.service.$write_method(url, serde_json::Value::Null.into()).await;
|
||||
assert_eq!(code, 202, "{}", response);
|
||||
server.index("").wait_task(0).await;
|
||||
let (response, code) = server.index("test").get().await;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn set_and_reset() {
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use actix_rt::time::sleep;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::option::ScheduleSnapshot;
|
||||
use meilisearch::Opt;
|
||||
|
||||
use crate::common::server::default_settings;
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
|
||||
macro_rules! verify_snapshot {
|
||||
(
|
||||
@@ -44,7 +46,7 @@ async fn perform_snapshot() {
|
||||
|
||||
let index = server.index("test");
|
||||
index
|
||||
.update_settings(serde_json::json! ({
|
||||
.update_settings(json! ({
|
||||
"searchableAttributes": [],
|
||||
}))
|
||||
.await;
|
||||
@@ -90,3 +92,95 @@ async fn perform_snapshot() {
|
||||
server.index("test1").settings(),
|
||||
);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn perform_on_demand_snapshot() {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
let snapshot_dir = tempfile::tempdir().unwrap();
|
||||
|
||||
let options =
|
||||
Opt { snapshot_dir: snapshot_dir.path().to_owned(), ..default_settings(temp.path()) };
|
||||
|
||||
let server = Server::new_with_options(options).await.unwrap();
|
||||
|
||||
let index = server.index("catto");
|
||||
index
|
||||
.update_settings(json! ({
|
||||
"searchableAttributes": [],
|
||||
}))
|
||||
.await;
|
||||
|
||||
index.load_test_set().await;
|
||||
|
||||
server.index("doggo").create(Some("bone")).await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
server.index("doggo").create(Some("bone")).await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
let (task, code) = server.create_snapshot().await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
snapshot!(json_string!(task, { ".enqueuedAt" => "[date]" }), @r###"
|
||||
{
|
||||
"taskUid": 4,
|
||||
"indexUid": null,
|
||||
"status": "enqueued",
|
||||
"type": "snapshotCreation",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
let task = index.wait_task(task.uid()).await;
|
||||
snapshot!(json_string!(task, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###"
|
||||
{
|
||||
"uid": 4,
|
||||
"indexUid": null,
|
||||
"status": "succeeded",
|
||||
"type": "snapshotCreation",
|
||||
"canceledBy": null,
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
|
||||
let snapshots: Vec<String> = std::fs::read_dir(&snapshot_dir)
|
||||
.unwrap()
|
||||
.map(|entry| entry.unwrap().path().file_name().unwrap().to_str().unwrap().to_string())
|
||||
.collect();
|
||||
meili_snap::snapshot!(format!("{snapshots:?}"), @r###"["db.snapshot"]"###);
|
||||
|
||||
let snapshot_path = snapshot_dir.path().to_owned().join("db.snapshot");
|
||||
#[cfg_attr(windows, allow(unused))]
|
||||
let snapshot_meta = std::fs::metadata(&snapshot_path).unwrap();
|
||||
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
let mode = snapshot_meta.permissions().mode();
|
||||
// rwxrwxrwx
|
||||
meili_snap::snapshot!(format!("{:b}", mode), @"1000000100100100");
|
||||
}
|
||||
|
||||
let options = Opt { import_snapshot: Some(snapshot_path), ..default_settings(temp.path()) };
|
||||
|
||||
let snapshot_server = Server::new_with_options(options).await.unwrap();
|
||||
|
||||
verify_snapshot!(server, snapshot_server, |server| =>
|
||||
server.list_indexes(None, None),
|
||||
// for some reason the db sizes differ. this may be due to the compaction options we have
|
||||
// set when performing the snapshot
|
||||
//server.stats(),
|
||||
|
||||
// The original instance contains the snapshotCreation task, while the snapshotted-instance does not. For this reason we need to compare the task queue **after** the task 4
|
||||
server.tasks_filter("?from=2"),
|
||||
|
||||
server.index("catto").get_all_documents(GetAllDocumentsOptions::default()),
|
||||
server.index("catto").settings(),
|
||||
server.index("doggo").get_all_documents(GetAllDocumentsOptions::default()),
|
||||
server.index("doggo").settings(),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn get_settings_unexisting_index() {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use meili_snap::*;
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn swap_indexes_bad_format() {
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
mod errors;
|
||||
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use serde_json::json;
|
||||
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn swap_indexes() {
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
mod errors;
|
||||
|
||||
use meili_snap::insta::assert_json_snapshot;
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn error_get_unexisting_task_status() {
|
||||
@@ -33,7 +33,7 @@ async fn get_task_status() {
|
||||
index.create(None).await;
|
||||
index
|
||||
.add_documents(
|
||||
serde_json::json!([{
|
||||
json!([{
|
||||
"id": 1,
|
||||
"content": "foobar",
|
||||
}]),
|
||||
|
||||
@@ -17,10 +17,10 @@ bincode = "1.3.3"
|
||||
bstr = "1.4.0"
|
||||
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.4.3"
|
||||
charabia = { version = "0.8.2", default-features = false }
|
||||
charabia = { version = "0.8.3", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.8"
|
||||
deserr = "0.5.0"
|
||||
deserr = { version = "0.6.0", features = ["actix-web"]}
|
||||
either = { version = "1.8.1", features = ["serde"] }
|
||||
flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
@@ -32,18 +32,18 @@ grenad = { version = "0.4.4", default-features = false, features = [
|
||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
|
||||
"lmdb", "read-txn-no-tls"
|
||||
] }
|
||||
indexmap = { version = "1.9.3", features = ["serde"] }
|
||||
indexmap = { version = "2.0.0", features = ["serde"] }
|
||||
instant-distance = { version = "0.6.1", features = ["with-serde"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
memmap2 = "0.5.10"
|
||||
memmap2 = "0.7.1"
|
||||
obkv = "0.2.0"
|
||||
once_cell = "1.17.1"
|
||||
ordered-float = "3.6.0"
|
||||
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
||||
rayon = "1.7.0"
|
||||
roaring = "0.10.1"
|
||||
rstar = { version = "0.10.0", features = ["serde"] }
|
||||
rstar = { version = "0.11.0", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
slice-group-by = "0.3.0"
|
||||
@@ -63,7 +63,7 @@ uuid = { version = "1.3.1", features = ["v4"] }
|
||||
filter-parser = { path = "../filter-parser" }
|
||||
|
||||
# documents words self-join
|
||||
itertools = "0.10.5"
|
||||
itertools = "0.11.0"
|
||||
|
||||
# profiling
|
||||
puffin = "0.16.0"
|
||||
@@ -79,6 +79,7 @@ big_s = "1.0.2"
|
||||
insta = "1.29.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
|
||||
@@ -122,22 +122,28 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
.field,
|
||||
match .valid_fields.is_empty() {
|
||||
true => "This index does not have configured sortable attributes.".to_string(),
|
||||
false => format!("Available sortable attributes are: `{}`.",
|
||||
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
|
||||
false => format!("Available sortable attributes are: `{}{}`.",
|
||||
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
|
||||
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
|
||||
),
|
||||
}
|
||||
)]
|
||||
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
|
||||
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
|
||||
#[error("Attribute `{}` is not facet-searchable. {}",
|
||||
.field,
|
||||
match .valid_fields.is_empty() {
|
||||
true => "This index does not have configured facet-searchable attributes. To make it facet-searchable add it to the `filterableAttributes` index settings.".to_string(),
|
||||
false => format!("Available facet-searchable attributes are: `{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.",
|
||||
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
|
||||
false => format!("Available facet-searchable attributes are: `{}{}`. To make it facet-searchable add it to the `filterableAttributes` index settings.",
|
||||
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
|
||||
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
|
||||
),
|
||||
}
|
||||
)]
|
||||
InvalidFacetSearchFacetName { field: String, valid_fields: BTreeSet<String> },
|
||||
InvalidFacetSearchFacetName {
|
||||
field: String,
|
||||
valid_fields: BTreeSet<String>,
|
||||
hidden_fields: bool,
|
||||
},
|
||||
#[error("Attribute `{}` is not searchable. Available searchable attributes are: `{}{}`.",
|
||||
.field,
|
||||
.valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
|
||||
@@ -340,8 +346,11 @@ fn conditionally_lookup_for_error_message() {
|
||||
];
|
||||
|
||||
for (list, suffix) in messages {
|
||||
let err =
|
||||
UserError::InvalidSortableAttribute { field: "name".to_string(), valid_fields: list };
|
||||
let err = UserError::InvalidSortableAttribute {
|
||||
field: "name".to_string(),
|
||||
valid_fields: list,
|
||||
hidden_fields: false,
|
||||
};
|
||||
|
||||
assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;

use crate::heed_codec::BytesDecodeOwned;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};

/// This is the limit where using a byteorder became less size efficient
/// than using a direct roaring encoding, it is also the point where we are able
@@ -60,12 +61,16 @@ impl CboRoaringBitmapCodec {
    /// if the merged values length is under the threshold, values are directly
    /// serialized in the buffer else a RoaringBitmap is created from the
    /// values and is serialized in the buffer.
    pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
    pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
    where
        I: IntoIterator<Item = A>,
        A: AsRef<[u8]>,
    {
        let mut roaring = RoaringBitmap::new();
        let mut vec = Vec::new();

        for bytes in slices {
            if bytes.len() <= THRESHOLD * size_of::<u32>() {
            if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
                let mut reader = bytes.as_ref();
                while let Ok(integer) = reader.read_u32::<NativeEndian>() {
                    vec.push(integer);
@@ -85,7 +90,7 @@ impl CboRoaringBitmapCodec {
            }
        } else {
            // We can unwrap safely because the vector is sorted upper.
            let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
            let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
            roaring.serialize_into(buffer)?;
        }
    } else {
@@ -95,6 +100,28 @@ impl CboRoaringBitmapCodec {

        Ok(())
    }

    /// Merges a DelAdd delta into a CboRoaringBitmap.
    pub fn merge_deladd_into(
        deladd: KvReaderDelAdd<'_>,
        previous: &[u8],
        buffer: &mut Vec<u8>,
    ) -> io::Result<()> {
        // Deserialize the bitmap that is already there
        let mut previous = Self::deserialize_from(previous)?;

        // Remove integers we no more want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Deletion) {
            previous -= Self::deserialize_from(value)?;
        }

        // Insert the new integers we want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Addition) {
            previous |= Self::deserialize_from(value)?;
        }

        previous.serialize_into(buffer)
    }
}

impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
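The del/add merge above reduces to set subtraction followed by union on RoaringBitmaps. Here is a standalone sketch of that semantics with the KvReaderDelAdd and (de)serialization plumbing stripped away; `apply_deladd` and the example ids are made up for illustration.

use roaring::RoaringBitmap;

// Drop the deleted ids first, then insert the added ones.
fn apply_deladd(previous: &mut RoaringBitmap, deleted: &RoaringBitmap, added: &RoaringBitmap) {
    *previous -= deleted;
    *previous |= added;
}

fn main() {
    let mut docids: RoaringBitmap = (0u32..5).collect();
    let deleted: RoaringBitmap = [1u32, 3].into_iter().collect();
    let added: RoaringBitmap = [10u32].into_iter().collect();
    apply_deladd(&mut docids, &deleted, &added);
    assert_eq!(docids.iter().collect::<Vec<_>>(), vec![0, 2, 4, 10]);
}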
@@ -1,7 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::mem::size_of;
|
||||
use std::path::Path;
|
||||
|
||||
use charabia::{Language, Script};
|
||||
@@ -14,7 +13,6 @@ use time::OffsetDateTime;
|
||||
|
||||
use crate::distance::NDotProductPoint;
|
||||
use crate::error::{InternalError, UserError};
|
||||
use crate::facet::FacetType;
|
||||
use crate::fields_ids_map::FieldsIdsMap;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||
@@ -55,7 +53,6 @@ pub mod main_key {
|
||||
/// e.g. vector-hnsw0x0032.
|
||||
pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
|
||||
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
|
||||
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
|
||||
pub const PRIMARY_KEY_KEY: &str = "primary-key";
|
||||
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
|
||||
pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
|
||||
@@ -64,7 +61,6 @@ pub mod main_key {
|
||||
pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
|
||||
pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
|
||||
pub const DICTIONARY_KEY: &str = "dictionary";
|
||||
pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
|
||||
pub const SYNONYMS_KEY: &str = "synonyms";
|
||||
pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
|
||||
pub const WORDS_FST_KEY: &str = "words-fst";
|
||||
@@ -119,16 +115,16 @@ pub struct Index {
|
||||
pub(crate) main: PolyDatabase,
|
||||
|
||||
/// A word and all the documents ids containing the word.
|
||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
|
||||
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A prefix of word and all the documents ids containing this prefix.
|
||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
||||
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
@@ -655,6 +651,26 @@ impl Index {
        }
    }

    /* remove hidden fields */
    pub fn remove_hidden_fields(
        &self,
        rtxn: &RoTxn,
        fields: impl IntoIterator<Item = impl AsRef<str>>,
    ) -> Result<(BTreeSet<String>, bool)> {
        let mut valid_fields =
            fields.into_iter().map(|f| f.as_ref().to_string()).collect::<BTreeSet<String>>();

        let fields_len = valid_fields.len();

        if let Some(dn) = self.displayed_fields(rtxn)? {
            let displayable_names = dn.iter().map(|s| s.to_string()).collect();
            valid_fields = &valid_fields & &displayable_names;
        }

        let hidden_fields = fields_len > valid_fields.len();
        Ok((valid_fields, hidden_fields))
    }

    /* searchable fields */

    /// Write the user defined searchable fields and generate the real searchable fields from the specified fields ids map.
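
The intersection in `remove_hidden_fields` relies on `BTreeSet`'s `BitAnd` implementation. A minimal, self-contained sketch of the same logic (the helper and field names are made up for illustration, not taken from the diff):

```rust
use std::collections::BTreeSet;

// Keep only the requested fields that are also displayed and report whether
// anything was filtered out, mirroring `remove_hidden_fields` above.
fn keep_displayed(requested: &[&str], displayed: &[&str]) -> (BTreeSet<String>, bool) {
    let requested: BTreeSet<String> = requested.iter().map(|s| s.to_string()).collect();
    let displayed: BTreeSet<String> = displayed.iter().map(|s| s.to_string()).collect();
    let before = requested.len();
    // `&a & &b` computes the set intersection without consuming either set.
    let kept = &requested & &displayed;
    let hidden = before > kept.len();
    (kept, hidden)
}

fn main() {
    let (kept, hidden) = keep_displayed(&["title", "secret"], &["title", "overview"]);
    assert_eq!(kept.into_iter().collect::<Vec<_>>(), vec!["title".to_string()]);
    assert!(hidden);
}
```
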
@@ -906,44 +922,6 @@ impl Index {

    /* faceted documents ids */

    /// Writes the documents ids that are faceted under this field id for the given facet type.
    pub fn put_faceted_documents_ids(
        &self,
        wtxn: &mut RwTxn,
        field_id: FieldId,
        facet_type: FacetType,
        docids: &RoaringBitmap,
    ) -> heed::Result<()> {
        let key = match facet_type {
            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
        };
        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
        buffer[..key.len()].copy_from_slice(key.as_bytes());
        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
        self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
    }

    /// Retrieve all the documents ids that are faceted under this field id for the given facet type.
    pub fn faceted_documents_ids(
        &self,
        rtxn: &RoTxn,
        field_id: FieldId,
        facet_type: FacetType,
    ) -> heed::Result<RoaringBitmap> {
        let key = match facet_type {
            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
        };
        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
        buffer[..key.len()].copy_from_slice(key.as_bytes());
        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
        match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
            Some(docids) => Ok(docids),
            None => Ok(RoaringBitmap::new()),
        }
    }

    /// Retrieve all the documents which contain this field id set as null
    pub fn null_faceted_documents_ids(
        &self,
@@ -1820,11 +1798,11 @@ pub(crate) mod tests {
            .unwrap();
        index
            .add_documents(documents!([
                { "id": 0, "_geo": { "lat": 0, "lng": 0 } },
                { "id": 1, "_geo": { "lat": 0, "lng": -175 } },
                { "id": 2, "_geo": { "lat": 0, "lng": 175 } },
                { "id": 0, "_geo": { "lat": "0", "lng": "0" } },
                { "id": 1, "_geo": { "lat": 0, "lng": "-175" } },
                { "id": 2, "_geo": { "lat": "0", "lng": 175 } },
                { "id": 3, "_geo": { "lat": 85, "lng": 0 } },
                { "id": 4, "_geo": { "lat": -85, "lng": 0 } },
                { "id": 4, "_geo": { "lat": "-85", "lng": "0" } },
            ]))
            .unwrap();

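
The test now feeds `_geo` coordinates both as JSON numbers and as numeric strings. As an aside, a hedged, free-standing sketch of how such a value can be normalized to an `f64`; the helper name is invented for illustration and does not exist in milli:

```rust
// Accept a coordinate given either as a JSON number or as a numeric string.
fn coord_to_f64(value: &serde_json::Value) -> Option<f64> {
    match value {
        serde_json::Value::Number(n) => n.as_f64(),
        serde_json::Value::String(s) => s.parse::<f64>().ok(),
        _ => None,
    }
}

fn main() {
    assert_eq!(coord_to_f64(&serde_json::json!("-175")), Some(-175.0));
    assert_eq!(coord_to_f64(&serde_json::json!(-175)), Some(-175.0));
    assert_eq!(coord_to_f64(&serde_json::json!(true)), None);
}
```
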
@@ -97,7 +97,7 @@ const MAX_LMDB_KEY_LENGTH: usize = 500;
///
/// This number is determined by the keys of the different facet databases
/// and adding a margin of safety.
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;
pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;

/// The maximum length a word can be
pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
@@ -293,15 +293,15 @@ pub fn normalize_facet(original: &str) -> String {
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
    #[serde(with = "either::serde_untagged")]
    inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
    #[serde(with = "either::serde_untagged_optional")]
    inner: Option<either::Either<Vec<f32>, Vec<Vec<f32>>>>,
}

impl VectorOrArrayOfVectors {
    pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
        match self.inner {
            either::Either::Left(vector) => vec![vector],
            either::Either::Right(vectors) => vectors,
    pub fn into_array_of_vectors(self) -> Option<Vec<Vec<f32>>> {
        match self.inner? {
            either::Either::Left(vector) => Some(vec![vector]),
            either::Either::Right(vectors) => Some(vectors),
        }
    }
}
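
The hunk above makes the embedded-vector payload optional. A minimal sketch of the resulting behaviour, with the logic copied out of the struct so it can run on its own (only the `either` crate is assumed):

```rust
use either::Either;

// A missing value now maps to `None` instead of failing, and a single vector is
// still promoted to an array of vectors.
fn into_array_of_vectors(inner: Option<Either<Vec<f32>, Vec<Vec<f32>>>>) -> Option<Vec<Vec<f32>>> {
    match inner? {
        Either::Left(vector) => Some(vec![vector]),
        Either::Right(vectors) => Some(vectors),
    }
}

fn main() {
    assert_eq!(into_array_of_vectors(None), None);
    assert_eq!(into_array_of_vectors(Some(Either::Left(vec![1.0, 2.0]))), Some(vec![vec![1.0, 2.0]]));
    assert_eq!(
        into_array_of_vectors(Some(Either::Right(vec![vec![1.0], vec![2.0]]))),
        Some(vec![vec![1.0], vec![2.0]])
    );
}
```
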
@@ -280,9 +280,13 @@ impl<'a> SearchForFacetValues<'a> {

        let filterable_fields = index.filterable_fields(rtxn)?;
        if !filterable_fields.contains(&self.facet) {
            let (valid_fields, hidden_fields) =
                index.remove_hidden_fields(rtxn, filterable_fields)?;

            return Err(UserError::InvalidFacetSearchFacetName {
                field: self.facet.clone(),
                valid_fields: filterable_fields.into_iter().collect(),
                valid_fields,
                hidden_fields,
            }
            .into());
        }

@@ -91,11 +91,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    /// Update the universes accordingly and inform the logger.
    macro_rules! back {
        () => {
            assert!(
                ranking_rule_universes[cur_ranking_rule_index].is_empty(),
                "The ranking rule {} did not sort its bucket exhaustively",
                ranking_rules[cur_ranking_rule_index].id()
            );
            // FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
            // assert!(
            //     ranking_rule_universes[cur_ranking_rule_index].is_empty(),
            //     "The ranking rule {} did not sort its bucket exhaustively",
            //     ranking_rules[cur_ranking_rule_index].id()
            // );
            logger.end_iteration_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
@@ -11,9 +11,7 @@ use super::interner::Interned;
|
||||
use super::Word;
|
||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
|
||||
|
||||
/// A cache storing pointers to values in the LMDB databases.
|
||||
///
|
||||
@@ -168,7 +166,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@@ -182,7 +180,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@@ -230,7 +228,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@@ -244,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
&mut self,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
||||
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
|
||||
@@ -418,19 +418,11 @@ impl<'t> Matcher<'t, '_> {
|
||||
} else {
|
||||
match &self.matches {
|
||||
Some((tokens, matches)) => {
|
||||
// If the text has to be cropped,
|
||||
// compute the best interval to crop around.
|
||||
let matches = match format_options.crop {
|
||||
Some(crop_size) if crop_size > 0 => {
|
||||
self.find_best_match_interval(matches, crop_size)
|
||||
}
|
||||
_ => matches,
|
||||
};
|
||||
|
||||
// If the text has to be cropped,
|
||||
// crop around the best interval.
|
||||
let (byte_start, byte_end) = match format_options.crop {
|
||||
Some(crop_size) if crop_size > 0 => {
|
||||
let matches = self.find_best_match_interval(matches, crop_size);
|
||||
self.crop_bounds(tokens, matches, crop_size)
|
||||
}
|
||||
_ => (0, self.text.len()),
|
||||
@@ -450,6 +442,11 @@ impl<'t> Matcher<'t, '_> {
|
||||
for m in matches {
|
||||
let token = &tokens[m.token_position];
|
||||
|
||||
// skip matches out of the crop window.
|
||||
if token.byte_start < byte_start || token.byte_end > byte_end {
|
||||
continue;
|
||||
}
|
||||
|
||||
if byte_index < token.byte_start {
|
||||
formatted.push(&self.text[byte_index..token.byte_start]);
|
||||
}
|
||||
@@ -800,6 +797,37 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_highlight_crop_phrase_query() {
|
||||
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
||||
let temp_index = TempIndex::new();
|
||||
temp_index
|
||||
.add_documents(documents!([
|
||||
{ "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
|
||||
]))
|
||||
.unwrap();
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
|
||||
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
||||
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
||||
|
||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
|
||||
let mut matcher = builder.build(text);
|
||||
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"…had the power to split <em>the</em> <em>world</em> between those who…"
|
||||
);
|
||||
|
||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
|
||||
let mut matcher = builder.build(text);
|
||||
// should highlight "those" and the phrase "and those".
|
||||
insta::assert_snapshot!(
|
||||
matcher.format(format_options),
|
||||
@"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn smaller_crop_size() {
|
||||
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
||||
|
||||
@@ -20,7 +20,7 @@ mod sort;
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::collections::HashSet;
|
||||
|
||||
use bucket_sort::{bucket_sort, BucketSortOutput};
|
||||
use charabia::TokenizerBuilder;
|
||||
@@ -108,24 +108,11 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
(None, None) => continue,
|
||||
// The field is not searchable => User error
|
||||
(_fid, Some(false)) => {
|
||||
let mut valid_fields: BTreeSet<_> =
|
||||
fids_map.names().map(String::from).collect();
|
||||
let (valid_fields, hidden_fields) = match searchable_names {
|
||||
Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?,
|
||||
None => self.index.remove_hidden_fields(self.txn, fids_map.names())?,
|
||||
};
|
||||
|
||||
// Filter by the searchable names
|
||||
if let Some(sn) = searchable_names {
|
||||
let searchable_names = sn.iter().map(|s| s.to_string()).collect();
|
||||
valid_fields = &valid_fields & &searchable_names;
|
||||
}
|
||||
|
||||
let searchable_count = valid_fields.len();
|
||||
|
||||
// Remove hidden fields
|
||||
if let Some(dn) = self.index.displayed_fields(self.txn)? {
|
||||
let displayable_names = dn.iter().map(|s| s.to_string()).collect();
|
||||
valid_fields = &valid_fields & &displayable_names;
|
||||
}
|
||||
|
||||
let hidden_fields = searchable_count > valid_fields.len();
|
||||
let field = field_name.to_string();
|
||||
return Err(UserError::InvalidSearchableAttribute {
|
||||
field,
|
||||
@@ -604,16 +591,24 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>
|
||||
for asc_desc in sort_criteria {
|
||||
match asc_desc.member() {
|
||||
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
|
||||
let (valid_fields, hidden_fields) =
|
||||
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
|
||||
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: field.to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
})?;
|
||||
}
|
||||
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
||||
let (valid_fields, hidden_fields) =
|
||||
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
|
||||
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: "_geo".to_string(),
|
||||
valid_fields: sortable_fields.into_iter().collect(),
|
||||
})?
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
})?;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:

use big_s::S;
use maplit::hashset;
use meili_snap::insta;

use crate::index::tests::TempIndex;
use crate::search::new::tests::collect_field_values;
@@ -359,31 +359,7 @@ pub fn snap_external_documents_ids(index: &Index) -> String {
|
||||
|
||||
snap
|
||||
}
|
||||
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let number_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let string_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_words_fst(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let words_fst = index.words_fst(&rtxn).unwrap();
|
||||
@@ -531,12 +507,6 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, external_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_external_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, number_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, string_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, words_fst) => {{
|
||||
$crate::snapshot_tests::snap_words_fst(&$index)
|
||||
}};
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'u, 'i> {
|
||||
@@ -51,7 +50,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
@@ -64,22 +62,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||
|
||||
// We clean all the faceted documents ids.
|
||||
for field_id in faceted_fields {
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::Number,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::String,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Clear the other databases.
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
|
||||
104  milli/src/update/del_add.rs  Normal file
@@ -0,0 +1,104 @@
use obkv::Key;

pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;

/// DelAdd defines the new value to add in the database and old value to delete from the database.
///
/// It's used in an OBKV to be serialized in grenad files.
#[repr(u8)]
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
pub enum DelAdd {
    Deletion = 0,
    Addition = 1,
}

impl Key for DelAdd {
    const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
    type BYTES = [u8; Self::BYTES_SIZE];

    fn to_be_bytes(&self) -> Self::BYTES {
        u8::to_be_bytes(*self as u8)
    }

    fn from_be_bytes(array: Self::BYTES) -> Self {
        match u8::from_be_bytes(array) {
            0 => Self::Deletion,
            1 => Self::Addition,
            otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
        }
    }
}

/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
///
/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
/// if both deletion and addition are `true`, the value will be inserted in both keys.
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
    reader: obkv::KvReader<K>,
    deletion: bool,
    addition: bool,
    buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
    let mut writer = obkv::KvWriter::new(buffer);
    let mut value_buffer = Vec::new();
    for (key, value) in reader.iter() {
        value_buffer.clear();
        let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
        if deletion {
            value_writer.insert(DelAdd::Deletion, value)?;
        }
        if addition {
            value_writer.insert(DelAdd::Addition, value)?;
        }
        value_writer.finish()?;
        writer.insert(key, &value_buffer)?;
    }

    writer.finish()
}

/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
///
/// putting each deletion obkv's keys under a DelAdd::Deletion
/// and putting each addition obkv's keys under a DelAdd::Addition
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
    deletion: obkv::KvReader<K>,
    addition: obkv::KvReader<K>,
    buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

    let mut writer = obkv::KvWriter::new(buffer);
    let mut value_buffer = Vec::new();

    for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
        value_buffer.clear();
        match eob {
            Left((k, v)) => {
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
                value_writer.insert(DelAdd::Deletion, v).unwrap();
                writer.insert(k, value_writer.into_inner()?).unwrap();
            }
            Right((k, v)) => {
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
                value_writer.insert(DelAdd::Addition, v).unwrap();
                writer.insert(k, value_writer.into_inner()?).unwrap();
            }
            Both((k, deletion), (_, addition)) => {
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
                value_writer.insert(DelAdd::Deletion, deletion).unwrap();
                value_writer.insert(DelAdd::Addition, addition).unwrap();
                writer.insert(k, value_writer.into_inner()?).unwrap();
            }
        }
    }

    writer.finish()
}

pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
    del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
}
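
A hedged usage sketch of the new `del_add` helpers, written as it would look from inside the milli crate; the module path, field ids, and sample values are assumptions made for illustration:

```rust
use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd};

fn wrap_document_as_addition() -> Result<(), std::io::Error> {
    // Build a plain Kv<u16, value> obkv with two fields, the way documents are stored.
    let mut plain = Vec::new();
    let mut writer = obkv::KvWriterU16::new(&mut plain);
    writer.insert(0u16, b"1")?; // hypothetical primary-key field
    writer.insert(1u16, b"kefir")?; // hypothetical "name" field
    writer.finish()?;

    // Wrap every value under a DelAdd::Addition key only (a pure addition).
    let mut del_add = Vec::new();
    into_del_add_obkv(obkv::KvReader::<u16>::new(&plain), false, true, &mut del_add)?;

    // Each field value is now a nested DelAdd obkv.
    for (_field_id, value) in obkv::KvReader::<u16>::new(&del_add).iter() {
        let value = KvReaderDelAdd::new(value);
        assert!(value.get(DelAdd::Deletion).is_none());
        assert!(value.get(DelAdd::Addition).is_some());
    }
    Ok(())
}
```
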
@@ -16,9 +16,7 @@ use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::index::Hnsw;
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
|
||||
};
|
||||
use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
|
||||
|
||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@@ -384,12 +382,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
for facet_type in [FacetType::Number, FacetType::String] {
|
||||
let mut affected_facet_values = HashMap::new();
|
||||
for field_id in self.index.faceted_fields_ids(self.wtxn)? {
|
||||
// Remove docids from the number faceted documents ids
|
||||
let mut docids =
|
||||
self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?;
|
||||
docids -= &self.to_delete_docids;
|
||||
self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?;
|
||||
|
||||
let facet_values = remove_docids_from_field_id_docid_facet_value(
|
||||
self.index,
|
||||
self.wtxn,
|
||||
@@ -495,7 +487,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
||||
|
||||
fn remove_from_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &Database<Str, RoaringBitmapCodec>,
|
||||
db: &Database<Str, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||
@@ -523,7 +515,7 @@ fn remove_from_word_prefix_docids(
|
||||
|
||||
fn remove_from_word_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
||||
db: &heed::Database<Str, CboRoaringBitmapCodec>,
|
||||
to_remove: &RoaringBitmap,
|
||||
words_to_keep: &mut BTreeSet<String>,
|
||||
words_to_remove: &mut BTreeSet<String>,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
||||
use obkv::KvReader;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
@@ -12,6 +12,7 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
|
||||
@@ -20,9 +21,6 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
///
|
||||
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
|
||||
/// is updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
group_size: u8,
|
||||
@@ -30,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
new_data: Option<grenad::Reader<File>>,
|
||||
delta_data: Option<grenad::Reader<File>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
@@ -38,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<File>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
@@ -48,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
new_data: Some(new_data),
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -63,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data: None,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => index
|
||||
@@ -80,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
|
||||
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
|
||||
Ok(())
|
||||
})?;
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -94,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
pub new_data: Option<grenad::Reader<R>>,
|
||||
pub delta_data: Option<grenad::Reader<R>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(
|
||||
mut self,
|
||||
wtxn: &mut RwTxn,
|
||||
field_ids: &[u16],
|
||||
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
handle_all_docids(wtxn, field_id, all_docids)?;
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
@@ -132,19 +120,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
let new_data = match self.new_data.take() {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value: KvReader<DelAdd> = KvReader::new(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
@@ -156,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value: KvReader<DelAdd> = KvReader::new(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
@@ -169,12 +168,15 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_into(
|
||||
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
|
||||
&mut buffer,
|
||||
)?;
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
@@ -187,16 +189,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn,
|
||||
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
|
||||
let mut all_docids = RoaringBitmap::new();
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
||||
for bitmap in bitmaps {
|
||||
all_docids |= bitmap;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
) -> Result<Vec<grenad::Reader<File>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok((subwriters, all_docids))
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
@@ -490,7 +486,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -160,7 +160,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
|
||||
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
@@ -178,7 +177,6 @@ mod tests {
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
|
||||
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
|
||||
}
|
||||
|
||||
// Same test as above but working with string values for the facets
|
||||
@@ -219,7 +217,6 @@ mod tests {
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
@@ -237,7 +234,6 @@ mod tests {
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -274,7 +270,6 @@ mod tests {
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
|
||||
@@ -291,12 +286,6 @@ mod tests {
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @r###"
|
||||
0 []
|
||||
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
2 [292, 324, 358, 381, 493, 839, 852, ]
|
||||
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
"###);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||
use obkv::KvReader;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
@@ -11,8 +11,9 @@ use crate::heed_codec::facet::{
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
enum InsertionResult {
|
||||
InPlace,
|
||||
@@ -27,27 +28,21 @@ enum DeletionResult {
|
||||
|
||||
/// Algorithm to incrementally insert and delete elememts into the
|
||||
/// `facet_id_(string/f64)_docids` databases.
|
||||
///
|
||||
/// Rhe `faceted_documents_ids` value in the main database of `Index`
|
||||
/// is also updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateIncremental<'i> {
|
||||
index: &'i Index,
|
||||
pub struct FacetsUpdateIncremental {
|
||||
inner: FacetsUpdateIncrementalInner,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<File>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateIncremental<'i> {
|
||||
impl FacetsUpdateIncremental {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
index: &Index,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<File>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
max_group_size: u8,
|
||||
) -> Self {
|
||||
FacetsUpdateIncremental {
|
||||
index,
|
||||
inner: FacetsUpdateIncrementalInner {
|
||||
db: match facet_type {
|
||||
FacetType::String => index
|
||||
@@ -61,31 +56,41 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
||||
max_group_size,
|
||||
min_level_size,
|
||||
},
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||
|
||||
let mut cursor = self.new_data.into_cursor()?;
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||
let mut cursor = self.delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
|
||||
.ok_or(heed::Error::Encoding)?;
|
||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||
let value = KvReader::new(value);
|
||||
|
||||
let docids_to_delete = value
|
||||
.get(DelAdd::Deletion)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
let docids_to_add = value
|
||||
.get(DelAdd::Addition)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
if let Some(docids_to_delete) = docids_to_delete {
|
||||
let docids_to_delete = docids_to_delete?;
|
||||
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
|
||||
}
|
||||
|
||||
if let Some(docids_to_add) = docids_to_add {
|
||||
let docids_to_add = docids_to_add?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
|
||||
}
|
||||
}
|
||||
|
||||
for (field_id, new_docids) in new_faceted_docids {
|
||||
let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
|
||||
docids |= new_docids;
|
||||
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::index_documents::create_sorter;
|
||||
use crate::update::merge_btreeset_string;
|
||||
use crate::{BEU16StrCodec, Index, Result, BEU16};
|
||||
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod delete;
|
||||
@@ -108,13 +108,14 @@ pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<File>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
||||
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
|
||||
pub fn new(index: &'i Index, facet_type: FacetType, delta_data: grenad::Reader<File>) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => index
|
||||
.facet_id_string_docids
|
||||
@@ -130,26 +131,26 @@ impl<'i> FacetsUpdate<'i> {
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
if self.new_data.is_empty() {
|
||||
if self.delta_data.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
@@ -158,7 +159,7 @@ impl<'i> FacetsUpdate<'i> {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
@@ -191,7 +192,16 @@ impl<'i> FacetsUpdate<'i> {
|
||||
for result in database.iter(wtxn)? {
|
||||
let (facet_group_key, ()) = result?;
|
||||
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
|
||||
let normalized_facet = left_bound.normalize(&options);
|
||||
let mut normalized_facet = left_bound.normalize(&options);
|
||||
let normalized_truncated_facet: String;
|
||||
if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
|
||||
normalized_truncated_facet = normalized_facet
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
normalized_facet = normalized_truncated_facet.into();
|
||||
}
|
||||
let set = BTreeSet::from_iter(std::iter::once(left_bound));
|
||||
let key = (field_id, normalized_facet.as_ref());
|
||||
let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
|
||||
@@ -449,7 +459,7 @@ pub(crate) mod test_helpers {
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
new_data: Some(reader),
|
||||
delta_data: Some(reader),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
@@ -584,7 +594,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
|
||||
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
|
||||
|
||||
let mut documents = vec![];
|
||||
@@ -607,7 +616,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
|
||||
|
||||
// Then replace the last document while disabling soft_deletion
|
||||
@@ -632,7 +640,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,18 +4,16 @@ use std::fs::File;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::KvReader;
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::index_documents::MergeFn;
|
||||
use crate::{
|
||||
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
||||
};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
@@ -28,8 +26,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&Vec<&str>>,
|
||||
dictionary: Option<&Vec<&str>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
@@ -38,32 +36,29 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut script_language_docids = HashMap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
concat_u32s_array,
|
||||
keep_latest_obkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
);
|
||||
|
||||
let mut buffers = Buffers::default();
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
// let dictionary: Vec<_> = dictionary.iter().map(String::as_str).collect();
|
||||
tokenizer_builder.words_dict(dictionary.as_slice());
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
// let separators: Vec<_> = separators.iter().map(String::as_str).collect();
|
||||
tokenizer_builder.separators(separators.as_slice());
|
||||
}
|
||||
let tokenizer = tokenizer_builder.build();
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
@@ -72,66 +67,85 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
buffers.key_buffer.clear();
|
||||
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
let mut script_language_word_count = HashMap::new();
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
extract_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
&mut buffers,
|
||||
&mut script_language_word_count,
|
||||
&mut docid_word_positions_sorter,
|
||||
)?;
|
||||
|
||||
// if we detect a potetial mistake in the language detection,
|
||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||
if script_language_word_count
|
||||
.values()
|
||||
.map(Vec::as_slice)
|
||||
.any(potential_language_detection_error)
|
||||
{
|
||||
// build an allow list with the most frequent detected languages in the document.
|
||||
let script_language: HashMap<_, _> =
|
||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||
|
||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
tokenizer_builder.allow_list(&script_language);
|
||||
let tokenizer = tokenizer_builder.build();
|
||||
|
||||
script_language_word_count.clear();
|
||||
|
||||
// rerun the extraction.
|
||||
extract_tokens_from_document(
|
||||
// Tokenize deletions and additions in 2 diffferent threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
max_positions_per_attributes,
|
||||
&mut buffers,
|
||||
&mut script_language_word_count,
|
||||
&mut docid_word_positions_sorter,
|
||||
)?;
|
||||
}
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let (del_obkv, del_script_language_word_count) = del?;
|
||||
let (add_obkv, add_script_language_word_count) = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::new(del_obkv),
|
||||
KvReader::<FieldId>::new(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write them into the sorter.
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
|
||||
for (script, languages_frequency) in script_language_word_count {
|
||||
// update script_language_docids deletions.
|
||||
for (script, languages_frequency) in del_script_language_word_count {
|
||||
for (language, _) in languages_frequency {
|
||||
let entry = script_language_docids
|
||||
.entry((script, language))
|
||||
.or_insert_with(RoaringBitmap::new);
|
||||
entry.push(document_id);
|
||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||
entry.0.push(document_id);
|
||||
}
|
||||
}
|
||||
|
||||
// update script_language_docids additions.
|
||||
for (script, languages_frequency) in add_script_language_word_count {
|
||||
for (language, _) in languages_frequency {
|
||||
let entry = script_language_docids
|
||||
.entry((script, language))
|
||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||
entry.1.push(document_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -140,52 +154,183 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||
}
|
||||
|
||||
fn extract_tokens_from_document(
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
) -> bool {
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contains a value and values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
if let Some(script_language) = script_language {
|
||||
tokenizer_builder.allow_list(&script_language);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Extract words maped with their positions of a document,
|
||||
/// ensuring no Language detection mistakes was made.
|
||||
fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
||||
let mut script_language_word_count = HashMap::new();
|
||||
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
buffers,
|
||||
&mut script_language_word_count,
|
||||
)?;
|
||||
|
||||
// if we detect a potetial mistake in the language detection,
|
||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||
if script_language_word_count
|
||||
.values()
|
||||
.map(Vec::as_slice)
|
||||
.any(potential_language_detection_error)
|
||||
{
|
||||
// build an allow list with the most frequent detected languages in the document.
|
||||
let script_language: HashMap<_, _> =
|
||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||
|
||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words,
|
||||
dictionary,
|
||||
allowed_separators,
|
||||
Some(&script_language),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
script_language_word_count.clear();
|
||||
|
||||
// rerun the extraction.
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
buffers,
|
||||
&mut script_language_word_count,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((&buffers.obkv_buffer, script_language_word_count))
|
||||
}

/// Extracts words mapped with their positions in a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
buffers: &mut Buffers,
del_add: DelAdd,
buffers: &'a mut Buffers,
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
) -> Result<&'a [u8]> {
buffers.obkv_buffer.clear();
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
// if the field is searchable.
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
// parse json.
let value =
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;

for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
// prepare the writing destination.
buffers.obkv_positions_buffer.clear();
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);

// convert the json into a unique string.
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
// create an iterator of tokens with their positions.
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
}
}

// keep a word only if it is not empty and fits in an LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
writer.insert(position, token.as_bytes())?;
}
}
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
buffers.key_buffer.truncate(mem::size_of::<u32>());
buffers.key_buffer.extend_from_slice(token.as_bytes());

let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let position = absolute_from_relative_position(field_id, position);
docid_word_positions_sorter
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
}
// write positions into document.
let positions = writer.into_inner()?;
document_writer.insert(field_id, positions)?;
}
}
}
}

Ok(())
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}
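The per-token language bookkeeping in `tokens_from_document` is a nested counter: one entry per script holding `(language, count)` pairs. A minimal stand-alone version of that update, with plain strings instead of charabia's `Script` and `Language`, looks like this:

use std::collections::HashMap;

/// Bump the counter of `language` under `script`, creating entries on the fly.
fn bump_language_count(
    counts: &mut HashMap<String, Vec<(String, usize)>>,
    script: &str,
    language: &str,
) {
    let entry = counts.entry(script.to_string()).or_insert_with(Vec::new);
    match entry.iter_mut().find(|(l, _)| l.as_str() == language) {
        // the language was already seen for this script: increment its count
        Some((_, n)) => *n += 1,
        // otherwise start counting it at one
        None => entry.push((language.to_string(), 1)),
    }
}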

/// Transform a JSON value into a string that can be indexed.
@@ -228,9 +373,9 @@ fn process_tokens<'a>(
) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| token.is_separator())
.scan((0, None), |(offset, prev_kind), token| {
.scan((0, None), |(offset, prev_kind), mut token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
@@ -246,7 +391,7 @@ fn process_tokens<'a>(
{
*prev_kind = Some(token.kind);
}
_ => (),
_ => token.kind = TokenKind::Unknown,
}
Some((*offset, token))
})
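In the `scan` above, word positions grow by 1 between consecutive words and jump by 8 across a hard separator, with leading separators skipped. A stripped-down sketch of that bookkeeping with a toy token kind (charabia's `TokenKind`/`SeparatorKind` are replaced by a plain enum):

#[derive(Clone, Copy, PartialEq)]
enum Kind {
    Word,
    SoftSeparator,
    HardSeparator,
}

/// Assign a position to each word: +1 after a word or soft separator, +8 after a hard one.
fn word_positions(tokens: &[(Kind, &str)]) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    let mut offset = 0usize;
    let mut prev: Option<Kind> = None;
    // skip the leading separators, as the iterator above does
    for &(kind, lemma) in tokens.iter().skip_while(|(k, _)| *k != Kind::Word) {
        match kind {
            Kind::Word => {
                offset += match prev {
                    Some(Kind::HardSeparator) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                out.push((offset, lemma.to_string()));
                prev = Some(Kind::Word);
            }
            // a hard separator keeps priority over a following soft one
            Kind::HardSeparator => prev = Some(Kind::HardSeparator),
            Kind::SoftSeparator => {
                if prev != Some(Kind::HardSeparator) {
                    prev = Some(Kind::SoftSeparator);
                }
            }
        }
    }
    out
}

With this sketch, the words of "hello world. rust" end up at positions 0, 1 and 9: the dot, a hard separator, opens the 8-wide gap.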
@@ -288,10 +433,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)

#[derive(Default)]
struct Buffers {
// the key buffer is the concatenation of the internal document id with the field id.
// The buffer has to be completely cleared between documents,
// and the field id part must be cleared between each field.
key_buffer: Vec<u8>,
// the field buffer for each field's deserialization; it must be cleared between each field.
field_buffer: String,
// buffer used to store the value data containing an obkv.
obkv_buffer: Vec<u8>,
// buffer used to store the value data containing an obkv of tokens with their positions.
obkv_positions_buffer: Vec<u8>,
}

@@ -4,11 +4,12 @@ use std::io;
use heed::{BytesDecode, BytesEncode};

use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;

/// Extracts the facet number and the documents ids where this facet number appears.
@@ -17,7 +18,7 @@ use crate::Result;
/// documents ids from the given chunk of docid facet number positions.
#[logging_timer::time]
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
docid_fid_facet_number: grenad::Reader<R>,
fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
@@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(

let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);

let mut cursor = docid_fid_facet_number.into_cursor()?;
while let Some((key_bytes, _)) = cursor.move_on_next()? {
let mut buffer = Vec::new();
let mut cursor = fid_docid_facet_number.into_cursor()?;
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
let (field_id, document_id, number) =
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();

let key = FacetGroupKey { field_id, level: 0, left_bound: number };
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;

buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;

facet_number_docids_sorter.insert(key_bytes, &buffer)?;
}

sorter_into_reader(facet_number_docids_sorter, indexer)

@@ -1,13 +1,14 @@
use std::fs::File;
use std::io;
use std::{io, str};

use heed::BytesEncode;

use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
use crate::{FieldId, Result};

/// Extracts the facet string and the documents ids where this facet string appears.
///
@@ -24,15 +25,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(

let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);

let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);

@@ -40,21 +42,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);

let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;

let normalised_truncated_value: String;
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
normalised_truncated_value = normalised_value
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalised_value = normalised_truncated_value.as_str();
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
let normalized_value = str::from_utf8(normalized_value_bytes)?;
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
// document id is encoded in native-endian because of the CBO roaring bitmap codec
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;

buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}

sorter_into_reader(facet_string_docids_sorter, indexer)

@@ -1,24 +1,36 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io;
use std::mem::size_of;
use std::result::Result as StdResult;

use grenad::Sorter;
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::{from_slice, Value};
use FilterableValues::{Empty, Null, Values};

use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
use crate::{
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
};

/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
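`TRUNCATE_SIZE` lets one buffer be reused for every facet value of a `(field id, document id)` pair: the key is truncated back to that fixed prefix before each value is appended. A small, self-contained sketch of the pattern (the byte layout below is illustrative, not the milli codec):

/// Reuse one buffer: keep the (field id, document id) prefix and swap the value suffix.
fn build_keys(field_id: u16, document_id: u32, values: &[&[u8]]) -> Vec<Vec<u8>> {
    const PREFIX_LEN: usize = std::mem::size_of::<u16>() + std::mem::size_of::<u32>();
    let mut key_buffer = Vec::new();
    key_buffer.extend_from_slice(&field_id.to_be_bytes());
    key_buffer.extend_from_slice(&document_id.to_be_bytes());
    let mut keys = Vec::new();
    for value in values {
        // drop the previous value bytes but keep the fixed prefix
        key_buffer.truncate(PREFIX_LEN);
        key_buffer.extend_from_slice(value);
        keys.push(key_buffer.clone());
    }
    keys
}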

/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
@@ -28,11 +40,13 @@ pub struct ExtractedFacetValues {
///
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[logging_timer::time]
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();

@@ -56,68 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
max_memory.map(|m| m / 2),
);

let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
// The tuples represent the Del and Add sides of a bitmap
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();

// We create two buffers because of mutable ref issues with closures.
let mut numbers_key_buffer = Vec::new();
let mut strings_key_buffer = Vec::new();

let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);

for (field_id, field_bytes) in obkv.iter() {
if faceted_fields.contains(&field_id) {
key_buffer.clear();
numbers_key_buffer.clear();
strings_key_buffer.clear();

// Set key to the field_id
// Note: this encoding is consistent with FieldIdCodec
key_buffer.extend_from_slice(&field_id.to_be_bytes());
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());

// Here, we already know that the document must be added to the “field id exists” database
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
let document = BEU32::from(document).get();

facet_exists_docids.entry(field_id).or_default().insert(document);

// For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes);
numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);

let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let del_add_obkv = obkv::KvReader::new(field_bytes);
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};
let add_value = match del_add_obkv.get(DelAdd::Addition) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};

match extract_facet_values(&value) {
FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert the document id on the Del and the Add side if the field exists.
let (ref mut del_exists, ref mut add_exists) =
facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();

fid_docid_facet_numbers_sorter
.insert(&key_buffer, ().as_bytes())?;
}
if del_value.is_some() {
del_exists.insert(document);
}
if add_value.is_some() {
add_exists.insert(document);
}

let geo_support =
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, geo_support));

// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {
insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};

match (del_filterable_values, add_filterable_values) {
(None, None) => (),
(Some(del_filterable_values), None) => match del_filterable_values {
Null => {
del_is_null.insert(document);
}

// insert normalized and original facet string in sorter
for (normalized, original) in
strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalized_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();

key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
Empty => {
del_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
},
(None, Some(add_filterable_values)) => match add_filterable_values {
Null => {
add_is_null.insert(document);
}
Empty => {
add_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
}
},
(Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document);
}
(Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document);
}
(Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(Values { numbers, strings }, Empty) => {
add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(
Values { numbers: del_numbers, strings: del_strings },
Values { numbers: add_numbers, strings: add_strings },
) => {
insert_numbers_diff(del_numbers, add_numbers)?;
insert_strings_diff(del_strings, add_strings)?;
}
}
}
}
}
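The loop above now tracks a `(deletion, addition)` pair of bitmaps per field instead of a single bitmap, filling each side depending on whether the field was present in the deleted and/or the added version of the document. Reduced to the bitmap bookkeeping alone (the roaring crate is the one already used here; field and document ids are made up), the pattern is:

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

type FieldId = u16;

/// Record, per field, which documents lose the field (del) and which gain it (add).
fn record_exists(
    exists: &mut BTreeMap<FieldId, (RoaringBitmap, RoaringBitmap)>,
    field_id: FieldId,
    docid: u32,
    in_deleted_version: bool,
    in_added_version: bool,
) {
    let (del, add) = exists.entry(field_id).or_default();
    if in_deleted_version {
        del.insert(docid);
    }
    if in_added_version {
        add.insert(docid);
    }
}

A document whose field is present on both sides lands in both bitmaps, which the downstream merge can then treat as a no-op.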
@@ -125,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
}
}

let mut buffer = Vec::new();
let mut facet_exists_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_exists_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;

@@ -141,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;

@@ -152,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;

Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
})
}

/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
fn deladd_obkv_cbo_roaring_bitmaps(
buffer: &mut Vec<u8>,
del_bitmap: &RoaringBitmap,
add_bitmap: &RoaringBitmap,
) -> io::Result<()> {
buffer.clear();
let mut obkv = KvWriterDelAdd::new(buffer);
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
obkv.finish()
}

/// Truncates a string to the biggest valid LMDB key size.
fn truncate_string(s: String) -> String {
s.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect()
}
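`truncate_string` cuts on a character boundary while keeping 4 bytes of headroom under `MAX_FACET_VALUE_LENGTH` (4 bytes being the largest UTF-8 character), so a multi-byte character can never straddle the limit. A self-contained equivalent with an explicit byte budget:

/// Truncate `s` to roughly `budget` bytes without splitting a UTF-8 character.
fn truncate_to_budget(s: &str, budget: usize) -> String {
    s.char_indices()
        .take_while(|(idx, _)| idx + 4 < budget)
        .map(|(_, c)| c)
        .collect()
}

fn main() {
    // "日" is 3 bytes in UTF-8; with a 10-byte budget only two characters fit.
    assert_eq!(truncate_to_budget("日日日日", 10), "日日");
}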

/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn insert_numbers_diff<MF>(
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_numbers: Vec<f64>,
mut add_numbers: Vec<f64>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the float numbers
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
add_numbers.dedup_by_key(|f| OrderedFloat(*f));

let merged_numbers_iter = itertools::merge_join_by(
del_numbers.into_iter().map(OrderedFloat),
add_numbers.into_iter().map(OrderedFloat),
|del, add| del.cmp(add),
);

// insert facet numbers in sorter
for eob in merged_numbers_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());

// We insert only the Del part of the Obkv to inform
// that we only want to remove all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());

// We insert only the Add part of the Obkv to inform
// that we only want to add all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
}
}

Ok(())
}
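`merge_join_by` over the two sorted, deduplicated lists is what turns the Del and Add sides into a minimal set of changes: values only on the left are pure deletions, values only on the right are pure additions, and values present on both sides are left untouched. A tiny runnable illustration of that classification with plain integers in place of facet numbers:

use itertools::{merge_join_by, EitherOrBoth};

fn main() {
    let del = vec![1, 2, 3];
    let add = vec![2, 3, 4];
    let mut removed = Vec::new();
    let mut added = Vec::new();
    for eob in merge_join_by(del, add, |d, a| d.cmp(a)) {
        match eob {
            // present on both sides: nothing to change
            EitherOrBoth::Both(_, _) => (),
            EitherOrBoth::Left(n) => removed.push(n),
            EitherOrBoth::Right(n) => added.push(n),
        }
    }
    assert_eq!(removed, vec![1]);
    assert_eq!(added, vec![4]);
}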

/// Computes the diff between both Del and Add strings and
/// only inserts the parts that differ in the sorter.
fn insert_strings_diff<MF>(
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_strings: Vec<(String, String)>,
mut add_strings: Vec<(String, String)>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the normalized and original strings
del_strings.sort_unstable();
add_strings.sort_unstable();
del_strings.dedup();
add_strings.dedup();

let merged_strings_iter = itertools::merge_join_by(
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|del, add| del.cmp(add),
);

// insert normalized and original facet string in sorter
for eob in merged_strings_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());

let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
EitherOrBoth::Right((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());

let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
}
}

Ok(())
}

/// Represents what a document field contains.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
@@ -177,12 +409,14 @@ enum FilterableValues {
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}

fn extract_facet_values(value: &Value) -> FilterableValues {
/// Extracts the facet values of a JSON field.
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,
can_recurse: bool,
output_numbers: &mut Vec<f64>,
output_strings: &mut Vec<(String, String)>,
geo_field: bool,
) {
match value {
Value::Null => (),
@@ -193,13 +427,30 @@ fn extract_facet_values(value: &Value) -> FilterableValues {
}
}
Value::String(original) => {
// if we're working on a geofield, it MUST be something we can parse or else there was an internal error
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
if geo_field {
if let Ok(float) = original.parse() {
output_numbers.push(float);
} else {
log::warn!(
"Internal error, could not parse a geofield that has been validated. Please open an issue."
)
}
}
let normalized = crate::normalize_facet(original);
output_strings.push((normalized, original.clone()));
}
Value::Array(values) => {
if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
inner_extract_facet_values(
value,
false,
output_numbers,
output_strings,
geo_field,
);
}
}
}
@@ -215,7 +466,7 @@ fn extract_facet_values(value: &Value) -> FilterableValues {
otherwise => {
let mut numbers = Vec::new();
let mut strings = Vec::new();
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
FilterableValues::Values { numbers, strings }
}
}
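The new `extract_facet_values` recurses exactly one level into arrays and splits whatever it finds into numbers and `(normalized, original)` strings. A compact stand-alone sketch of that flattening, without milli's `normalize_facet` and geo handling (lowercasing stands in for the normalization):

use serde_json::{json, Value};

/// Flatten a JSON value into facet numbers and (normalized, original) strings,
/// recursing only one level into arrays, like the extractor above.
fn flatten_facet(
    value: &Value,
    can_recurse: bool,
    numbers: &mut Vec<f64>,
    strings: &mut Vec<(String, String)>,
) {
    match value {
        Value::Null => (),
        // booleans are kept as strings in this sketch
        Value::Bool(b) => strings.push((b.to_string(), b.to_string())),
        Value::Number(n) => {
            if let Some(f) = n.as_f64() {
                numbers.push(f);
            }
        }
        Value::String(original) => strings.push((original.to_lowercase(), original.clone())),
        Value::Array(values) if can_recurse => {
            for value in values {
                flatten_facet(value, false, numbers, strings);
            }
        }
        // deeper arrays and objects are ignored here
        _ => (),
    }
}

fn main() {
    let mut numbers = Vec::new();
    let mut strings = Vec::new();
    flatten_facet(&json!(["Blue", 7, [["too deep"]]]), true, &mut numbers, &mut strings);
    assert_eq!(numbers, vec![7.0]);
    assert_eq!(strings, vec![("blue".to_string(), "Blue".to_string())]);
}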

@@ -1,16 +1,17 @@
use std::collections::HashMap;
use std::fs::File;
use std::io;

use grenad::Sorter;
use obkv::KvReaderU16;

use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
use crate::Result;

const MAX_COUNTED_WORDS: usize = 30;

/// Extracts the field id word count and the documents ids where
/// this field id with this number of words appears.
@@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
max_memory,
);

// This map is assumed to not consume a lot of memory.
let mut document_fid_wordcount = HashMap::new();
let mut current_document_id = None;

let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

let curr_document_id = *current_document_id.get_or_insert(document_id);
if curr_document_id != document_id {
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
curr_document_id,
)?;
current_document_id = Some(document_id);
}

for position in read_u32_ne_bytes(value) {
let (field_id, _) = relative_from_absolute_position(position);

let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value += 1;
}
}

if let Some(document_id) = current_document_id {
// We must make sure that we don't lose the current document field id
// word count map if we break because we reached the end of the chunk.
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
document_id,
)?;
}

sorter_into_reader(fid_word_count_docids_sorter, indexer)
}

fn drain_document_fid_wordcount_into_sorter(
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
document_fid_wordcount: &mut HashMap<FieldId, u32>,
document_id: DocumentId,
) -> Result<()> {
let mut key_buffer = Vec::new();

for (fid, count) in document_fid_wordcount.drain() {
if count <= 30 {
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
if word_count <= MAX_COUNTED_WORDS {
key_buffer.clear();
key_buffer.extend_from_slice(&fid.to_be_bytes());
key_buffer.push(count as u8);

key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}

Ok(())
sorter_into_reader(fid_word_count_docids_sorter, indexer)
}
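Counting with `.take(MAX_COUNTED_WORDS + 1).count()` bounds the work per field: iteration stops as soon as the count is provably over the limit, and the comparison right after filters those fields out. The same idea on a plain iterator (the constant mirrors the one above):

const MAX_COUNTED_WORDS: usize = 30;

/// Return the word count only when it is small enough to be indexed.
fn countable_words<'a>(words: impl Iterator<Item = &'a str>) -> Option<usize> {
    // never consume more than MAX_COUNTED_WORDS + 1 items
    let count = words.take(MAX_COUNTED_WORDS + 1).count();
    (count <= MAX_COUNTED_WORDS).then_some(count)
}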

@@ -35,7 +35,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
// lazily get it when needed
let document_id = || -> Value {
let document_id = obkv.get(primary_key_id).unwrap();
serde_json::from_slice(document_id).unwrap()
from_slice(document_id).unwrap()
};

// first we retrieve the _vectors field
@@ -52,12 +52,14 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
}
};

for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
let index = u16::try_from(i).unwrap();
let mut key = docid_bytes.to_vec();
key.extend_from_slice(&index.to_be_bytes());
let bytes = cast_slice(&vector);
writer.insert(key, bytes)?;
if let Some(vectors) = vectors {
for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
let index = u16::try_from(i).unwrap();
let mut key = docid_bytes.to_vec();
key.extend_from_slice(&index.to_be_bytes());
let bytes = cast_slice(&vector);
writer.insert(key, bytes)?;
}
}
}
// else => the `_vectors` object was `null`, there is nothing to do
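Each embedding is written under a key made of the document id bytes followed by a big-endian `u16` index, keeping a document's vectors contiguous and ordered. A small sketch of that key/value layout using only the standard library (the real code flattens the floats with `cast_slice` from bytemuck instead):

/// Build the (key, value) pair for the i-th vector of a document.
fn vector_entry(docid_bytes: &[u8], i: usize, vector: &[f32]) -> (Vec<u8>, Vec<u8>) {
    let index = u16::try_from(i).unwrap();
    let mut key = docid_bytes.to_vec();
    key.extend_from_slice(&index.to_be_bytes());
    // flatten the floats into native-endian bytes
    let value: Vec<u8> = vector.iter().flat_map(|f| f.to_ne_bytes()).collect();
    (key, value)
}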

@@ -1,18 +1,20 @@
use std::collections::HashSet;
use std::collections::{BTreeSet, HashSet};
use std::fs::File;
use std::io;
use std::iter::FromIterator;

use roaring::RoaringBitmap;
use heed::BytesDecode;
use obkv::KvReaderU16;

use super::helpers::{
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
try_split_array_at, GrenadParameters,
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
try_split_array_at, writer_into_reader, GrenadParameters,
};
use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::index_documents::helpers::read_u32_ne_bytes;
use crate::{relative_from_absolute_position, FieldId, Result};
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::MergeFn;
use crate::{DocumentId, FieldId, Result};

/// Extracts the word and the documents ids where this word appears.
///
@@ -26,65 +28,148 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>,
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut word_docids_sorter = create_sorter(
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);
let mut key_buffer = Vec::new();
let mut del_words = BTreeSet::new();
let mut add_words = BTreeSet::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let (fid_bytes, _) = try_split_array_at(fid_bytes)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let fid = u16::from_be_bytes(fid_bytes);

let del_add_reader = KvReaderDelAdd::new(&value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
del_words.insert(word.to_vec());
}
}

// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (_pos, word) in KvReaderU16::new(&addition).iter() {
add_words.insert(word.to_vec());
}
}

words_into_sorter(
document_id,
fid,
&mut key_buffer,
&del_words,
&add_words,
&mut word_fid_docids_sorter,
)?;

del_words.clear();
add_words.clear();
}

let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 3),
);

let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);

let mut value_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, positions)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);

let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
while let Some((key, value)) = iter.next()? {
// only keep the value if there is a change to apply in the DB.
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
word_fid_docids_writer.insert(key, value)?;
}

let (word, fid) = StrBEU16Codec::bytes_decode(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

let bitmap = RoaringBitmap::from_iter(Some(document_id));
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;

// If there are no exact attributes, we do not need to iterate over positions.
if exact_attributes.is_empty() {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
// every word contained in an attribute set to exact must be pushed into the exact_words list.
if exact_attributes.contains(&fid) {
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
} else {
let mut added_to_exact = false;
let mut added_to_word_docids = false;
for position in read_u32_ne_bytes(positions) {
// as soon as we know that this word has been added to both sorters, we don't need to
// iterate over the positions.
if added_to_exact && added_to_word_docids {
break;
}
let (fid, _) = relative_from_absolute_position(position);
if exact_attributes.contains(&fid) && !added_to_exact {
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_exact = true;
} else if !added_to_word_docids {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_word_docids = true;
}
}
word_docids_sorter.insert(word.as_bytes(), &value)?;
}
}

Ok((
sorter_into_reader(word_docids_sorter, indexer)?,
sorter_into_reader(exact_word_docids_sorter, indexer)?,
writer_into_reader(word_fid_docids_writer)?,
))
}

fn words_into_sorter(
document_id: DocumentId,
fid: FieldId,
key_buffer: &mut Vec<u8>,
del_words: &BTreeSet<Vec<u8>>,
add_words: &BTreeSet<Vec<u8>>,
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();

use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};

let mut buffer = Vec::new();
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let word_bytes = match eob {
Left(word_bytes) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Right(word_bytes) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Both(word_bytes, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
};

key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}

Ok(())
}
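`words_into_sorter` applies the same Del/Add diffing idea to the unique words of one field: a merge-join over two sorted word sets decides whether each word is only deleted, only added, or present on both sides. Stripped of the obkv and sorter plumbing, the classification looks like this (itertools is the crate already used above):

use std::collections::BTreeSet;

use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};

#[derive(Debug, PartialEq)]
enum Change<'a> {
    Deleted(&'a str),
    Added(&'a str),
    Kept(&'a str),
}

fn classify<'a>(
    del_words: &'a BTreeSet<String>,
    add_words: &'a BTreeSet<String>,
) -> Vec<Change<'a>> {
    merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a))
        .map(|eob| match eob {
            Left(w) => Change::Deleted(w),
            Right(w) => Change::Added(w),
            Both(w, _) => Change::Kept(w),
        })
        .collect()
}

In the real function each of these outcomes becomes a DelAdd obkv entry keyed by the word and the field id.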

@@ -1,51 +0,0 @@
use std::fs::File;
use std::io;

use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, Result};

/// Extracts the word, field id, and the documents ids where this word appears at this field id.
#[logging_timer::time]
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);

let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);

for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (fid, _) = relative_from_absolute_position(position);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}

let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;

Ok(word_fid_docids_reader)
}
Some files were not shown because too many files have changed in this diff.