Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-12-02 18:55:36 +00:00)

Compare commits: diff-index...panic-repo (80 commits)
Commits (SHA1):

b06e04fb9b, 905dc241ae, 146908f062, 84f701679d, 355d3b7e45, 69354a6144, 2b5d9042d1, 5b57fbab08,
48865470d7, c810df4d9f, 5e3df76699, 02765fb267, 841165d529, ea4a266f08, 49f069ed97, be16b99d40,
ec0c09d17c, a9230f6e6c, 62ea81bef6, f28f09ae2f, eae9eab181, cf8dad1ca0, dd619913da, 9b55ff16e9,
e761db582f, d8c649b3cd, 5e0485d8dd, 27eec21415, 62cc97ba70, fed59cc1d5, 2b3adef796, 956cfc5487,
12fc878640, 0a2e8b92a9, c7a3f80de6, 029d4de043, 549f1bcccf, 689ec7c7ad, 3655d4bdca, 055ca3935b,
1b8871a585, bf8fac6676, f2a9e1ebbb, c45c6cf54c, 513e61e9a3, 90a626bf80, 0d4acf2daa, 58db8d85ec,
62dfd09dc6, 656dadabea, c5f7893fbb, 8cf2ccf168, 0913373a5e, 1a7f1282af, bc747aac3a, be92376ab3,
cf7e355735, 5f09d89ad1, 6ecb26a3f8, 76c6f554d6, f343ef5f2f, 96982a768a, fca78fbc46, 67a678cfb6,
d1331d8abf, 19ba129165, d4da06ff47, 3e0471edae, 432df03c4c, 11958016dd, 63c250a04d, 06d8cd5b72,
c0f2724c2d, d772073dfa, 8fe8ddea79, 8a95bf28e5, 43989fe2e4, c668a29ed5, b10eeb0e41, 4a8515e9fc
.github/ISSUE_TEMPLATE/sprint_issue.md (vendored, 8 changed lines)

@@ -7,19 +7,17 @@ assignees: ''
 
 ---
 
-Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_)
+Related product team resources: [PRD]() (_internal only_)
 Related product discussion:
 Related spec: WIP
 
 ## Motivation
 
-<!---Copy/paste the information in the roadmap resources or briefly detail the product motivation. Ask product team if any hesitation.-->
+<!---Copy/paste the information in PRD or briefly detail the product motivation. Ask product team if any hesitation.-->
 
 ## Usage
 
-<!---Write a quick description of the usage if the usage has already been defined-->
+<!---Link to the public part of the PRD, or to the related product discussion for experimental features-->
 
-Refer to the final spec to know the details and the final decisions about the usage.
-
 ## TODO
 
@@ -8,11 +8,11 @@ env:
 
 jobs:
   run-benchmarks-on-comment:
+    if: startsWith(github.event.comment.body, '/benchmark')
     name: Run and upload benchmarks
     runs-on: benchmarks
     timeout-minutes: 4320 # 72h
     steps:
-      - uses: actions/checkout@v3
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -27,14 +27,25 @@ jobs:
           reaction-type: "eyes"
           repo-token: ${{ env.GH_TOKEN }}
 
+      - uses: xt0rted/pull-request-comment-branch@v2
+        id: comment-branch
+        with:
+          repo_token: ${{ env.GH_TOKEN }}
+
+      - uses: actions/checkout@v3
+        if: success()
+        with:
+          fetch-depth: 0 # fetch full history to be able to get main commit sha
+          ref: ${{ steps.comment-branch.outputs.head_ref }}
+
       # Set variables
       - name: Set current branch name
         shell: bash
-        run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
+        run: echo "name=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_OUTPUT
        id: current_branch
       - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
         shell: bash
-        run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
+        run: echo "name=$(git rev-parse --abbrev-ref HEAD | tr '/' '_')" >> $GITHUB_OUTPUT
         id: normalized_current_branch
       - name: Set shorter commit SHA
         shell: bash
@@ -76,9 +87,12 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
         run: |
-          export base=$(git log --pretty=%p -n 1)
-          echo 'Here are your benchmarks diff 👊' >> body.txt
+          set -x
+          export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
+          export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
+          export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
+          echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
           echo '```' >> body.txt
-          ./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
+          ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
           echo '```' >> body.txt
-          gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
+          gh pr comment ${{ steps.current_branch.outputs.name }} --body-file body.txt
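The rewritten comparison step above no longer diffs against the parent commit; it resolves the merge base with origin/main and derives the baseline file name from it. A minimal local sketch of that selection logic, assuming the `<bench>_main_<short-sha>.json` naming scheme used by the workflow and hypothetical benchmark and branch names:

```sh
#!/usr/bin/env bash
# Sketch only, not a script from the repository.
set -ex

bench_name="search_songs"     # hypothetical value of the /benchmark comment argument
head_ref="my-feature-branch"  # hypothetical PR branch name

# Short SHA of the merge base between main and the PR branch
# (requires full history, hence `fetch-depth: 0` in the checkout step above).
base_ref=$(git merge-base origin/main "$head_ref" | head -c8)

# Baseline file produced by an earlier benchmark run on main.
base_filename="${bench_name}_main_${base_ref}.json"

# current.json stands in for the ${{ steps.file.outputs.basename }}.json artifact.
./benchmarks/scripts/compare.sh "$base_filename" current.json
```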
.github/workflows/publish-apt-brew-pkg.yml (vendored, 2 changed lines)

@@ -50,7 +50,7 @@ jobs:
     needs: check-version
     steps:
       - name: Create PR to Homebrew
-        uses: mislav/bump-homebrew-formula-action@v2
+        uses: mislav/bump-homebrew-formula-action@v3
         with:
           formula-name: meilisearch
           formula-path: Formula/m/meilisearch.rb
.github/workflows/publish-docker-images.yml (vendored, 2 changed lines)

@@ -63,7 +63,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Login to Docker Hub
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
.github/workflows/sdks-tests.yml (vendored, 4 changed lines)

@@ -160,7 +160,7 @@ jobs:
         with:
           repository: meilisearch/meilisearch-js
       - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           cache: 'yarn'
       - name: Install dependencies
@@ -318,7 +318,7 @@ jobs:
         with:
           repository: meilisearch/meilisearch-js-plugins
       - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
         with:
           cache: yarn
       - name: Install dependencies
.github/workflows/test-suite.yml (vendored, 10 changed lines)

@@ -43,7 +43,7 @@ jobs:
           toolchain: nightly
           override: true
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo check without any default features
         uses: actions-rs/cargo@v1
         with:
@@ -65,7 +65,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo check without any default features
         uses: actions-rs/cargo@v1
         with:
@@ -149,7 +149,7 @@ jobs:
           toolchain: stable
           override: true
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run tests in debug
         uses: actions-rs/cargo@v1
         with:
@@ -168,7 +168,7 @@ jobs:
           override: true
           components: clippy
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo clippy
         uses: actions-rs/cargo@v1
         with:
@@ -187,7 +187,7 @@ jobs:
           override: true
           components: rustfmt
       - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
       - name: Run cargo fmt
         # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
         # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
Cargo.lock (generated, 73 changed lines)

@@ -231,9 +231,9 @@ dependencies = [
 
 [[package]]
 name = "addr2line"
-version = "0.20.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4fa78e18c64fce05e902adecd7a5eed15a5e0a3439f7b0e169f0252214865e3"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
 dependencies = [
  "gimli",
 ]
@@ -435,9 +435,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 
 [[package]]
 name = "backtrace"
-version = "0.3.68"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4319208da049c43661739c5fade2ba182f09d1dc2299b32298d3a31692b17e12"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
 dependencies = [
  "addr2line",
  "cc",
@@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 
 [[package]]
 name = "benchmarks"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "anyhow",
  "bytes",
@@ -1206,7 +1206,7 @@ dependencies = [
 
 [[package]]
 name = "dump"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "anyhow",
  "big_s",
@@ -1417,7 +1417,7 @@ dependencies = [
 
 [[package]]
 name = "file-store"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "faux",
  "tempfile",
@@ -1439,7 +1439,7 @@ dependencies = [
 
 [[package]]
 name = "filter-parser"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "insta",
  "nom",
@@ -1459,7 +1459,7 @@ dependencies = [
 
 [[package]]
 name = "flatten-serde-json"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "criterion",
  "serde_json",
@@ -1577,7 +1577,7 @@ dependencies = [
 
 [[package]]
 name = "fuzzers"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "arbitrary",
  "clap",
@@ -1638,9 +1638,9 @@ dependencies = [
 
 [[package]]
 name = "gimli"
-version = "0.27.3"
+version = "0.28.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e"
+checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0"
 
 [[package]]
 name = "git2"
@@ -1891,9 +1891,10 @@ dependencies = [
 
 [[package]]
 name = "index-scheduler"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "anyhow",
+ "backtrace",
  "big_s",
  "bincode",
  "crossbeam",
@@ -2088,7 +2089,7 @@ dependencies = [
 
 [[package]]
 name = "json-depth-checker"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "criterion",
  "serde_json",
@@ -2500,7 +2501,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "meili-snap"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "insta",
  "md5",
@@ -2509,7 +2510,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -2564,7 +2565,6 @@ dependencies = [
  "platform-dirs",
  "prometheus",
  "puffin",
- "puffin_http",
  "rand",
  "rayon",
  "regex",
@@ -2600,7 +2600,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-auth"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "base64 0.21.2",
  "enum-iterator",
@@ -2619,7 +2619,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-types"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -2673,7 +2673,7 @@ dependencies = [
 
 [[package]]
 name = "milli"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "big_s",
  "bimap",
@@ -2704,7 +2704,6 @@ dependencies = [
  "logging_timer",
  "maplit",
  "md5",
- "meili-snap",
  "memmap2",
  "mimalloc",
  "obkv",
@@ -2858,9 +2857,9 @@ dependencies = [
 
 [[package]]
 name = "object"
-version = "0.31.1"
+version = "0.32.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8bda667d9f2b5051b8833f59f3bf748b28ef54f850f4fcb389a252aa383866d1"
+checksum = "9cf5f9dd3933bd50a9e1f149ec995f39ae2c496d31fd772c1fd45ebc27e902b0"
 dependencies = [
  "memchr",
 ]
@@ -2996,7 +2995,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
 
 [[package]]
 name = "permissive-json-pointer"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
  "big_s",
  "serde_json",
@@ -3194,7 +3193,7 @@ dependencies = [
  "byteorder",
  "hex",
  "lazy_static",
- "rustix 0.36.15",
+ "rustix 0.36.16",
 ]
 
 [[package]]
@@ -3237,18 +3236,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "puffin_http"
-version = "0.13.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13bffc600c35913d282ae1e96a6ffcdf36dc7a7cdb9310e0ba15914d258c8193"
-dependencies = [
- "anyhow",
- "crossbeam-channel",
- "log",
- "puffin",
-]
-
 [[package]]
 name = "quote"
 version = "1.0.32"
@@ -3479,9 +3466,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.36.15"
+version = "0.36.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c37f1bd5ef1b5422177b7646cba67430579cfe2ace80f284fee876bca52ad941"
+checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
 dependencies = [
  "bitflags 1.3.2",
  "errno",
@@ -3654,9 +3641,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.104"
+version = "1.0.108"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "076066c5f1078eac5b722a31827a8832fe108bed65dfa75e233c89f8206e976c"
+checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b"
 dependencies = [
  "indexmap 2.0.0",
  "itoa",
@@ -4444,9 +4431,9 @@ dependencies = [
 
 [[package]]
 name = "webpki"
-version = "0.22.1"
+version = "0.22.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
+checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
 dependencies = [
  "ring",
  "untrusted",
@@ -18,7 +18,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.4.0"
+version = "1.4.1"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
@@ -28,6 +28,7 @@ license = "MIT"
 
 [profile.release]
 codegen-units = 1
+debug = true
 
 [profile.dev.package.flate2]
 opt-level = 3
@@ -1,14 +1,14 @@
 # Profiling Meilisearch
 
-Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options.
+Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options [in Puffin Viewer](https://github.com/embarkstudios/puffin#ui).
 
 
 
 ## Profiling the Indexing Process
 
-When you enable the `profile-with-puffin` feature of Meilisearch, a Puffin HTTP server will run on Meilisearch and listen on the default _0.0.0.0:8585_ address. This server will record a "frame" whenever it executes the `IndexScheduler::tick` method.
+When you enable [the `exportPuffinReports` experimental feature](https://www.meilisearch.com/docs/learn/experimental/overview) of Meilisearch, Puffin reports with the `.puffin` extension will be automatically exported to disk. When this option is enabled, the engine will automatically create a "frame" whenever it executes the `IndexScheduler::tick` method.
 
-Once your Meilisearch is running and awaits new indexation operations, you must [install and run the `puffin_viewer` tool](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) to see the profiling results. I advise you to run the viewer with the `RUST_LOG=puffin_http::client=debug` environment variable to see the client trying to connect to your server.
+[Puffin Viewer](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) is used to analyze the reports. Those reports show areas where Meilisearch spent time during indexing.
 
 Another piece of advice on the Puffin viewer UI interface is to consider the _Merge children with same ID_ option. It can hide the exact actual timings at which events were sent. Please turn it off when you see strange gaps on the Flamegraph. It can help.
 
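For context on the `exportPuffinReports` switch mentioned in the updated text: Meilisearch experimental features are toggled at runtime over HTTP. A sketch of enabling it on a local instance, assuming the standard `/experimental-features` route, the default port, and a placeholder master key:

```sh
# Sketch: enable Puffin report export on a locally running instance.
curl \
  -X PATCH 'http://localhost:7700/experimental-features' \
  -H 'Authorization: Bearer MASTER_KEY' \
  -H 'Content-Type: application/json' \
  --data-binary '{ "exportPuffinReports": true }'
```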
@@ -526,12 +526,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
         }
         "###);
 
@@ -541,12 +541,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
         }
         "###);
 
@@ -571,12 +571,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
         }
         "###);
 
@@ -617,12 +617,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
         }
         "###);
 
@@ -632,12 +632,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
         }
         "###);
 
@@ -647,12 +647,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
         }
         "###);
 
@@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
 pub type Unchecked = settings::Unchecked;
 
 pub type Task = updates::UpdateEntry;
+pub type Kind = updates::UpdateMeta;
 
 // everything related to the errors
 pub type ResponseError = errors::ResponseError;
@@ -107,8 +108,11 @@ impl V2Reader {
     pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
         Ok(self.index_uuid.iter().map(|index| -> Result<_> {
             V2IndexReader::new(
-                index.uid.clone(),
                 &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
+                index,
+                BufReader::new(
+                    File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
+                ),
             )
         }))
     }
@@ -143,16 +147,41 @@ pub struct V2IndexReader {
 }
 
 impl V2IndexReader {
-    pub fn new(name: String, path: &Path) -> Result<Self> {
+    pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
         let meta = File::open(path.join("meta.json"))?;
         let meta: DumpMeta = serde_json::from_reader(meta)?;
 
+        let mut created_at = None;
+        let mut updated_at = None;
+
+        for line in tasks.lines() {
+            let task: Task = serde_json::from_str(&line?)?;
+            if !(task.uuid == index_uuid.uuid && task.is_finished()) {
+                continue;
+            }
+
+            let new_created_at = match task.update.meta() {
+                Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
+                _ => None,
+            };
+            let new_updated_at = task.update.finished_at();
+
+            if created_at.is_none() || created_at > new_created_at {
+                created_at = new_created_at;
+            }
+
+            if updated_at.is_none() || updated_at < new_updated_at {
+                updated_at = new_updated_at;
+            }
+        }
+
+        let current_time = OffsetDateTime::now_utc();
+
         let metadata = IndexMetadata {
-            uid: name,
+            uid: index_uuid.uid.clone(),
             primary_key: meta.primary_key,
-            // FIXME: Iterate over the whole task queue to find the creation and last update date.
-            created_at: OffsetDateTime::now_utc(),
-            updated_at: OffsetDateTime::now_utc(),
+            created_at: created_at.unwrap_or(current_time),
+            updated_at: updated_at.unwrap_or(current_time),
         };
 
         let ret = V2IndexReader {
@@ -248,12 +277,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
         }
         "###);
 
@@ -263,12 +292,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
         }
         "###);
 
@@ -293,12 +322,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
         }
         "###);
 
@@ -340,12 +369,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
         }
         "###);
 
@@ -355,12 +384,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
         }
         "###);
 
@@ -370,12 +399,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
         }
         "###);
 
@@ -227,4 +227,14 @@ impl UpdateStatus {
             _ => None,
         }
     }
+
+    pub fn finished_at(&self) -> Option<OffsetDateTime> {
+        match self {
+            UpdateStatus::Processing(_) => None,
+            UpdateStatus::Enqueued(_) => None,
+            UpdateStatus::Processed(u) => Some(u.processed_at),
+            UpdateStatus::Aborted(_) => None,
+            UpdateStatus::Failed(u) => Some(u.failed_at),
+        }
+    }
 }
@@ -12,6 +12,7 @@ license.workspace = true
 
 [dependencies]
 anyhow = "1.0.70"
+backtrace = "0.3.69"
 bincode = "1.3.3"
 csv = "1.2.1"
 derive_builder = "0.12.0"
@@ -19,6 +19,7 @@ one indexing operation.
 
 use std::collections::{BTreeSet, HashSet};
 use std::ffi::OsStr;
+use std::fmt;
 use std::fs::{self, File};
 use std::io::BufWriter;
 
@@ -199,6 +200,29 @@ impl Batch {
     }
 }
 
+impl fmt::Display for Batch {
+    /// A text used when we debug the profiling reports.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let index_uid = self.index_uid();
+        let tasks = self.ids();
+        match self {
+            Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
+            Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
+            Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
+            Batch::Dump(_) => f.write_str("Dump")?,
+            Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
+            Batch::IndexCreation { .. } => f.write_str("IndexCreation")?,
+            Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?,
+            Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?,
+            Batch::IndexSwap { .. } => f.write_str("IndexSwap")?,
+        };
+        match index_uid {
+            Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")),
+            None => f.write_fmt(format_args!(" from tasks: {tasks:?}")),
+        }
+    }
+}
+
 impl IndexOperation {
     pub fn index_uid(&self) -> &str {
         match self {
@@ -213,6 +237,30 @@ impl IndexOperation {
         }
     }
 }
 
+impl fmt::Display for IndexOperation {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            IndexOperation::DocumentOperation { .. } => {
+                f.write_str("IndexOperation::DocumentOperation")
+            }
+            IndexOperation::DocumentDeletion { .. } => {
+                f.write_str("IndexOperation::DocumentDeletion")
+            }
+            IndexOperation::IndexDocumentDeletionByFilter { .. } => {
+                f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
+            }
+            IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"),
+            IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"),
+            IndexOperation::DocumentClearAndSetting { .. } => {
+                f.write_str("IndexOperation::DocumentClearAndSetting")
+            }
+            IndexOperation::SettingsAndDocumentOperation { .. } => {
+                f.write_str("IndexOperation::SettingsAndDocumentOperation")
+            }
+        }
+    }
+}
+
 impl IndexScheduler {
     /// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`].
     ///
@@ -581,7 +629,7 @@ impl IndexScheduler {
             self.breakpoint(crate::Breakpoint::InsideProcessBatch);
         }
 
-        puffin::profile_function!(format!("{:?}", batch));
+        puffin::profile_function!(batch.to_string());
 
         match batch {
             Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
@@ -777,6 +825,10 @@ impl IndexScheduler {
                 // 2. dump the tasks
                 let mut dump_tasks = dump.create_tasks_queue()?;
                 for ret in self.all_tasks.iter(&rtxn)? {
+                    if self.must_stop_processing.get() {
+                        return Err(Error::AbortedTask);
+                    }
+
                     let (_, mut t) = ret?;
                     let status = t.status;
                     let content_file = t.content_uuid();
@@ -797,6 +849,9 @@ impl IndexScheduler {
 
                     // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
                     if let Some(content_file) = content_file {
+                        if self.must_stop_processing.get() {
+                            return Err(Error::AbortedTask);
+                        }
                         if status == Status::Enqueued {
                             let content_file = self.file_store.get_update(content_file)?;
 
@@ -836,6 +891,9 @@ impl IndexScheduler {
 
                     // 3.1. Dump the documents
                     for ret in index.all_documents(&rtxn)? {
+                        if self.must_stop_processing.get() {
+                            return Err(Error::AbortedTask);
+                        }
                         let (_id, doc) = ret?;
                         let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
                         index_dumper.push_document(&document)?;
@@ -848,13 +906,16 @@ impl IndexScheduler {
                 })?;
 
                 // 4. Dump experimental feature settings
-                let features = self.features()?.runtime_features();
+                let features = self.features().runtime_features();
                 dump.create_experimental_features(features)?;
 
                 let dump_uid = started_at.format(format_description!(
                     "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
                 )).unwrap();
 
+                if self.must_stop_processing.get() {
+                    return Err(Error::AbortedTask);
+                }
                 let path = self.dumps_path.join(format!("{}.dump", dump_uid));
                 let file = File::create(path)?;
                 dump.persist_to(BufWriter::new(file))?;
@@ -108,6 +108,8 @@ pub enum Error {
     TaskDeletionWithEmptyQuery,
     #[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
     TaskCancelationWithEmptyQuery,
+    #[error("Aborted task")]
+    AbortedTask,
 
     #[error(transparent)]
     Dump(#[from] dump::Error),
@@ -115,8 +117,13 @@ pub enum Error {
     Heed(#[from] heed::Error),
     #[error(transparent)]
     Milli(#[from] milli::Error),
-    #[error("An unexpected crash occurred when processing the task.")]
-    ProcessBatchPanicked,
+    #[error("An unexpected crash occurred when processing the task. {}", {
+        match .0 {
+            Some(report) => format!("Get /reports/{}", report),
+            None => "No report was saved.".into(),
+        }
+    })]
+    ProcessBatchPanicked(Option<uuid::Uuid>),
     #[error(transparent)]
     FileStore(#[from] file_store::Error),
     #[error(transparent)]
@@ -175,10 +182,11 @@ impl Error {
             | Error::TaskNotFound(_)
             | Error::TaskDeletionWithEmptyQuery
             | Error::TaskCancelationWithEmptyQuery
+            | Error::AbortedTask
             | Error::Dump(_)
             | Error::Heed(_)
             | Error::Milli(_)
-            | Error::ProcessBatchPanicked
+            | Error::ProcessBatchPanicked(_)
             | Error::FileStore(_)
             | Error::IoError(_)
             | Error::Persist(_)
@@ -221,7 +229,7 @@ impl ErrorCode for Error {
             Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice,
             Error::Dump(e) => e.error_code(),
             Error::Milli(e) => e.error_code(),
-            Error::ProcessBatchPanicked => Code::Internal,
+            Error::ProcessBatchPanicked(_) => Code::Internal,
             Error::Heed(e) => e.error_code(),
             Error::HeedTransaction(e) => e.error_code(),
             Error::FileStore(e) => e.error_code(),
@@ -236,6 +244,9 @@ impl ErrorCode for Error {
             Error::TaskDatabaseUpdate(_) => Code::Internal,
             Error::CreateBatch(_) => Code::Internal,
 
+            // This one should never be seen by the end user
+            Error::AbortedTask => Code::Internal,
+
             #[cfg(test)]
             Error::PlannedFailure => Code::Internal,
         }
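The reworked `ProcessBatchPanicked` variant embeds an optional report id, and its message points the caller at a reports route. Assuming the route shape implied by that message (specific to the experimental panic-report work on this branch), fetching the saved crash report could look like:

```sh
# Sketch based on the error text above; the UUID is a placeholder copied from a
# hypothetical "Get /reports/<uuid>" error message, and the route is experimental.
curl \
  -X GET 'http://localhost:7700/reports/b7f1f6a2-0000-0000-0000-000000000000' \
  -H 'Authorization: Bearer MASTER_KEY'
```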
@@ -1,6 +1,8 @@
+use std::sync::{Arc, RwLock};
+
 use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
 use meilisearch_types::heed::types::{SerdeJson, Str};
-use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
+use meilisearch_types::heed::{Database, Env, RwTxn};
 
 use crate::error::FeatureNotEnabledError;
 use crate::Result;
@@ -9,20 +11,19 @@ const EXPERIMENTAL_FEATURES: &str = "experimental-features";
 
 #[derive(Clone)]
 pub(crate) struct FeatureData {
-    runtime: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
-    instance: InstanceTogglableFeatures,
+    persisted: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
+    runtime: Arc<RwLock<RuntimeTogglableFeatures>>,
 }
 
 #[derive(Debug, Clone, Copy)]
 pub struct RoFeatures {
     runtime: RuntimeTogglableFeatures,
-    instance: InstanceTogglableFeatures,
 }
 
 impl RoFeatures {
-    fn new(txn: RoTxn<'_>, data: &FeatureData) -> Result<Self> {
-        let runtime = data.runtime_features(txn)?;
-        Ok(Self { runtime, instance: data.instance })
+    fn new(data: &FeatureData) -> Self {
+        let runtime = data.runtime_features();
+        Self { runtime }
     }
 
     pub fn runtime_features(&self) -> RuntimeTogglableFeatures {
@@ -43,13 +44,13 @@ impl RoFeatures {
     }
 
     pub fn check_metrics(&self) -> Result<()> {
-        if self.instance.metrics {
+        if self.runtime.metrics {
             Ok(())
         } else {
             Err(FeatureNotEnabledError {
                 disabled_action: "Getting metrics",
                 feature: "metrics",
-                issue_link: "https://github.com/meilisearch/meilisearch/discussions/3518",
+                issue_link: "https://github.com/meilisearch/product/discussions/625",
             }
             .into())
         }
@@ -67,15 +68,36 @@ impl RoFeatures {
             .into())
         }
     }
+
+    pub fn check_puffin(&self) -> Result<()> {
+        if self.runtime.export_puffin_reports {
+            Ok(())
+        } else {
+            Err(FeatureNotEnabledError {
+                disabled_action: "Outputting Puffin reports to disk",
+                feature: "export puffin reports",
+                issue_link: "https://github.com/meilisearch/product/discussions/693",
+            }
+            .into())
+        }
+    }
 }
 
 impl FeatureData {
     pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result<Self> {
         let mut wtxn = env.write_txn()?;
-        let runtime_features = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
+        let runtime_features_db = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
         wtxn.commit()?;
 
-        Ok(Self { runtime: runtime_features, instance: instance_features })
+        let txn = env.read_txn()?;
+        let persisted_features: RuntimeTogglableFeatures =
+            runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default();
+        let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures {
+            metrics: instance_features.metrics || persisted_features.metrics,
+            ..persisted_features
+        }));
+
+        Ok(Self { persisted: runtime_features_db, runtime })
     }
 
     pub fn put_runtime_features(
@@ -83,16 +105,25 @@ impl FeatureData {
         mut wtxn: RwTxn,
         features: RuntimeTogglableFeatures,
     ) -> Result<()> {
-        self.runtime.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
+        self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
         wtxn.commit()?;
+
+        // safe to unwrap, the lock will only fail if:
+        // 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
+        // 2. there's a panic while the thread is held -> it is only used for an assignment here.
+        let mut toggled_features = self.runtime.write().unwrap();
+        *toggled_features = features;
         Ok(())
     }
 
-    fn runtime_features(&self, txn: RoTxn) -> Result<RuntimeTogglableFeatures> {
-        Ok(self.runtime.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default())
+    fn runtime_features(&self) -> RuntimeTogglableFeatures {
+        // sound to unwrap, the lock will only fail if:
+        // 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
+        // 2. there's a panic while the thread is held -> it is only used for copying the data here
+        *self.runtime.read().unwrap()
    }
 
-    pub fn features(&self, txn: RoTxn) -> Result<RoFeatures> {
-        RoFeatures::new(txn, self)
+    pub fn features(&self) -> RoFeatures {
+        RoFeatures::new(self)
     }
 }
@@ -30,6 +30,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
         index_mapper,
         features: _,
         max_number_of_tasks: _,
+        puffin_frame: _,
         wake_up: _,
         dumps_path: _,
         snapshots_path: _,
@@ -38,6 +39,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
         test_breakpoint_sdr: _,
         planned_failures: _,
         run_loop_iteration: _,
+        panic_reader: _,
     } = scheduler;
 
     let rtxn = env.read_txn().unwrap();
@@ -26,6 +26,7 @@ mod index_mapper;
 #[cfg(test)]
 mod insta_snapshot;
 mod lru;
+mod panic_hook;
 mod utils;
 mod uuid_codec;
 
@@ -33,6 +34,7 @@ pub type Result<T> = std::result::Result<T, Error>;
 pub type TaskId = u32;
 
 use std::collections::{BTreeMap, HashMap};
+use std::fs::File;
 use std::ops::{Bound, RangeBounds};
 use std::path::{Path, PathBuf};
 use std::sync::atomic::AtomicBool;
@@ -52,6 +54,9 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
+use panic_hook::ReportReader;
+pub use panic_hook::{Panic, Report, ReportRegistry};
+use puffin::FrameView;
 use roaring::RoaringBitmap;
 use synchronoise::SignalEvent;
 use time::format_description::well_known::Rfc3339;
@@ -314,6 +319,9 @@ pub struct IndexScheduler {
     /// the finished tasks automatically.
     pub(crate) max_number_of_tasks: usize,
 
+    /// A frame to output the indexation profiling files to disk.
+    pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
+
     /// The path used to create the dumps.
     pub(crate) dumps_path: PathBuf,
 
@@ -326,6 +334,8 @@ pub struct IndexScheduler {
     /// The path to the version file of Meilisearch.
     pub(crate) version_file_path: PathBuf,
 
+    pub(crate) panic_reader: ReportReader,
+
     // ================= test
     // The next entry is dedicated to the tests.
     /// Provide a way to set a breakpoint in multiple part of the scheduler.
@@ -364,6 +374,7 @@ impl IndexScheduler {
             wake_up: self.wake_up.clone(),
             autobatching_enabled: self.autobatching_enabled,
             max_number_of_tasks: self.max_number_of_tasks,
+            puffin_frame: self.puffin_frame.clone(),
             snapshots_path: self.snapshots_path.clone(),
             dumps_path: self.dumps_path.clone(),
             auth_path: self.auth_path.clone(),
@@ -375,6 +386,7 @@ impl IndexScheduler {
             #[cfg(test)]
             run_loop_iteration: self.run_loop_iteration.clone(),
             features: self.features.clone(),
+            panic_reader: self.panic_reader.clone(),
         }
     }
 }
@@ -432,6 +444,12 @@ impl IndexScheduler {
         let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?;
         wtxn.commit()?;
 
+        const MAX_REPORT_COUNT: usize = 20;
+
+        let panic_reader = panic_hook::ReportReader::install_panic_hook(
+            std::num::NonZeroUsize::new(MAX_REPORT_COUNT).unwrap(),
+        );
+
         // allow unreachable_code to get rids of the warning in the case of a test build.
         let this = Self {
             must_stop_processing: MustStopProcessing::default(),
@@ -457,6 +475,7 @@ impl IndexScheduler {
             env,
             // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
             wake_up: Arc::new(SignalEvent::auto(true)),
+            puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
             autobatching_enabled: options.autobatching_enabled,
             max_number_of_tasks: options.max_number_of_tasks,
             dumps_path: options.dumps_path,
@@ -471,6 +490,7 @@ impl IndexScheduler {
             #[cfg(test)]
             run_loop_iteration: Arc::new(RwLock::new(0)),
             features,
+            panic_reader,
         };
 
         this.run();
@@ -572,17 +592,46 @@ impl IndexScheduler {
                 run.wake_up.wait();
 
                 loop {
+                    let puffin_enabled = run.features().check_puffin().is_ok();
+                    puffin::set_scopes_on(puffin_enabled);
+                    puffin::GlobalProfiler::lock().new_frame();
+
                     match run.tick() {
                         Ok(TickOutcome::TickAgain(_)) => (),
                         Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(),
                         Err(e) => {
-                            log::error!("{}", e);
+                            log::error!("{e}");
                             // Wait one second when an irrecoverable error occurs.
                             if !e.is_recoverable() {
                                 std::thread::sleep(Duration::from_secs(1));
                             }
                         }
                     }
+
+                    // Let's write the previous frame to disk but only if
+                    // the user wanted to profile with puffin.
+                    if puffin_enabled {
+                        let mut frame_view = run.puffin_frame.lock();
+                        if !frame_view.is_empty() {
+                            let now = OffsetDateTime::now_utc();
+                            let mut file = match File::create(format!("{}.puffin", now)) {
+                                Ok(file) => file,
+                                Err(e) => {
+                                    log::error!("{e}");
+                                    continue;
+                                }
+                            };
+                            if let Err(e) = frame_view.save_to_writer(&mut file) {
+                                log::error!("{e}");
+                            }
+                            if let Err(e) = file.sync_all() {
+                                log::error!("{e}");
+                            }
+                            // We erase this frame view as it is no more useful. We want to
+                            // measure the new frames now that we exported the previous ones.
+                            *frame_view = FrameView::default();
+                        }
+                    }
                 }
             })
             .unwrap();
@@ -1062,8 +1111,6 @@ impl IndexScheduler {
             self.breakpoint(Breakpoint::Start);
         }
 
-        puffin::GlobalProfiler::lock().new_frame();
-
         self.cleanup_task_queue()?;
 
         let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
@@ -1096,7 +1143,10 @@ impl IndexScheduler {
                 .name(String::from("batch-operation"))
                 .spawn(move || cloned_index_scheduler.process_batch(batch))
                 .unwrap();
-            handle.join().unwrap_or(Err(Error::ProcessBatchPanicked))
+            self.panic_reader
+                .join_thread(handle)
+                .unwrap_or_else(|maybe_report| Err(Error::ProcessBatchPanicked(maybe_report)))
         };
 
         #[cfg(test)]
@@ -1133,7 +1183,8 @@ impl IndexScheduler {
             // If we have an abortion error we must stop the tick here and re-schedule tasks.
             Err(Error::Milli(milli::Error::InternalError(
                 milli::InternalError::AbortedIndexation,
-            ))) => {
+            )))
+            | Err(Error::AbortedTask) => {
                 #[cfg(test)]
                 self.breakpoint(Breakpoint::AbortedIndexation);
                 wtxn.abort().map_err(Error::HeedTransaction)?;
@@ -1259,9 +1310,8 @@ impl IndexScheduler {
         Ok(IndexStats { is_indexing, inner_stats: index_stats })
     }
 
-    pub fn features(&self) -> Result<RoFeatures> {
-        let rtxn = self.read_txn()?;
-        self.features.features(rtxn)
+    pub fn features(&self) -> RoFeatures {
+        self.features.features()
     }
 
     pub fn put_runtime_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {
@@ -1277,6 +1327,10 @@ impl IndexScheduler {
         }
     }
 
+    pub fn reports(&self) -> Arc<RwLock<ReportRegistry>> {
+        self.panic_reader.registry()
+    }
+
     /// Blocks the thread until the test handle asks to progress to/through this breakpoint.
     ///
     /// Two messages are sent through the channel for each breakpoint.
@@ -4290,4 +4344,26 @@ mod tests {
         }
         "###);
     }
+
+    #[test]
+    fn cancel_processing_dump() {
+        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+
+        let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
+        let dump_cancellation = KindWithContent::TaskCancelation {
+            query: "cancel dump".to_owned(),
+            tasks: RoaringBitmap::from_iter([0]),
+        };
+        let _ = index_scheduler.register(dump_creation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
+        handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
+
+        let _ = index_scheduler.register(dump_cancellation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
+
+        snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
+
+        handle.advance_one_successful_batch();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
+    }
 }
index-scheduler/src/panic_hook.rs (new file, 211 lines)
@@ -0,0 +1,211 @@
+//! Panic hook designed to fetch a panic from a subthread and recover it on join.
+
+use std::collections::VecDeque;
+use std::num::NonZeroUsize;
+use std::panic::PanicInfo;
+use std::sync::{Arc, RwLock};
+use std::thread::{JoinHandle, ThreadId};
+
+use backtrace::Backtrace;
+
+// Represents a panic in a shallowy structured fashion
+pub struct Panic {
+    pub payload: Option<String>,
+    pub location: Option<String>,
+    pub thread_name: Option<String>,
+    pub thread_id: ThreadId,
+    pub backtrace: Backtrace,
+}
+
+/// A panic enriched with a unique id
+#[derive(serde::Serialize)]
+pub struct Report {
+    pub id: uuid::Uuid,
+    #[serde(serialize_with = "serialize_panic")]
+    pub panic: Panic,
+}
+
+fn serialize_panic<S>(panic: &Panic, s: S) -> std::result::Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    use serde::Serialize;
+
+    panic.to_json().serialize(s)
+}
+
+impl Report {
+    pub fn new(panic: Panic) -> Self {
+        Self { id: uuid::Uuid::new_v4(), panic }
+    }
+}
+
+impl Panic {
+    pub fn to_json(&self) -> serde_json::Value {
+        json::panic_to_json(self)
+    }
+}
+
+mod json {
+    use backtrace::{Backtrace, BacktraceFrame, BacktraceSymbol};
+    use serde_json::{json, Value};
+
+    use super::Panic;
+
+    fn symbol_to_json(symbol: &BacktraceSymbol) -> Value {
+        let address = symbol.addr().map(|addr| format!("{:p}", addr));
+        let column = symbol.colno();
+        let line = symbol.lineno();
+        let function = symbol.name().map(|name| name.to_string());
+        let filename = symbol.filename();
+        json!({
+            "function": function,
+            "filename": filename,
+            "line": line,
+            "column": column,
+            "address": address,
+        })
+    }
+
+    fn frame_to_json(frame: &BacktraceFrame) -> Value {
+        let symbols: Vec<_> = frame.symbols().iter().map(symbol_to_json).collect();
+        match symbols.as_slice() {
+            [] => {
+                let address = format!("{:p}", frame.ip());
+                json!({"address": address})
+            }
+            [symbol] => json!(symbol),
+            symbols => json!(symbols),
+        }
+    }
+
+    fn backtrace_to_json(backtrace: &Backtrace) -> Value {
+        let frames: Vec<_> = backtrace.frames().iter().map(frame_to_json).collect();
+        json!(frames)
+    }
+
+    pub fn panic_to_json(panic: &Panic) -> Value {
+        let thread_id = format!("{:?}", panic.thread_id);
+        serde_json::json!({
+            "payload": panic.payload,
+            "location": panic.location,
+            "thread": {
+                "id": thread_id,
+                "name": panic.thread_name,
+            },
+            "backtrace": backtrace_to_json(&panic.backtrace),
+        })
+    }
+}
+
+struct ReportWriter(Arc<RwLock<ReportRegistry>>);
+
+/// A FIFO queue of reports.
+pub struct ReportRegistry {
+    reports: std::collections::VecDeque<Report>,
+}
+
+impl ReportRegistry {
+    pub fn new(capacity: NonZeroUsize) -> Self {
+        Self { reports: VecDeque::with_capacity(capacity.get()) }
+    }
+
+    pub fn push(&mut self, report: Report) -> Option<Report> {
+        let popped = if self.reports.len() == self.reports.capacity() {
+            self.reports.pop_back()
+        } else {
+            None
+        };
+        self.reports.push_front(report);
+        popped
+    }
+
+    pub fn iter(&self) -> impl Iterator<Item = &Report> {
+        self.reports.iter()
+    }
+
+    pub fn find(&self, report_id: uuid::Uuid) -> Option<&Report> {
+        self.iter().find(|report| report.id == report_id)
+    }
+}
+
+impl ReportWriter {
+    #[track_caller]
+    fn write_panic(&self, panic_info: &PanicInfo<'_>) {
+        let payload = panic_info
+            .payload()
+            .downcast_ref::<&str>()
+            .map(ToString::to_string)
+            .or_else(|| panic_info.payload().downcast_ref::<String>().cloned());
+        let location = panic_info.location().map(|loc| {
+            format!(
+                "{file}:{line}:{column}",
+                file = loc.file(),
+                line = loc.line(),
+                column = loc.column()
+            )
+        });
+
+        let thread_name = std::thread::current().name().map(ToString::to_string);
+        let thread_id = std::thread::current().id();
+        let backtrace = backtrace::Backtrace::new();
+
+        let panic = Panic { payload, location, thread_name, thread_id, backtrace };
+
+        let report = Report::new(panic);
+
+        log::error!(
+            "An unexpected panic occurred on thread {name} at {location}: {payload}. See report '{report}' for details.",
+            payload = report.panic.payload.as_deref().unwrap_or("Box<dyn Any>"),
+            name = report.panic.thread_name.as_deref().unwrap_or("<unnamed>"),
+            location = report.panic.location.as_deref().unwrap_or("<unknown>"),
+            report = report.id,
+        );
+
+        if let Ok(mut registry) = self.0.write() {
+            if let Some(old_report) = registry.push(report) {
+                log::trace!("Forgetting report {} to make space for new report.", old_report.id)
+            }
+        }
+    }
+}
+
+/// Reads the reports written in case of a panic.
+#[derive(Clone)]
+pub struct ReportReader(Arc<RwLock<ReportRegistry>>);
+
+impl ReportReader {
+    /// Installs a new global panic hook, overriding any existing hook.
+    ///
+    /// The hook writes any incoming panic in reports.
+    /// The reports can then be read by the returned [`ReportReader`].
+    pub fn install_panic_hook(capacity: NonZeroUsize) -> Self {
+        let registry = Arc::new(RwLock::new(ReportRegistry::new(capacity)));
+        let reader = ReportReader(registry.clone());
+        let writer = ReportWriter(registry.clone());
+
+        std::panic::set_hook(Box::new(move |panic_info| writer.write_panic(panic_info)));
+        reader
+    }
+
+    /// Join the thread corresponding to the passed handle, recovering either its value
+    /// or, in case the thread panicked, the id of the report corresponding to the panic.
+    ///
+    /// The id can be used to read the report from the [`self.registry()`].
+    pub fn join_thread<T>(&self, thread: JoinHandle<T>) -> Result<T, Option<uuid::Uuid>> {
+        let thread_id = thread.thread().id();
+        thread.join().map_err(|_e| {
+            self.0
+                .read()
+                .unwrap()
+                .iter()
+                .find(|report| report.panic.thread_id == thread_id)
+                .map(|report| report.id)
+        })
+    }
+
+    /// Returns a registry that can be used to read the reports written during a panic.
+    pub fn registry(&self) -> Arc<RwLock<ReportRegistry>> {
+        self.0.clone()
+    }
+}
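A minimal usage sketch of the module above, not part of the diff: install the hook once, run work on a spawned thread, and recover the report id on join.

    use std::num::NonZeroUsize;

    let reader = ReportReader::install_panic_hook(NonZeroUsize::new(20).unwrap());
    let handle = std::thread::spawn(|| panic!("boom"));
    match reader.join_thread(handle) {
        Ok(_value) => { /* the thread finished normally */ }
        Err(Some(report_id)) => {
            // The panic was recorded; the full report can be looked up in the registry.
            let registry = reader.registry();
            let _report_json = registry.read().unwrap().find(report_id).map(|r| r.panic.to_json());
        }
        Err(None) => { /* the thread panicked but no report matched its ThreadId */ }
    }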
@@ -0,0 +1,35 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
@@ -0,0 +1,45 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,]
+canceled [0,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+1 [0,]
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
@@ -0,0 +1,38 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[0,]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
@@ -88,7 +88,6 @@ pub trait ErrorCode {
     }
 }
 
-#[allow(clippy::enum_variant_names)]
 enum ErrorType {
     Internal,
     InvalidRequest,
@@ -298,6 +297,7 @@ MissingSwapIndexes , InvalidRequest , BAD_REQUEST ;
 MissingTaskFilters , InvalidRequest , BAD_REQUEST ;
 NoSpaceLeftOnDevice , System , UNPROCESSABLE_ENTITY;
 PayloadTooLarge , InvalidRequest , PAYLOAD_TOO_LARGE ;
+ReportNotFound , InvalidRequest , NOT_FOUND ;
 TaskNotFound , InvalidRequest , NOT_FOUND ;
 TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ;
 UnretrievableDocument , Internal , BAD_REQUEST ;
@@ -5,6 +5,8 @@ use serde::{Deserialize, Serialize};
 pub struct RuntimeTogglableFeatures {
     pub score_details: bool,
     pub vector_store: bool,
+    pub metrics: bool,
+    pub export_puffin_reports: bool,
 }
 
 #[derive(Default, Debug, Clone, Copy)]
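The two new fields surface in the `PATCH /experimental-features` payload under camelCase names, as exercised by the route tests further down. A sketch of such a body, using the same `json!` helper the tests already use:

    let body = json!({
        "metrics": true,
        "exportPuffinReports": false
    });
    // e.g. in a test: let (response, code) = server.set_features(body).await;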
@@ -69,8 +69,7 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
 pin-project-lite = "0.2.9"
 platform-dirs = "0.3.0"
 prometheus = { version = "0.13.3", features = ["process"] }
-puffin = "0.16.0"
-puffin_http = { version = "0.13.0", optional = true }
+puffin = { version = "0.16.0", features = ["serialization"] }
 rand = "0.8.5"
 rayon = "1.7.0"
 regex = "1.7.3"
@@ -135,7 +134,6 @@ zip = { version = "0.6.4", optional = true }
 [features]
 default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
 analytics = ["segment"]
-profile-with-puffin = ["dep:puffin_http"]
 mini-dashboard = [
   "actix-web-static-files",
   "static-files",
@@ -51,6 +51,8 @@ pub enum MeilisearchHttpError {
     DocumentFormat(#[from] DocumentFormatError),
     #[error(transparent)]
     Join(#[from] JoinError),
+    #[error("Report `{0}` not found. Either its id is incorrect, or it was deleted. To save on memory, only a limited amount of reports are kept.")]
+    ReportNotFound(uuid::Uuid),
 }
 
 impl ErrorCode for MeilisearchHttpError {
@@ -74,6 +76,7 @@ impl ErrorCode for MeilisearchHttpError {
             MeilisearchHttpError::FileStore(_) => Code::Internal,
             MeilisearchHttpError::DocumentFormat(e) => e.error_code(),
             MeilisearchHttpError::Join(_) => Code::Internal,
+            MeilisearchHttpError::ReportNotFound(_) => Code::ReportNotFound,
         }
     }
 }
@@ -114,10 +114,7 @@ pub fn create_app(
         .configure(routes::configure)
         .configure(|s| dashboard(s, enable_dashboard));
 
-    let app = app.wrap(actix_web::middleware::Condition::new(
-        opt.experimental_enable_metrics,
-        middleware::RouteMetrics,
-    ));
+    let app = app.wrap(middleware::RouteMetrics);
     app.wrap(
         Cors::default()
             .send_wildcard()
@@ -30,10 +30,6 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
 async fn main() -> anyhow::Result<()> {
     let (opt, config_read_from) = Opt::try_build()?;
 
-    #[cfg(feature = "profile-with-puffin")]
-    let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
-    puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));
-
     anyhow::ensure!(
         !(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
         "The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
@@ -3,8 +3,10 @@
 use std::future::{ready, Ready};
 
 use actix_web::dev::{self, Service, ServiceRequest, ServiceResponse, Transform};
+use actix_web::web::Data;
 use actix_web::Error;
 use futures_util::future::LocalBoxFuture;
+use index_scheduler::IndexScheduler;
 use prometheus::HistogramTimer;
 
 pub struct RouteMetrics;
@@ -47,19 +49,27 @@ where
 
     fn call(&self, req: ServiceRequest) -> Self::Future {
         let mut histogram_timer: Option<HistogramTimer> = None;
-        let request_path = req.path();
-        let is_registered_resource = req.resource_map().has_resource(request_path);
-        if is_registered_resource {
-            let request_method = req.method().to_string();
-            histogram_timer = Some(
-                crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
-                    .with_label_values(&[&request_method, request_path])
-                    .start_timer(),
-            );
-            crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
-                .with_label_values(&[&request_method, request_path])
-                .inc();
-        }
+
+        // calling unwrap here is safe because index scheduler is added to app data while creating actix app.
+        // also, the tests will fail if this is not present.
+        let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
+        let features = index_scheduler.features();
+
+        if features.check_metrics().is_ok() {
+            let request_path = req.path();
+            let is_registered_resource = req.resource_map().has_resource(request_path);
+            if is_registered_resource {
+                let request_method = req.method().to_string();
+                histogram_timer = Some(
+                    crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
+                        .with_label_values(&[&request_method, request_path])
+                        .start_timer(),
+                );
+                crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
+                    .with_label_values(&[&request_method, request_path])
+                    .inc();
+            }
+        };
 
         let fut = self.service.call(req);
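Design note on the hunk above: the old `Condition` wrapper decided once at startup, so toggling the `metrics` experimental feature over HTTP could not take effect without a restart. Wrapping `RouteMetrics` unconditionally and asking the scheduler on every request keeps the gate dynamic. The per-request check reduces to roughly this sketch of the logic already shown above:

    let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
    if index_scheduler.features().check_metrics().is_ok() {
        // record the request counter and latency histogram
    }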
@@ -29,12 +29,12 @@ async fn get_features(
     >,
     req: HttpRequest,
     analytics: Data<dyn Analytics>,
-) -> Result<HttpResponse, ResponseError> {
-    let features = index_scheduler.features()?;
+) -> HttpResponse {
+    let features = index_scheduler.features();
 
     analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
     debug!("returns: {:?}", features.runtime_features());
-    Ok(HttpResponse::Ok().json(features.runtime_features()))
+    HttpResponse::Ok().json(features.runtime_features())
 }
 
 #[derive(Debug, Deserr)]
@@ -44,6 +44,10 @@ pub struct RuntimeTogglableFeatures {
     pub score_details: Option<bool>,
     #[deserr(default)]
     pub vector_store: Option<bool>,
+    #[deserr(default)]
+    pub metrics: Option<bool>,
+    #[deserr(default)]
+    pub export_puffin_reports: Option<bool>,
 }
 
 async fn patch_features(
@@ -55,26 +59,36 @@ async fn patch_features(
     req: HttpRequest,
     analytics: Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
-    let features = index_scheduler.features()?;
+    let features = index_scheduler.features();
 
     let old_features = features.runtime_features();
 
     let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
         score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
         vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
+        metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
+        export_puffin_reports: new_features
+            .0
+            .export_puffin_reports
+            .unwrap_or(old_features.export_puffin_reports),
     };
 
     // explicitly destructure for analytics rather than using the `Serialize` implementation, because
     // the it renames to camelCase, which we don't want for analytics.
     // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future.
-    let meilisearch_types::features::RuntimeTogglableFeatures { score_details, vector_store } =
-        new_features;
+    let meilisearch_types::features::RuntimeTogglableFeatures {
+        score_details,
+        vector_store,
+        metrics,
+        export_puffin_reports,
+    } = new_features;
 
     analytics.publish(
         "Experimental features Updated".to_string(),
         json!({
             "score_details": score_details,
             "vector_store": vector_store,
+            "metrics": metrics,
+            "export_puffin_reports": export_puffin_reports,
         }),
         Some(&req),
     );
@@ -68,7 +68,7 @@ pub async fn search(
     }
 
     let index = index_scheduler.index(&index_uid)?;
-    let features = index_scheduler.features()?;
+    let features = index_scheduler.features();
     let search_result = tokio::task::spawn_blocking(move || {
         perform_facet_search(&index, search_query, facet_query, facet_name, features)
     })
@@ -157,7 +157,7 @@ pub async fn search_with_url_query(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let features = index_scheduler.features()?;
+    let features = index_scheduler.features();
     let search_result =
         tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
     if let Ok(ref search_result) = search_result {
@@ -192,7 +192,7 @@ pub async fn search_with_post(
 
     let index = index_scheduler.index(&index_uid)?;
 
-    let features = index_scheduler.features()?;
+    let features = index_scheduler.features();
     let search_result =
         tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
     if let Ok(ref search_result) = search_result {
@@ -19,7 +19,7 @@ pub async fn get_metrics(
     index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
     auth_controller: Data<AuthController>,
 ) -> Result<HttpResponse, ResponseError> {
-    index_scheduler.features()?.check_metrics()?;
+    index_scheduler.features().check_metrics()?;
     let auth_filters = index_scheduler.filters();
     if !auth_filters.all_indexes_authorized() {
         let mut error = ResponseError::from(AuthenticationError::InvalidToken);
@@ -24,6 +24,7 @@ pub mod features;
 pub mod indexes;
 mod metrics;
 mod multi_search;
+mod reports;
 mod snapshot;
 mod swap_indexes;
 pub mod tasks;
@@ -40,7 +41,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
         .service(web::scope("/multi-search").configure(multi_search::configure))
         .service(web::scope("/swap-indexes").configure(swap_indexes::configure))
         .service(web::scope("/metrics").configure(metrics::configure))
-        .service(web::scope("/experimental-features").configure(features::configure));
+        .service(web::scope("/experimental-features").configure(features::configure))
+        .service(web::scope("/reports").configure(reports::configure));
 }
 
 #[derive(Debug, Serialize)]
@@ -41,7 +41,7 @@ pub async fn multi_search_with_post(
     let queries = params.into_inner().queries;
 
     let mut multi_aggregate = MultiSearchAggregator::from_queries(&queries, &req);
-    let features = index_scheduler.features()?;
+    let features = index_scheduler.features();
 
     // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
     // so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code
meilisearch/src/routes/reports.rs (new file, 39 lines)
@@ -0,0 +1,39 @@
+use actix_web::web::{self, Data};
+use actix_web::HttpResponse;
+use index_scheduler::{IndexScheduler, Report};
+use meilisearch_types::error::ResponseError;
+use meilisearch_types::keys::actions;
+
+use crate::extractors::authentication::policies::ActionPolicy;
+use crate::extractors::authentication::GuardedData;
+use crate::extractors::sequential_extractor::SeqHandler;
+
+pub fn configure(cfg: &mut web::ServiceConfig) {
+    cfg.service(web::resource("").route(web::get().to(list_reports))).service(
+        web::scope("/{report_uid}")
+            .service(web::resource("").route(web::get().to(SeqHandler(get_report)))),
+    );
+}
+
+pub async fn list_reports(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_ALL }>, Data<IndexScheduler>>,
+) -> Result<HttpResponse, ResponseError> {
+    let reports = &index_scheduler.reports();
+    let reports = &reports.read().unwrap();
+    let reports: Vec<&Report> = reports.iter().collect();
+
+    Ok(HttpResponse::Ok().json(reports))
+}
+
+pub async fn get_report(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_ALL }>, Data<IndexScheduler>>,
+    report_id: web::Path<uuid::Uuid>,
+) -> Result<HttpResponse, ResponseError> {
+    let reports = &index_scheduler.reports();
+    let reports = &reports.read().unwrap();
+    let report = reports
+        .find(*report_id)
+        .ok_or(crate::error::MeilisearchHttpError::ReportNotFound(*report_id))?;
+
+    Ok(HttpResponse::Ok().json(report))
+}
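The new routes are read-only and reuse the `SETTINGS_ALL` action. Hypothetical test helpers, not in the diff, mirroring the `Server::get_metrics` helper added further down, would let the integration tests exercise them:

    impl Server {
        pub async fn list_reports(&self) -> (Value, StatusCode) {
            self.service.get("/reports").await
        }

        pub async fn get_report(&self, report_id: &str) -> (Value, StatusCode) {
            self.service.get(format!("/reports/{report_id}")).await
        }
    }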
@@ -2,10 +2,12 @@ use std::collections::{HashMap, HashSet};
 
 use ::time::format_description::well_known::Rfc3339;
 use maplit::{hashmap, hashset};
+use meilisearch::Opt;
 use once_cell::sync::Lazy;
+use tempfile::TempDir;
 use time::{Duration, OffsetDateTime};
 
-use crate::common::{Server, Value};
+use crate::common::{default_settings, Server, Value};
 use crate::json;
 
 pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
@@ -195,7 +197,9 @@ async fn access_authorized_master_key() {
 
 #[actix_rt::test]
 async fn access_authorized_restricted_index() {
-    let mut server = Server::new_auth().await;
+    let dir = TempDir::new().unwrap();
+    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
+    let mut server = Server::new_auth_with_options(enable_metrics, dir).await;
     for ((method, route), actions) in AUTHORIZATIONS.iter() {
         for action in actions {
             // create a new API key letting only the needed action.
@@ -202,6 +202,10 @@ impl Server {
     pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
         self.service.patch("/experimental-features", value).await
     }
+
+    pub async fn get_metrics(&self) -> (Value, StatusCode) {
+        self.service.get("/metrics").await
+    }
 }
 
 pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
@@ -221,7 +225,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
             skip_index_budget: true,
             ..Parser::parse_from(None as Option<&str>)
         },
-        experimental_enable_metrics: true,
+        experimental_enable_metrics: false,
         ..Parser::parse_from(None as Option<&str>)
     }
 }
@@ -1,4 +1,7 @@
-use crate::common::Server;
+use meilisearch::Opt;
+use tempfile::TempDir;
+
+use crate::common::{default_settings, Server};
 use crate::json;
 
 /// Feature name to test against.
@@ -16,7 +19,9 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": false
+      "vectorStore": false,
+      "metrics": false,
+      "exportPuffinReports": false
     }
     "###);
 
@@ -26,7 +31,9 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true
+      "vectorStore": true,
+      "metrics": false,
+      "exportPuffinReports": false
     }
     "###);
 
@@ -36,7 +43,9 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true
+      "vectorStore": true,
+      "metrics": false,
+      "exportPuffinReports": false
    }
     "###);
 
@@ -47,7 +56,9 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true
+      "vectorStore": true,
+      "metrics": false,
+      "exportPuffinReports": false
     }
     "###);
 
@@ -58,11 +69,73 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true
+      "vectorStore": true,
+      "metrics": false,
+      "exportPuffinReports": false
     }
     "###);
 }
 
+#[actix_rt::test]
+async fn experimental_feature_metrics() {
+    // instance flag for metrics enables metrics at startup
+    let dir = TempDir::new().unwrap();
+    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
+    let server = Server::new_with_options(enable_metrics).await.unwrap();
+
+    let (response, code) = server.get_features().await;
+
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "scoreDetails": false,
+      "vectorStore": false,
+      "metrics": true,
+      "exportPuffinReports": false
+    }
+    "###);
+
+    let (response, code) = server.get_metrics().await;
+    meili_snap::snapshot!(code, @"200 OK");
+
+    // metrics are not returned in json format
+    // so the test server will return null
+    meili_snap::snapshot!(response, @"null");
+
+    // disabling metrics results in invalid request
+    let (response, code) = server.set_features(json!({"metrics": false})).await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(response["metrics"], @"false");
+
+    let (response, code) = server.get_metrics().await;
+    meili_snap::snapshot!(code, @"400 Bad Request");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "message": "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625",
+      "code": "feature_not_enabled",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#feature_not_enabled"
+    }
+    "###);
+
+    // enabling metrics via HTTP results in valid request
+    let (response, code) = server.set_features(json!({"metrics": true})).await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(response["metrics"], @"true");
+
+    let (response, code) = server.get_metrics().await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(response, @"null");
+
+    // startup without flag respects persisted metrics value
+    let disable_metrics =
+        Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) };
+    let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap();
+    let (response, code) = server_no_flag.get_metrics().await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(response, @"null");
+}
+
 #[actix_rt::test]
 async fn errors() {
     let server = Server::new().await;
@@ -73,7 +146,7 @@ async fn errors() {
     meili_snap::snapshot!(code, @"400 Bad Request");
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
-      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`",
+      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`",
       "code": "bad_request",
       "type": "invalid_request",
       "link": "https://docs.meilisearch.com/errors#bad_request"
meilisearch/tests/search/distinct.rs (new file, 63 lines)
@@ -0,0 +1,63 @@
+use meili_snap::snapshot;
+use once_cell::sync::Lazy;
+
+use crate::common::{Server, Value};
+use crate::json;
+
+pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
+    json!([
+        {"productId": 1, "shopId": 1},
+        {"productId": 2, "shopId": 1},
+        {"productId": 3, "shopId": 2},
+        {"productId": 4, "shopId": 2},
+        {"productId": 5, "shopId": 3},
+        {"productId": 6, "shopId": 3},
+        {"productId": 7, "shopId": 4},
+        {"productId": 8, "shopId": 4},
+        {"productId": 9, "shopId": 5},
+        {"productId": 10, "shopId": 5}
+    ])
+});
+
+pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
+pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";
+
+/// testing: https://github.com/meilisearch/meilisearch/issues/4078
+#[actix_rt::test]
+async fn distinct_search_with_offset_no_ranking() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
+    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
+    index.wait_task(1).await;
+
+    fn get_hits(Value(response): Value) -> Vec<i64> {
+        let hits_array = response["hits"].as_array().unwrap();
+        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
+    }
+
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"2");
+    snapshot!(format!("{:?}", hits), @"[1, 2]");
+
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"2");
+    snapshot!(format!("{:?}", hits), @"[3, 4]");
+
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"1");
+    snapshot!(format!("{:?}", hits), @"[5]");
+
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"0");
+}
@@ -1,6 +1,7 @@
 // This modules contains all the test concerning search. Each particular feature of the search
 // should be tested in its own module to isolate tests and keep the tests readable.
 
+mod distinct;
 mod errors;
 mod facet_search;
 mod formatted;
@@ -816,7 +817,7 @@ async fn experimental_feature_score_details() {
             },
             "proximity": {
               "order": 2,
-              "score": 0.875
+              "score": 0.75
             },
             "attribute": {
               "order": 3,
@@ -79,7 +79,6 @@ big_s = "1.0.2"
 insta = "1.29.0"
 maplit = "1.0.2"
 md5 = "0.7.0"
-meili-snap = { path = "../meili-snap" }
 rand = { version = "0.8.5", features = ["small_rng"] }
 
 [features]
@@ -1,4 +1,5 @@
 use std::fs::File;
+use std::io::BufReader;
 use std::{io, str};
 
 use obkv::KvReader;
@@ -19,14 +20,14 @@ use crate::FieldId;
 pub struct EnrichedDocumentsBatchReader<R> {
     documents: DocumentsBatchReader<R>,
     primary_key: String,
-    external_ids: grenad::ReaderCursor<File>,
+    external_ids: grenad::ReaderCursor<BufReader<File>>,
 }
 
 impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
     pub fn new(
         documents: DocumentsBatchReader<R>,
         primary_key: String,
-        external_ids: grenad::Reader<File>,
+        external_ids: grenad::Reader<BufReader<File>>,
     ) -> Result<Self, Error> {
         if documents.documents_count() as u64 == external_ids.len() {
             Ok(EnrichedDocumentsBatchReader {
@@ -75,7 +76,7 @@ pub struct EnrichedDocument<'a> {
 pub struct EnrichedDocumentsBatchCursor<R> {
     documents: DocumentsBatchCursor<R>,
     primary_key: String,
-    external_ids: grenad::ReaderCursor<File>,
+    external_ids: grenad::ReaderCursor<BufReader<File>>,
 }
 
 impl<R> EnrichedDocumentsBatchCursor<R> {
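The only functional change above is wrapping the grenad reader's File in a std::io::BufReader. A minimal sketch of the same idea in isolation, assuming an illustrative path and helper name rather than meilisearch code:

use std::fs::File;
use std::io::{BufReader, Read};

// Buffered reads batch the many small reads a cursor performs into fewer,
// larger syscalls, which is what the BufReader<File> change above is after.
fn read_all_buffered(path: &str) -> std::io::Result<Vec<u8>> {
    let file = File::open(path)?;
    let mut reader = BufReader::new(file);
    let mut bytes = Vec::new();
    reader.read_to_end(&mut bytes)?;
    Ok(bytes)
}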
@@ -6,7 +6,6 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::BytesDecodeOwned;
-use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 
 /// This is the limit where using a byteorder became less size efficient
 /// than using a direct roaring encoding, it is also the point where we are able
@@ -61,16 +60,12 @@ impl CboRoaringBitmapCodec {
     /// if the merged values length is under the threshold, values are directly
     /// serialized in the buffer else a RoaringBitmap is created from the
     /// values and is serialized in the buffer.
-    pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
-    where
-        I: IntoIterator<Item = A>,
-        A: AsRef<[u8]>,
-    {
+    pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
         let mut roaring = RoaringBitmap::new();
         let mut vec = Vec::new();
 
         for bytes in slices {
-            if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
+            if bytes.len() <= THRESHOLD * size_of::<u32>() {
                 let mut reader = bytes.as_ref();
                 while let Ok(integer) = reader.read_u32::<NativeEndian>() {
                     vec.push(integer);
@@ -90,7 +85,7 @@ impl CboRoaringBitmapCodec {
                 }
             } else {
                 // We can unwrap safely because the vector is sorted upper.
-                let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
+                let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
                 roaring.serialize_into(buffer)?;
             }
         } else {
@@ -100,28 +95,6 @@ impl CboRoaringBitmapCodec {
 
         Ok(())
     }
-
-    /// Merges a DelAdd delta into a CboRoaringBitmap.
-    pub fn merge_deladd_into(
-        deladd: KvReaderDelAdd<'_>,
-        previous: &[u8],
-        buffer: &mut Vec<u8>,
-    ) -> io::Result<()> {
-        // Deserialize the bitmap that is already there
-        let mut previous = Self::deserialize_from(previous)?;
-
-        // Remove integers we no more want in the previous bitmap
-        if let Some(value) = deladd.get(DelAdd::Deletion) {
-            previous -= Self::deserialize_from(value)?;
-        }
-
-        // Insert the new integers we want in the previous bitmap
-        if let Some(value) = deladd.get(DelAdd::Addition) {
-            previous |= Self::deserialize_from(value)?;
-        }
-
-        previous.serialize_into(buffer)
-    }
 }
 
 impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
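For context, the codec keeps small id sets as raw native-endian u32s and only switches to a serialized RoaringBitmap past a size threshold. A self-contained sketch of that encoding decision, assuming an illustrative THRESHOLD value and helper name rather than the crate's actual constants:

use std::io;

use byteorder::{NativeEndian, WriteBytesExt};
use roaring::RoaringBitmap;

// Illustrative threshold: below this many integers, raw u32s take less
// space than a serialized RoaringBitmap.
const THRESHOLD: usize = 7;

// Serialize a sorted, deduplicated list of ids either as raw u32s or as a
// RoaringBitmap, mirroring the idea behind CboRoaringBitmapCodec::merge_into.
fn serialize_ids(ids: &[u32], buffer: &mut Vec<u8>) -> io::Result<()> {
    if ids.len() <= THRESHOLD {
        for &id in ids {
            buffer.write_u32::<NativeEndian>(id)?;
        }
    } else {
        let bitmap = RoaringBitmap::from_sorted_iter(ids.iter().copied()).unwrap();
        bitmap.serialize_into(buffer)?;
    }
    Ok(())
}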
@@ -1,6 +1,7 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fs::File;
+use std::mem::size_of;
 use std::path::Path;
 
 use charabia::{Language, Script};
@@ -13,6 +14,7 @@ use time::OffsetDateTime;
 
 use crate::distance::NDotProductPoint;
 use crate::error::{InternalError, UserError};
+use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
@@ -53,6 +55,7 @@ pub mod main_key {
     /// e.g. vector-hnsw0x0032.
     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
+    pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
    pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
@@ -61,6 +64,7 @@ pub mod main_key {
     pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
     pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
     pub const DICTIONARY_KEY: &str = "dictionary";
+    pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
     pub const SYNONYMS_KEY: &str = "synonyms";
     pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
     pub const WORDS_FST_KEY: &str = "words-fst";
@@ -115,16 +119,16 @@ pub struct Index {
     pub(crate) main: PolyDatabase,
 
     /// A word and all the documents ids containing the word.
-    pub word_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub word_docids: Database<Str, RoaringBitmapCodec>,
 
     /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
-    pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix.
-    pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
-    pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
 
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
@@ -922,6 +926,44 @@ impl Index {
 
     /* faceted documents ids */
 
+    /// Writes the documents ids that are faceted under this field id for the given facet type.
+    pub fn put_faceted_documents_ids(
+        &self,
+        wtxn: &mut RwTxn,
+        field_id: FieldId,
+        facet_type: FacetType,
+        docids: &RoaringBitmap,
+    ) -> heed::Result<()> {
+        let key = match facet_type {
+            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
+            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
+        };
+        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
+        buffer[..key.len()].copy_from_slice(key.as_bytes());
+        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
+        self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
+    }
+
+    /// Retrieve all the documents ids that are faceted under this field id for the given facet type.
+    pub fn faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+        facet_type: FacetType,
+    ) -> heed::Result<RoaringBitmap> {
+        let key = match facet_type {
+            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
+            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
+        };
+        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
+        buffer[..key.len()].copy_from_slice(key.as_bytes());
+        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
+        match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
+            Some(docids) => Ok(docids),
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
     /// Retrieve all the documents which contain this field id set as null
     pub fn null_faceted_documents_ids(
         &self,
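The two helpers added above store one bitmap per (facet type, field id) pair under a composed key in the main database. A hedged sketch of just the key layout they build, assuming FieldId = u16 as in milli and using the prefix constants added above (the helper name is illustrative):

// Builds the main-database key used by put_faceted_documents_ids /
// faceted_documents_ids: the textual prefix followed by the field id
// encoded as big-endian bytes.
fn faceted_documents_ids_key(prefix: &str, field_id: u16) -> Vec<u8> {
    let mut key = Vec::with_capacity(prefix.len() + std::mem::size_of::<u16>());
    key.extend_from_slice(prefix.as_bytes());
    key.extend_from_slice(&field_id.to_be_bytes());
    key
}

// e.g. faceted_documents_ids_key("number-faceted-documents-ids", 3)
// yields b"number-faceted-documents-ids\x00\x03".to_vec()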
@@ -2,7 +2,7 @@ use std::cmp;
 
 use crate::{relative_from_absolute_position, Position};
 
-pub const MAX_DISTANCE: u32 = 8;
+pub const MAX_DISTANCE: u32 = 4;
 
 pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
     if lhs <= rhs {
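Halving MAX_DISTANCE is what shifts every proximity rank/max_rank value in the snapshot updates further down. The updated values are consistent with one bucket per possible cost per adjacent term pair, i.e. max_rank = (MAX_DISTANCE - 1) * pairs + 1; this is an inference from the snapshot values below, not a quote of milli's internals:

const MAX_DISTANCE: u32 = 4;

// Maximum proximity rank for a query with `pairs` adjacent term pairs,
// assuming one bucket per possible cost from 0 to MAX_DISTANCE - 1 per pair.
fn max_proximity_rank(pairs: u32) -> u32 {
    (MAX_DISTANCE - 1) * pairs + 1
}

fn main() {
    // With MAX_DISTANCE = 8 these were 8, 22 and 57; with 4 they become:
    assert_eq!(max_proximity_rank(1), 4);
    assert_eq!(max_proximity_rank(3), 10);
    assert_eq!(max_proximity_rank(8), 25);
}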
@@ -46,18 +46,27 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     if let Some(distinct_fid) = distinct_fid {
         let mut excluded = RoaringBitmap::new();
         let mut results = vec![];
+        let mut skip = 0;
         for docid in universe.iter() {
-            if results.len() >= from + length {
+            if results.len() >= length {
                 break;
             }
             if excluded.contains(docid) {
                 continue;
             }
 
             distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
+            skip += 1;
+            if skip <= from {
+                continue;
+            }
+
             results.push(docid);
         }
 
         let mut all_candidates = universe - excluded;
         all_candidates.extend(results.iter().copied());
 
         return Ok(BucketSortOutput {
             scores: vec![Default::default(); results.len()],
             docids: results,
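The fix above stops pre-counting `from + length` hits and instead walks the whole universe, marking distinct duplicates as excluded before deciding whether a document falls into the skipped offset window. A self-contained sketch of the same traversal over plain data; the key_of closure stands in for distinct_single_docid and all names are illustrative:

use std::collections::HashSet;

// Returns at most `length` ids after skipping the first `from` distinct ids,
// keeping only the first id seen for each distinct key.
fn distinct_page(
    ids: &[u32],
    key_of: impl Fn(u32) -> u32,
    from: usize,
    length: usize,
) -> Vec<u32> {
    let mut seen_keys = HashSet::new(); // plays the role of the `excluded` bitmap
    let mut results = Vec::new();
    let mut skip = 0;

    for &id in ids {
        if results.len() >= length {
            break;
        }
        // A later duplicate of an already-seen key never counts, not even for the offset.
        if !seen_keys.insert(key_of(id)) {
            continue;
        }
        skip += 1;
        if skip <= from {
            continue; // part of the offset window: excluded but not returned
        }
        results.push(id);
    }
    results
}

fn main() {
    // Five documents, distinct on id / 10: the distinct representatives are 10, 20, 30.
    let ids = [10, 11, 20, 21, 30];
    assert_eq!(distinct_page(&ids, |id| id / 10, 1, 2), vec![20, 30]);
}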
@@ -11,7 +11,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
+use crate::{
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
+};
 
 /// A cache storing pointers to values in the LMDB databases.
 ///
@@ -166,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
                 merge_cbo_roaring_bitmaps,
             )
         }
-        None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
             self.txn,
             word,
             self.word_interner.get(word).as_str(),
@@ -180,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
         &mut self,
         word: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
             self.txn,
             word,
             self.word_interner.get(word).as_str(),
@@ -228,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
                 merge_cbo_roaring_bitmaps,
             )
         }
-        None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
            self.txn,
            prefix,
            self.word_interner.get(prefix).as_str(),
@@ -242,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
         &mut self,
         prefix: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
             self.txn,
             prefix,
             self.word_interner.get(prefix).as_str(),
@@ -1,6 +1,7 @@
 #![allow(clippy::too_many_arguments)]
 
 use super::ProximityCondition;
+use crate::proximity::MAX_DISTANCE;
 use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::SearchContext;
@@ -35,7 +36,7 @@ pub fn build_edges(
     }
 
     let mut conditions = vec![];
-    for cost in right_ngram_max..(7 + right_ngram_max) {
+    for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
         conditions.push((
             cost as u32,
             conditions_interner.insert(ProximityCondition::Uninit {
@@ -47,7 +48,7 @@ pub fn build_edges(
     }
 
     conditions.push((
-        (7 + right_ngram_max) as u32,
+        ((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
         conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
     ));
 
@@ -273,7 +273,7 @@ fn test_proximity_simple() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("the quick brown fox jumps over the lazy dog");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 5, 2, 3, 0, 1]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
     insta::assert_debug_snapshot!(texts, @r###"
     [
@@ -282,11 +282,11 @@ fn test_proximity_simple() {
         "\"the quickbrown fox jumps over the lazy dog\"",
         "\"the really quick brown fox jumps over the lazy dog\"",
         "\"the really quick brown fox jumps over the very lazy dog\"",
-        "\"brown quick fox jumps over the lazy dog\"",
         "\"the quick brown fox jumps over the lazy. dog\"",
         "\"dog the quick brown fox jumps over the lazy\"",
-        "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
+        "\"brown quick fox jumps over the lazy dog\"",
         "\"the. quick brown fox jumps over the lazy. dog\"",
+        "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
     ]
     "###);
 }
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best s");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -382,9 +382,9 @@ fn test_proximity_prefix_db() {
         "\"summer best\"",
         "\"this is the best meal of summer\"",
         "\"summer x best\"",
-        "\"this is the best meal of the summer\"",
         "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best cooked meal of the summer\"",
+        "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
@@ -396,7 +396,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best su");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -406,10 +406,10 @@ fn test_proximity_prefix_db() {
         "\"summer best\"",
         "\"this is the best meal of summer\"",
         "\"summer x best\"",
+        "\"this is the best meal I have ever had in such a beautiful summer day\"",
+        "\"this is the best cooked meal of the summer\"",
         "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
-        "\"this is the best cooked meal of the summer\"",
-        "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
@@ -447,7 +447,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wint");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -457,10 +457,10 @@ fn test_proximity_prefix_db() {
         "\"winter best\"",
         "\"this is the best meal of winter\"",
         "\"winter x best\"",
+        "\"this is the best meal I have ever had in such a beautiful winter day\"",
+        "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
-        "\"this is the best cooked meal of the winter\"",
-        "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
 
@@ -471,7 +471,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wi");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -481,9 +481,9 @@ fn test_proximity_prefix_db() {
         "\"winter best\"",
         "\"this is the best meal of winter\"",
         "\"winter x best\"",
-        "\"this is the best meal of the winter\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
+        "\"this is the best meal of the winter\"",
        "\"winter x y best\"",
     ]
     "###);
@@ -68,8 +68,8 @@ fn test_trap_basic() {
     [
         Proximity(
             Rank {
-                rank: 8,
-                max_rank: 8,
+                rank: 4,
+                max_rank: 4,
             },
         ),
         Typo(
@@ -82,8 +82,8 @@ fn test_trap_basic() {
     [
         Proximity(
             Rank {
-                rank: 8,
-                max_rank: 8,
+                rank: 4,
+                max_rank: 4,
             },
         ),
         Typo(
Snapshot diff, expression: "format!(\"{document_ids_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -23,8 +23,8 @@  rank: 35 -> 9, max_rank: 57 -> 25
@@ -49,8 +49,8 @@  rank: 35 -> 9, max_rank: 57 -> 25
@@ -75,8 +75,8 @@  rank: 35 -> 9, max_rank: 57 -> 25
Snapshot diff, expression: "format!(\"{document_ids_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -23,8 +23,8 @@    rank: 57 -> 25, max_rank: 57 -> 25
@@ -49,8 +49,8 @@    rank: 56 -> 24, max_rank: 57 -> 25
@@ -75,8 +75,8 @@    rank: 35 -> 9,  max_rank: 57 -> 25
@@ -101,8 +101,8 @@  rank: 22 -> 10, max_rank: 22 -> 10
@@ -127,8 +127,8 @@  rank: 22 -> 10, max_rank: 22 -> 10
@@ -153,8 +153,8 @@  rank: 22 -> 10, max_rank: 22 -> 10
@@ -179,8 +179,8 @@  rank: 21 -> 9,  max_rank: 22 -> 10
@@ -205,8 +205,8 @@  rank: 17 -> 5,  max_rank: 22 -> 10
@@ -231,8 +231,8 @@  rank: 17 -> 5,  max_rank: 22 -> 10
Snapshot diff, source: milli/src/search/new/tests/proximity.rs, expression: "format!(\"{document_scores:#?}\")":
@@ -3,59 +3,35 @@  drops the six leading Proximity entries (rank 8, 7, 6, 6, 5, 5, all max_rank 8), keeps the rank 4 entry with max_rank 8 -> 4, and appends entries with rank 3, 2, 2 (max_rank 4)
@@ -63,7 +39,31 @@  the rank 1 entry gets max_rank 8 -> 4 and three more rank 1 entries (max_rank 4) are appended
Snapshot diff, expression: "format!(\"{document_scores:#?}\")":
@@ -6,40 +6,32 @@  the first four Proximity entries go from rank 8, 7, 6, 6 (max_rank 8) to rank 4, 3, 2, 2 (max_rank 4); the fifth entry (rank 5, max_rank 8) is removed
@@ -47,7 +39,7 @@   rank: 1, max_rank: 8 -> 4
@@ -55,7 +47,7 @@   rank: 1, max_rank: 8 -> 4
@@ -63,7 +55,15 @@  rank: 1, max_rank: 8 -> 4, plus one appended entry (rank 1, max_rank 4)
Snapshot diff, expression: "format!(\"{document_scores:#?}\")":
@@ -6,40 +6,32 @@  the first four Proximity entries go from rank 8, 7, 6, 6 (max_rank 8) to rank 4, 3, 2, 2 (max_rank 4); the fifth entry (rank 5, max_rank 8) is removed
@@ -47,7 +39,7 @@   rank: 1, max_rank: 8 -> 4
@@ -55,7 +47,7 @@   rank: 1, max_rank: 8 -> 4
@@ -63,7 +55,7 @@   rank: 1, max_rank: 8 -> 4
@@ -71,7 +63,15 @@  rank: 1, max_rank: 8 -> 4, plus one appended entry (rank 1, max_rank 4)
Snapshot diff, source: milli/src/search/new/tests/proximity.rs, expression: "format!(\"{document_scores:#?}\")":
@@ -3,59 +3,35 @@  drops the six leading Proximity entries (rank 8, 7, 6, 6, 5, 5, all max_rank 8), keeps the rank 4 entry with max_rank 8 -> 4, and appends entries with rank 3, 2, 2 (max_rank 4)
@@ -63,7 +39,7 @@   rank: 1, max_rank: 8 -> 4
@@ -71,7 +47,31 @@  rank: 1, max_rank: 8 -> 4, plus three appended rank 1 entries (max_rank 4)
Snapshot diff, expression: "format!(\"{document_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -7,7 +7,7 @@    rank: 1, max_rank: 8 -> 4
@@ -15,7 +15,7 @@  rank: 1, max_rank: 8 -> 4
@@ -23,7 +23,7 @@  rank: 1, max_rank: 8 -> 4
@@ -31,7 +31,7 @@  rank: 1, max_rank: 8 -> 4
@@ -39,7 +39,7 @@  rank: 1, max_rank: 8 -> 4
@@ -47,7 +47,7 @@  rank: 1, max_rank: 8 -> 4
@@ -55,7 +55,7 @@  rank: 1, max_rank: 8 -> 4
@@ -63,7 +63,7 @@  rank: 1, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")":
@@ -6,24 +6,24 @@  three Proximity entries: rank 8 -> 4, max_rank 8 -> 4
@@ -31,7 +31,7 @@  rank: 1, max_rank: 8 -> 4
@@ -39,7 +39,7 @@  rank: 1, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")":
@@ -6,16 +6,16 @@  two Proximity entries: rank 8 -> 4, max_rank 8 -> 4
@@ -23,7 +23,7 @@  rank: 1, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")":
@@ -6,16 +6,16 @@  two Proximity entries: rank 8 -> 4, max_rank 8 -> 4
@@ -23,7 +23,7 @@  rank: 1, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -12,8 +12,8 @@  rank: 8 -> 4, max_rank: 8 -> 4
@@ -26,8 +26,8 @@  rank: 5 -> 1, max_rank: 8 -> 4
@@ -40,8 +40,8 @@  rank: 8 -> 4, max_rank: 8 -> 4
@@ -54,8 +54,8 @@  rank: 7 -> 3, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -12,8 +12,8 @@  rank: 15 -> 7, max_rank: 15 -> 7
@@ -26,8 +26,8 @@  rank: 8 -> 4,  max_rank: 15 -> 7
Snapshot diff, expression: "format!(\"{document_scores:#?}\")". Below, x/y stands for rank x of max_rank y on Proximity entries, and for matching_words x of max_matching_words y on Words entries:
@@ -12,8 +12,8 @@     rank: 50 -> 22, max_rank: 50 -> 22
@@ -24,132 +24,6 @@   drops the old score pairs: Proximity 50/50, then [Words 9/9, Proximity 49/50], [9/9, 49/50], [9/9, 48/50], [9/9, 41/50], [9/9, 40/50], [8/9, 43/43], [7/9, 36/36], [7/9, 31/36], and the Words 5/9 header of the following entry
@@ -160,14 +34,126 @@  the remaining entry becomes [Words 9/9, Proximity 21/22] (was [Words 4/9, Proximity 15/15]) and new pairs are appended: [9/9, 21/22], [9/9, 20/22], [9/9, 17/22], [9/9, 16/22], [8/9, 19/19], [7/9, 16/16], [7/9, 13/16], [5/9, 10/10]
@@ -180,8 +166,8 @@    rank: 15 -> 7, max_rank: 15 -> 7
@@ -194,8 +180,22 @@   rank: 15 -> 7, max_rank: 15 -> 7, plus one appended pair [Words 4/9, Proximity 7/7]
@@ -208,8 +208,8 @@    rank: 8 -> 4, max_rank: 8 -> 4
Snapshot diff, expression: "format!(\"{document_scores:#?}\")"; each hunk rewrites one Proximity Rank entry:
@@ -12,8 +12,8 @@    rank: 43 -> 19, max_rank: 43 -> 19
@@ -26,8 +26,8 @@    rank: 43 -> 19, max_rank: 43 -> 19
@@ -40,8 +40,8 @@    rank: 42 -> 18, max_rank: 43 -> 19
@@ -54,8 +54,8 @@    rank: 42 -> 18, max_rank: 43 -> 19
@@ -68,8 +68,8 @@    rank: 41 -> 17, max_rank: 43 -> 19
@@ -82,8 +82,8 @@    rank: 34 -> 14, max_rank: 43 -> 19
@@ -96,8 +96,8 @@    rank: 33 -> 13, max_rank: 43 -> 19
@@ -110,8 +110,8 @@  rank: 36 -> 16, max_rank: 36 -> 16
@@ -124,8 +124,8 @@  rank: 29 -> 13, max_rank: 29 -> 13
@@ -138,8 +138,8 @@  rank: 24 -> 10, max_rank: 29 -> 13
@@ -152,8 +152,8 @@  rank: 15 -> 7,  max_rank: 15 -> 7
@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 57,
-max_rank: 57,
+rank: 25,
+max_rank: 25,
 },
 ),
 ],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 57,
-max_rank: 57,
+rank: 25,
+max_rank: 25,
 },
 ),
 ],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 56,
-max_rank: 57,
+rank: 24,
+max_rank: 25,
 },
 ),
 ],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 56,
-max_rank: 57,
+rank: 24,
+max_rank: 25,
 },
 ),
 ],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 55,
-max_rank: 57,
+rank: 23,
+max_rank: 25,
 },
 ),
 ],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 54,
-max_rank: 57,
+rank: 22,
+max_rank: 25,
 },
 ),
 ],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 53,
-max_rank: 57,
+rank: 21,
+max_rank: 25,
 },
 ),
 ],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 52,
-max_rank: 57,
+rank: 20,
+max_rank: 25,
 },
 ),
 ],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 51,
-max_rank: 57,
+rank: 20,
+max_rank: 25,
 },
 ),
 ],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 48,
-max_rank: 57,
+rank: 19,
+max_rank: 25,
 },
 ),
 ],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 47,
-max_rank: 57,
+rank: 19,
+max_rank: 25,
 },
 ),
 ],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
 Proximity(
 Rank {
 rank: 1,
-max_rank: 57,
+max_rank: 25,
 },
 ),
 ],
@@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 50,
-max_rank: 50,
+rank: 22,
+max_rank: 22,
 },
 ),
 ],
@@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 43,
-max_rank: 43,
+rank: 19,
+max_rank: 19,
 },
 ),
 ],
@@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 38,
-max_rank: 43,
+rank: 16,
+max_rank: 19,
 },
 ),
 ],
@@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 29,
-max_rank: 29,
+rank: 13,
+max_rank: 13,
 },
 ),
 ],
@@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 22,
-max_rank: 22,
+rank: 10,
+max_rank: 10,
 },
 ),
 ],
@@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 22,
-max_rank: 22,
+rank: 10,
+max_rank: 10,
 },
 ),
 ],
@@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 22,
-max_rank: 22,
+rank: 10,
+max_rank: 10,
 },
 ),
 ],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 15,
-max_rank: 15,
+rank: 7,
+max_rank: 7,
 },
 ),
 ],
@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 57,
-max_rank: 57,
+rank: 25,
+max_rank: 25,
 },
 ),
 ],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 56,
-max_rank: 57,
+rank: 24,
+max_rank: 25,
 },
 ),
 ],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 55,
-max_rank: 57,
+rank: 23,
+max_rank: 25,
 },
 ),
 ],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 54,
-max_rank: 57,
+rank: 22,
+max_rank: 25,
 },
 ),
 ],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 54,
-max_rank: 57,
+rank: 22,
+max_rank: 25,
 },
 ),
 ],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 54,
-max_rank: 57,
+rank: 22,
+max_rank: 25,
 },
 ),
 ],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 53,
-max_rank: 57,
+rank: 21,
+max_rank: 25,
 },
 ),
 ],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 53,
-max_rank: 57,
+rank: 21,
+max_rank: 25,
 },
 ),
 ],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 52,
-max_rank: 57,
+rank: 20,
+max_rank: 25,
 },
 ),
 ],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 47,
-max_rank: 57,
+rank: 18,
+max_rank: 25,
 },
 ),
 ],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 45,
-max_rank: 57,
+rank: 18,
+max_rank: 25,
 },
 ),
 ],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
 Proximity(
 Rank {
 rank: 1,
-max_rank: 57,
+max_rank: 25,
 },
 ),
 ],
@@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 47,
-max_rank: 50,
+rank: 19,
+max_rank: 22,
 },
 ),
 ],
@@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 40,
-max_rank: 43,
+rank: 16,
+max_rank: 19,
 },
 ),
 ],
@@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 35,
-max_rank: 43,
+rank: 13,
+max_rank: 19,
 },
 ),
 ],
@@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 26,
-max_rank: 29,
+rank: 10,
+max_rank: 13,
 },
 ),
 ],
@@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 19,
-max_rank: 22,
+rank: 7,
+max_rank: 10,
 },
 ),
 ],
@@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 19,
-max_rank: 22,
+rank: 7,
+max_rank: 10,
 },
 ),
 ],
@@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 19,
-max_rank: 22,
+rank: 7,
+max_rank: 10,
 },
 ),
 ],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
 ),
 Proximity(
 Rank {
-rank: 13,
-max_rank: 15,
+rank: 5,
+max_rank: 7,
 },
 ),
 ],
@@ -6,88 +6,88 @@ expression: "format!(\"{document_scores:#?}\")"
 [
 Proximity(
 Rank {
-rank: 57,
-max_rank: 57,
+rank: 25,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 57,
-max_rank: 57,
+rank: 25,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 56,
-max_rank: 57,
+rank: 24,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 56,
-max_rank: 57,
+rank: 24,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 55,
-max_rank: 57,
+rank: 23,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 54,
-max_rank: 57,
+rank: 22,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 53,
-max_rank: 57,
+rank: 21,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 52,
-max_rank: 57,
+rank: 20,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 51,
-max_rank: 57,
+rank: 20,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 48,
-max_rank: 57,
+rank: 19,
+max_rank: 25,
 },
 ),
 ],
 [
 Proximity(
 Rank {
-rank: 47,
-max_rank: 57,
+rank: 19,
+max_rank: 25,
 },
 ),
 ],
@@ -95,7 +95,7 @@ expression: "format!(\"{document_scores:#?}\")"
 Proximity(
 Rank {
 rank: 1,
-max_rank: 57,
+max_rank: 25,
 },
 ),
 ],
@@ -13,7 +13,6 @@ This module tests the `sort` ranking rule:
 
 use big_s::S;
 use maplit::hashset;
-use meili_snap::insta;
 
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@@ -259,8 +259,8 @@ fn test_ignore_stop_words() {
 ),
 Proximity(
 Rank {
-rank: 7,
-max_rank: 8,
+rank: 3,
+max_rank: 4,
 },
 ),
 Fid(
@@ -411,8 +411,8 @@ fn test_stop_words_in_phrase() {
 ),
 Proximity(
 Rank {
-rank: 6,
-max_rank: 8,
+rank: 2,
+max_rank: 4,
 },
 ),
 Fid(
@@ -277,7 +277,7 @@ fn test_words_proximity_tms_last_simple() {
 let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
 
 // 7 is better than 6 because of the proximity between "the" and its surrounding terms
-insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
 insta::assert_snapshot!(format!("{document_scores:#?}"));
 let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 insta::assert_debug_snapshot!(texts, @r###"
@@ -289,10 +289,10 @@ fn test_words_proximity_tms_last_simple() {
 "\"the mighty and quick brown fox jumps over the lazy dog\"",
 "\"the brown quick fox jumps over the lazy dog\"",
 "\"the brown quick fox jumps over the really lazy dog\"",
-"\"the brown quick fox immediately jumps over the really lazy dog\"",
-"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
 "\"this quick brown and scary fox jumps over the lazy dog\"",
+"\"the brown quick fox immediately jumps over the really lazy dog\"",
 "\"this quick brown and very scary fox jumps over the lazy dog\"",
+"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
 "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
 "\"the quick brown fox jumps over the lazy\"",
 "\"the quick brown fox jumps over the\"",
@@ -312,7 +312,7 @@ fn test_words_proximity_tms_last_simple() {
 let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
 
 // 10 is better than 9 because of the proximity between "quick" and "brown"
-insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 15, 16, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
 insta::assert_snapshot!(format!("{document_scores:#?}"));
 let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 insta::assert_debug_snapshot!(texts, @r###"
@@ -326,8 +326,8 @@ fn test_words_proximity_tms_last_simple() {
 "\"the great quick brown fox jumps over the lazy dog\"",
 "\"the quick brown fox jumps over the really lazy dog\"",
 "\"the mighty and quick brown fox jumps over the lazy dog\"",
-"\"this quick brown and scary fox jumps over the lazy dog\"",
 "\"this quick brown and very scary fox jumps over the lazy dog\"",
+"\"this quick brown and scary fox jumps over the lazy dog\"",
 "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
 "\"the quick brown fox jumps over the lazy\"",
 "\"the quick brown fox jumps over the\"",
@@ -427,7 +427,7 @@ fn test_words_tms_all() {
 s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
 let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
 
-insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
+insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22]");
 insta::assert_snapshot!(format!("{document_scores:#?}"));
 let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 insta::assert_debug_snapshot!(texts, @r###"
@@ -439,10 +439,10 @@ fn test_words_tms_all() {
 "\"the mighty and quick brown fox jumps over the lazy dog\"",
 "\"the brown quick fox jumps over the lazy dog\"",
 "\"the brown quick fox jumps over the really lazy dog\"",
-"\"the brown quick fox immediately jumps over the really lazy dog\"",
-"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
 "\"this quick brown and scary fox jumps over the lazy dog\"",
+"\"the brown quick fox immediately jumps over the really lazy dog\"",
 "\"this quick brown and very scary fox jumps over the lazy dog\"",
+"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
 "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
 ]
 "###);
@@ -359,7 +359,31 @@ pub fn snap_external_documents_ids(index: &Index) -> String {
 
 snap
 }
+pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
+let rtxn = index.read_txn().unwrap();
+let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+let mut snap = String::new();
+for field_id in fields_ids_map.ids() {
+let number_faceted_documents_ids =
+index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
+writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
+.unwrap();
+}
+snap
+}
+pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
+let rtxn = index.read_txn().unwrap();
+let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+
+let mut snap = String::new();
+for field_id in fields_ids_map.ids() {
+let string_faceted_documents_ids =
+index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
+writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
+.unwrap();
+}
+snap
+}
 pub fn snap_words_fst(index: &Index) -> String {
 let rtxn = index.read_txn().unwrap();
 let words_fst = index.words_fst(&rtxn).unwrap();
@@ -507,6 +531,12 @@ macro_rules! full_snap_of_db {
 ($index:ident, external_documents_ids) => {{
 $crate::snapshot_tests::snap_external_documents_ids(&$index)
 }};
+($index:ident, number_faceted_documents_ids) => {{
+$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
+}};
+($index:ident, string_faceted_documents_ids) => {{
+$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
+}};
 ($index:ident, words_fst) => {{
 $crate::snapshot_tests::snap_words_fst(&$index)
 }};
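Usage note (not part of the diff): the two `full_snap_of_db!` arms added above are what back the `db_snap!` assertions introduced in the facet update test hunks further down, for example:

db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");

Both lines are copied verbatim from those later hunks; the inline strings are the expected snapshot digests recorded there.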
@@ -1,6 +1,7 @@
 use roaring::RoaringBitmap;
 use time::OffsetDateTime;
 
+use crate::facet::FacetType;
 use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
 
 pub struct ClearDocuments<'t, 'u, 'i> {
@@ -50,6 +51,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 
 // We retrieve the number of documents ids that we are deleting.
 let number_of_documents = self.index.number_of_documents(self.wtxn)?;
+let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
 
 // We clean some of the main engine datastructures.
 self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
@@ -62,6 +64,22 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
 self.index.delete_vector_hnsw(self.wtxn)?;
 
+// We clean all the faceted documents ids.
+for field_id in faceted_fields {
+self.index.put_faceted_documents_ids(
+self.wtxn,
+field_id,
+FacetType::Number,
+&empty_roaring,
+)?;
+self.index.put_faceted_documents_ids(
+self.wtxn,
+field_id,
+FacetType::String,
+&empty_roaring,
+)?;
+}
+
 // Clear the other databases.
 word_docids.clear(self.wtxn)?;
 exact_word_docids.clear(self.wtxn)?;
@@ -1,104 +0,0 @@
-use obkv::Key;
-
-pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
-pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
-
-/// DelAdd defines the new value to add in the database and old value to delete from the database.
-///
-/// Its used in an OBKV to be serialized in grenad files.
-#[repr(u8)]
-#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
-pub enum DelAdd {
-Deletion = 0,
-Addition = 1,
-}
-
-impl Key for DelAdd {
-const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
-type BYTES = [u8; Self::BYTES_SIZE];
-
-fn to_be_bytes(&self) -> Self::BYTES {
-u8::to_be_bytes(*self as u8)
-}
-
-fn from_be_bytes(array: Self::BYTES) -> Self {
-match u8::from_be_bytes(array) {
-0 => Self::Deletion,
-1 => Self::Addition,
-otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
-}
-}
-}
-
-/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
-///
-/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
-/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
-/// if both deletion and addition are `true, the value will be inserted in both keys.
-pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
-reader: obkv::KvReader<K>,
-deletion: bool,
-addition: bool,
-buffer: &mut Vec<u8>,
-) -> Result<(), std::io::Error> {
-let mut writer = obkv::KvWriter::new(buffer);
-let mut value_buffer = Vec::new();
-for (key, value) in reader.iter() {
-value_buffer.clear();
-let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-if deletion {
-value_writer.insert(DelAdd::Deletion, value)?;
-}
-if addition {
-value_writer.insert(DelAdd::Addition, value)?;
-}
-value_writer.finish()?;
-writer.insert(key, &value_buffer)?;
-}
-
-writer.finish()
-}
-
-/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
-///
-/// putting each deletion obkv's keys under an DelAdd::Deletion
-/// and putting each addition obkv's keys under an DelAdd::Addition
-pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
-deletion: obkv::KvReader<K>,
-addition: obkv::KvReader<K>,
-buffer: &mut Vec<u8>,
-) -> Result<(), std::io::Error> {
-use itertools::merge_join_by;
-use itertools::EitherOrBoth::{Both, Left, Right};
-
-let mut writer = obkv::KvWriter::new(buffer);
-let mut value_buffer = Vec::new();
-
-for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
-value_buffer.clear();
-match eob {
-Left((k, v)) => {
-let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-value_writer.insert(DelAdd::Deletion, v).unwrap();
-writer.insert(k, value_writer.into_inner()?).unwrap();
-}
-Right((k, v)) => {
-let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-value_writer.insert(DelAdd::Addition, v).unwrap();
-writer.insert(k, value_writer.into_inner()?).unwrap();
-}
-Both((k, deletion), (_, addition)) => {
-let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-value_writer.insert(DelAdd::Deletion, deletion).unwrap();
-value_writer.insert(DelAdd::Addition, addition).unwrap();
-writer.insert(k, value_writer.into_inner()?).unwrap();
-}
-}
-}
-
-writer.finish()
-}
-
-pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
-del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
-}
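For reference only (not part of the diff), a minimal sketch of the obkv-wrapping pattern that the removed helpers above relied on, reusing only names defined in that removed module; the function name `wrap_as_addition` is hypothetical:

// Hypothetical helper; the body mirrors the removed into_del_add_obkv loop for
// the addition-only case: the raw value is nested under a DelAdd::Addition key.
fn wrap_as_addition(value: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    let mut value_buffer = Vec::new();
    let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
    value_writer.insert(DelAdd::Addition, value)?;
    value_writer.finish()?;
    Ok(value_buffer)
}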
@@ -16,7 +16,9 @@ use crate::facet::FacetType;
 use crate::heed_codec::facet::FieldDocIdFacetCodec;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::Hnsw;
-use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
+use crate::{
+ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
+};
 
 pub struct DeleteDocuments<'t, 'u, 'i> {
 wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -106,15 +108,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 self.delete_document(docid);
 Some(docid)
 }
-pub fn execute(self) -> Result<DocumentDeletionResult> {
-puffin::profile_function!();
+
+pub fn execute(self) -> Result<DocumentDeletionResult> {
 let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
 self.execute_inner()?;
 
 Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
 }
 
 pub(crate) fn execute_inner(mut self) -> Result<DetailedDocumentDeletionResult> {
+puffin::profile_function!();
+
 self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
 
 // We retrieve the current documents ids that are in the database.
@@ -382,6 +386,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 for facet_type in [FacetType::Number, FacetType::String] {
 let mut affected_facet_values = HashMap::new();
 for field_id in self.index.faceted_fields_ids(self.wtxn)? {
+// Remove docids from the number faceted documents ids
+let mut docids =
+self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?;
+docids -= &self.to_delete_docids;
+self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?;
+
 let facet_values = remove_docids_from_field_id_docid_facet_value(
 self.index,
 self.wtxn,
@@ -468,6 +478,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
 + for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
 {
+puffin::profile_function!();
+
 while let Some(result) = iter.next() {
 let (bytes, mut docids) = result?;
 let previous_len = docids.len();
@@ -487,9 +499,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 
 fn remove_from_word_prefix_docids(
 txn: &mut heed::RwTxn,
-db: &Database<Str, CboRoaringBitmapCodec>,
+db: &Database<Str, RoaringBitmapCodec>,
 to_remove: &RoaringBitmap,
 ) -> Result<fst::Set<Vec<u8>>> {
+puffin::profile_function!();
+
 let mut prefixes_to_delete = fst::SetBuilder::memory();
 
 // We iterate over the word prefix docids database and remove the deleted documents ids
@@ -515,11 +529,13 @@ fn remove_from_word_prefix_docids(
 
 fn remove_from_word_docids(
 txn: &mut heed::RwTxn,
-db: &heed::Database<Str, CboRoaringBitmapCodec>,
+db: &heed::Database<Str, RoaringBitmapCodec>,
 to_remove: &RoaringBitmap,
 words_to_keep: &mut BTreeSet<String>,
 words_to_remove: &mut BTreeSet<String>,
 ) -> Result<()> {
+puffin::profile_function!();
+
 // We create an iterator to be able to get the content and delete the word docids.
 // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
 // the LMDB B-Tree two times but only once.
@@ -551,6 +567,8 @@ fn remove_docids_from_field_id_docid_facet_value(
 field_id: FieldId,
 to_remove: &RoaringBitmap,
 ) -> heed::Result<HashSet<Vec<u8>>> {
+puffin::profile_function!();
+
 let db = match facet_type {
 FacetType::String => {
 index.field_id_docid_facet_strings.remap_types::<ByteSlice, DecodeIgnore>()
@@ -586,6 +604,8 @@ fn remove_docids_from_facet_id_docids<'a, C>(
 where
 C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
 {
+puffin::profile_function!();
+
 let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
 while let Some(result) = iter.next() {
 let (bytes, mut docids) = result?;
@@ -1,9 +1,10 @@
+use std::borrow::Cow;
 use std::fs::File;
+use std::io::BufReader;
 
 use grenad::CompressionType;
 use heed::types::ByteSlice;
 use heed::{BytesEncode, Error, RoTxn, RwTxn};
-use obkv::KvReader;
 use roaring::RoaringBitmap;
 
 use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@@ -12,7 +13,6 @@ use crate::heed_codec::facet::{
 FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::update::del_add::DelAdd;
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 
@@ -21,6 +21,9 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 ///
 /// First, the new elements are inserted into the level 0 of the database. Then, the
 /// higher levels are cleared and recomputed from the content of level 0.
+///
+/// Finally, the `faceted_documents_ids` value in the main database of `Index`
+/// is updated to contain the new set of faceted documents.
 pub struct FacetsUpdateBulk<'i> {
 index: &'i Index,
 group_size: u8,
@@ -28,7 +31,7 @@ pub struct FacetsUpdateBulk<'i> {
 facet_type: FacetType,
 field_ids: Vec<FieldId>,
 // None if level 0 does not need to be updated
-delta_data: Option<grenad::Reader<File>>,
+new_data: Option<grenad::Reader<BufReader<File>>>,
 }
 
 impl<'i> FacetsUpdateBulk<'i> {
@@ -36,7 +39,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 index: &'i Index,
 field_ids: Vec<FieldId>,
 facet_type: FacetType,
-delta_data: grenad::Reader<File>,
+new_data: grenad::Reader<BufReader<File>>,
 group_size: u8,
 min_level_size: u8,
 ) -> FacetsUpdateBulk<'i> {
@@ -46,7 +49,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 group_size,
 min_level_size,
 facet_type,
-delta_data: Some(delta_data),
+new_data: Some(new_data),
 }
 }
 
@@ -61,13 +64,13 @@ impl<'i> FacetsUpdateBulk<'i> {
 group_size: FACET_GROUP_SIZE,
 min_level_size: FACET_MIN_LEVEL_SIZE,
 facet_type,
-delta_data: None,
+new_data: None,
 }
 }
 
 #[logging_timer::time("FacetsUpdateBulk::{}")]
 pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
+let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
 
 let db = match facet_type {
 FacetType::String => index
@@ -78,9 +81,12 @@ impl<'i> FacetsUpdateBulk<'i> {
 }
 };
 
-let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
+let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
 
-inner.update(wtxn, &field_ids)?;
+inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
+index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
+Ok(())
+})?;
 
 Ok(())
 }
@@ -89,19 +95,26 @@ impl<'i> FacetsUpdateBulk<'i> {
 /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
 pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
-pub delta_data: Option<grenad::Reader<R>>,
+pub new_data: Option<grenad::Reader<R>>,
 pub group_size: u8,
 pub min_level_size: u8,
 }
 impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
-pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
+pub fn update(
+mut self,
+wtxn: &mut RwTxn,
+field_ids: &[u16],
+mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
+) -> Result<()> {
 self.update_level0(wtxn)?;
 for &field_id in field_ids.iter() {
 self.clear_levels(wtxn, field_id)?;
 }
 
 for &field_id in field_ids.iter() {
-let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
+let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
 
+handle_all_docids(wtxn, field_id, all_docids)?;
+
 for level_reader in level_readers {
 let mut cursor = level_reader.into_cursor()?;
@@ -120,27 +133,19 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 self.db.delete_range(wtxn, &range).map(drop)?;
 Ok(())
 }
 
 fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
-let delta_data = match self.delta_data.take() {
+let new_data = match self.new_data.take() {
 Some(x) => x,
 None => return Ok(()),
 };
 if self.db.is_empty(wtxn)? {
 let mut buffer = Vec::new();
 let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
-let mut cursor = delta_data.into_cursor()?;
+let mut cursor = new_data.into_cursor()?;
 while let Some((key, value)) = cursor.move_on_next()? {
 if !valid_lmdb_key(key) {
 continue;
 }
-let value: KvReader<DelAdd> = KvReader::new(value);
-
-// DB is empty, it is safe to ignore Del operations
-let Some(value) = value.get(DelAdd::Addition) else {
-continue;
-};
-
 buffer.clear();
 // the group size for level 0
 buffer.push(1);
@@ -152,14 +157,11 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 let mut buffer = Vec::new();
 let database = self.db.remap_types::<ByteSlice, ByteSlice>();
 
-let mut cursor = delta_data.into_cursor()?;
+let mut cursor = new_data.into_cursor()?;
 while let Some((key, value)) = cursor.move_on_next()? {
 if !valid_lmdb_key(key) {
 continue;
 }
-
-let value: KvReader<DelAdd> = KvReader::new(value);
-
 // the value is a CboRoaringBitmap, but I still need to prepend the
 // group size for level 0 (= 1) to it
 buffer.clear();
@@ -168,15 +170,12 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 match database.get(wtxn, key)? {
 Some(prev_value) => {
 let old_bitmap = &prev_value[1..];
-CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
+CboRoaringBitmapCodec::merge_into(
+&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
+&mut buffer,
+)?;
 }
 None => {
-// it is safe to ignore the del in that case.
-let Some(value) = value.get(DelAdd::Addition) else {
-// won't put the key in DB as the value would be empty
-continue;
-};
-
 buffer.extend_from_slice(value);
 }
 };
@@ -189,10 +188,16 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 &self,
 field_id: FieldId,
 txn: &RoTxn,
-) -> Result<Vec<grenad::Reader<File>>> {
-let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
+) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
+let mut all_docids = RoaringBitmap::new();
+let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
+for bitmap in bitmaps {
+all_docids |= bitmap;
+}
+Ok(())
+})?;
 
-Ok(subwriters)
+Ok((subwriters, all_docids))
 }
 #[allow(clippy::type_complexity)]
 fn read_level_0<'t>(
@@ -255,7 +260,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
 field_id: u16,
 level: u8,
 handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
-) -> Result<Vec<grenad::Reader<File>>> {
+) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
 if level == 0 {
 self.read_level_0(rtxn, field_id, handle_group)?;
 // Level 0 is already in the database
@@ -486,6 +491,7 @@ mod tests {
 index.add_documents(documents).unwrap();
 
 db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
+db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
 }
 
 #[test]
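As a usage sketch (not part of the diff, assembled from the added lines above): `FacetsUpdateBulkInner::update` now takes a callback that receives every docid seen while rebuilding the levels of a field, and the caller persists that set as the field's faceted documents ids:

inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
    // Store the union of all level-0 bitmaps as this field's faceted documents ids.
    index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
    Ok(())
})?;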
@@ -160,6 +160,7 @@ mod tests {
 index.add_documents(documents).unwrap();
 
 db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
+db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
 
 let mut wtxn = index.env.write_txn().unwrap();
 
@@ -177,6 +178,7 @@ mod tests {
 
 db_snap!(index, soft_deleted_documents_ids, @"[]");
 db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
+db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
 }
 
 // Same test as above but working with string values for the facets
@@ -217,6 +219,7 @@ mod tests {
 
 // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
 db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
+db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
 
 let mut wtxn = index.env.write_txn().unwrap();
 
@@ -234,6 +237,7 @@ mod tests {
 
 db_snap!(index, soft_deleted_documents_ids, @"[]");
 db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
+db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
 }
 
 #[test]
@@ -270,6 +274,7 @@ mod tests {
 
 // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
 db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
+db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
 
 let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
 
@@ -286,6 +291,12 @@ mod tests {
 
 db_snap!(index, soft_deleted_documents_ids, @"[]");
 db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
+db_snap!(index, string_faceted_documents_ids, 2, @r###"
+0 []
+1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
+2 [292, 324, 358, 381, 493, 839, 852, ]
+3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
+"###);
 }
 }
 
@@ -1,8 +1,9 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use heed::types::{ByteSlice, DecodeIgnore};
|
use heed::types::{ByteSlice, DecodeIgnore};
|
||||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||||
use obkv::KvReader;
|
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
@@ -11,9 +12,8 @@ use crate::heed_codec::facet::{
|
|||||||
};
|
};
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
use crate::search::facet::get_highest_level;
|
use crate::search::facet::get_highest_level;
|
||||||
use crate::update::del_add::DelAdd;
|
|
||||||
use crate::update::index_documents::valid_lmdb_key;
|
use crate::update::index_documents::valid_lmdb_key;
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||||
|
|
||||||
enum InsertionResult {
|
enum InsertionResult {
|
||||||
InPlace,
|
InPlace,
|
||||||
@@ -28,21 +28,27 @@ enum DeletionResult {
|
|||||||
|
|
||||||
/// Algorithm to incrementally insert and delete elememts into the
/// `facet_id_(string/f64)_docids` databases.
-pub struct FacetsUpdateIncremental {
+///
+/// Rhe `faceted_documents_ids` value in the main database of `Index`
+/// is also updated to contain the new set of faceted documents.
+pub struct FacetsUpdateIncremental<'i> {
+index: &'i Index,
inner: FacetsUpdateIncrementalInner,
-delta_data: grenad::Reader<File>,
+facet_type: FacetType,
+new_data: grenad::Reader<BufReader<File>>,
}

-impl FacetsUpdateIncremental {
+impl<'i> FacetsUpdateIncremental<'i> {
pub fn new(
-index: &Index,
+index: &'i Index,
facet_type: FacetType,
-delta_data: grenad::Reader<File>,
+new_data: grenad::Reader<BufReader<File>>,
group_size: u8,
min_level_size: u8,
max_group_size: u8,
) -> Self {
FacetsUpdateIncremental {
+index,
inner: FacetsUpdateIncrementalInner {
db: match facet_type {
FacetType::String => index
@@ -56,41 +62,31 @@ impl FacetsUpdateIncremental {
max_group_size,
min_level_size,
},
-delta_data,
+facet_type,
+new_data,
}
}

-pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
-let mut cursor = self.delta_data.into_cursor()?;
+pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
+let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
+
+let mut cursor = self.new_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
.ok_or(heed::Error::Encoding)?;
-let value = KvReader::new(value);
-let docids_to_delete = value
-.get(DelAdd::Deletion)
-.map(CboRoaringBitmapCodec::bytes_decode)
-.map(|o| o.ok_or(heed::Error::Encoding));
-
-let docids_to_add = value
-.get(DelAdd::Addition)
-.map(CboRoaringBitmapCodec::bytes_decode)
-.map(|o| o.ok_or(heed::Error::Encoding));
-
-if let Some(docids_to_delete) = docids_to_delete {
-let docids_to_delete = docids_to_delete?;
-self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
-}
-
-if let Some(docids_to_add) = docids_to_add {
-let docids_to_add = docids_to_add?;
-self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
-}
+let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
+self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
+*new_faceted_docids.entry(key.field_id).or_default() |= docids;
}

+for (field_id, new_docids) in new_faceted_docids {
+let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
+docids |= new_docids;
+self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
+}
Ok(())
}
}
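Outside the diff itself, the per-key step that the removed side of execute() performs can be sketched with just the roaring crate; the helper and names below are illustrative, not Meilisearch APIs.

use roaring::RoaringBitmap;

// Illustrative sketch: apply the Deletion/Addition pair carried by one facet key
// to the bitmap currently stored for that key, as the removed execute() body does.
fn apply_del_add(current: &mut RoaringBitmap, del: Option<&RoaringBitmap>, add: Option<&RoaringBitmap>) {
    if let Some(del) = del {
        *current -= del; // drop documents that no longer hold this facet value
    }
    if let Some(add) = add {
        *current |= add; // add documents that now hold this facet value
    }
}

fn main() {
    let mut stored: RoaringBitmap = (0..10).collect();
    let del: RoaringBitmap = [1, 2, 3].iter().copied().collect();
    let add: RoaringBitmap = [42].iter().copied().collect();
    apply_del_add(&mut stored, Some(&del), Some(&add));
    assert!(stored.contains(42) && !stored.contains(2));
}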
@@ -78,6 +78,7 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

use std::collections::BTreeSet;
use std::fs::File;
+use std::io::BufReader;
use std::iter::FromIterator;

use charabia::normalizer::{Normalize, NormalizerOption};
@@ -108,14 +109,17 @@ pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
facet_type: FacetType,
-delta_data: grenad::Reader<File>,
+new_data: grenad::Reader<BufReader<File>>,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
-// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
-pub fn new(index: &'i Index, facet_type: FacetType, delta_data: grenad::Reader<File>) -> Self {
+pub fn new(
+index: &'i Index,
+facet_type: FacetType,
+new_data: grenad::Reader<BufReader<File>>,
+) -> Self {
let database = match facet_type {
FacetType::String => index
.facet_id_string_docids
@@ -131,26 +135,26 @@ impl<'i> FacetsUpdate<'i> {
max_group_size: FACET_MAX_GROUP_SIZE,
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
-delta_data,
+new_data,
}
}

pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-if self.delta_data.is_empty() {
+if self.new_data.is_empty() {
return Ok(());
}
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

// See self::comparison_bench::benchmark_facet_indexing
-if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
+if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let bulk_update = FacetsUpdateBulk::new(
self.index,
field_ids,
self.facet_type,
-self.delta_data,
+self.new_data,
self.group_size,
self.min_level_size,
);
@@ -159,7 +163,7 @@ impl<'i> FacetsUpdate<'i> {
let incremental_update = FacetsUpdateIncremental::new(
self.index,
self.facet_type,
-self.delta_data,
+self.new_data,
self.group_size,
self.min_level_size,
self.max_group_size,
@@ -459,7 +463,7 @@ pub(crate) mod test_helpers {

let update = FacetsUpdateBulkInner {
db: self.content,
-delta_data: Some(reader),
+new_data: Some(reader),
group_size: self.group_size.get(),
min_level_size: self.min_level_size.get(),
};
@@ -594,6 +598,7 @@ mod tests {
index.add_documents(documents).unwrap();

db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
+db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");

let mut documents = vec![];
@@ -616,6 +621,7 @@ mod tests {
index.add_documents(documents).unwrap();

db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
+db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");

// Then replace the last document while disabling soft_deletion
@@ -640,6 +646,7 @@ mod tests {
index.add_documents(documents).unwrap();

db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
+db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
}
}
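The dispatch in execute() above is a size heuristic: rebuild every facet level in bulk when the incoming data is large relative to the database, and patch the tree incrementally otherwise. Restated in isolation (the function name is illustrative):

// Illustrative restatement of the bulk-vs-incremental threshold used above.
fn should_use_bulk_update(new_entries: u64, database_entries: u64) -> bool {
    // bulk-rewrite the level structure when the delta is at least 1/50th (2%)
    // of the entries already stored in the database
    new_entries >= database_entries / 50
}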
@@ -1,4 +1,4 @@
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};

@@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(

let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();

-let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
+let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];

// The primary key *field id* that has already been set for this index or the one
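The change above only puts an io::BufWriter between the temporary file and the grenad writer, so the many small writes are batched before they hit the file. A minimal sketch of the same construction, assuming the grenad and tempfile crates:

use std::fs::File;
use std::io::BufWriter;

// Illustrative: build a grenad writer over a buffered temporary file,
// mirroring the replaced line above.
fn buffered_grenad_writer() -> std::io::Result<grenad::Writer<BufWriter<File>>> {
    tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)
}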
@@ -1,19 +1,22 @@
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
+use std::io::BufReader;
use std::{io, mem, str};

use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
-use obkv::{KvReader, KvWriterU16};
+use obkv::KvReader;
use roaring::RoaringBitmap;
use serde_json::Value;

-use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
+use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
-use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
-use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
+use crate::update::index_documents::MergeFn;
+use crate::{
+absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
+};

-pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;

/// Extracts the word and positions where this word appear and
/// prefixes it by the document id.
@@ -29,160 +32,25 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
allowed_separators: Option<&[&str]>,
|
allowed_separators: Option<&[&str]>,
|
||||||
dictionary: Option<&[&str]>,
|
dictionary: Option<&[&str]>,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
// initialize destination values.
|
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
let mut script_language_docids = HashMap::new();
|
let mut script_language_docids = HashMap::new();
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
keep_latest_obkv,
|
concat_u32s_array,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
// initialize buffers.
|
let mut buffers = Buffers::default();
|
||||||
let mut del_buffers = Buffers::default();
|
|
||||||
let mut add_buffers = Buffers::default();
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
let mut value_buffer = Vec::new();
|
|
||||||
|
|
||||||
// initialize tokenizer.
|
|
||||||
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
|
|
||||||
let tokenizer = builder.build();
|
|
||||||
|
|
||||||
// iterate over documents.
|
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
|
||||||
let document_id = key
|
|
||||||
.try_into()
|
|
||||||
.map(u32::from_be_bytes)
|
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
|
||||||
let obkv = KvReader::<FieldId>::new(value);
|
|
||||||
|
|
||||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
|
||||||
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
documents_ids.push(document_id);
|
|
||||||
|
|
||||||
// Update key buffer prefix.
|
|
||||||
key_buffer.clear();
|
|
||||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
|
||||||
|
|
||||||
// Tokenize deletions and additions in 2 diffferent threads.
|
|
||||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
|
||||||
|| {
|
|
||||||
// deletions
|
|
||||||
lang_safe_tokens_from_document(
|
|
||||||
&obkv,
|
|
||||||
searchable_fields,
|
|
||||||
&tokenizer,
|
|
||||||
stop_words,
|
|
||||||
allowed_separators,
|
|
||||||
dictionary,
|
|
||||||
max_positions_per_attributes,
|
|
||||||
DelAdd::Deletion,
|
|
||||||
&mut del_buffers,
|
|
||||||
)
|
|
||||||
},
|
|
||||||
|| {
|
|
||||||
// additions
|
|
||||||
lang_safe_tokens_from_document(
|
|
||||||
&obkv,
|
|
||||||
searchable_fields,
|
|
||||||
&tokenizer,
|
|
||||||
stop_words,
|
|
||||||
allowed_separators,
|
|
||||||
dictionary,
|
|
||||||
max_positions_per_attributes,
|
|
||||||
DelAdd::Addition,
|
|
||||||
&mut add_buffers,
|
|
||||||
)
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
let (del_obkv, del_script_language_word_count) = del?;
|
|
||||||
let (add_obkv, add_script_language_word_count) = add?;
|
|
||||||
|
|
||||||
// merge deletions and additions.
|
|
||||||
value_buffer.clear();
|
|
||||||
del_add_from_two_obkvs(
|
|
||||||
KvReader::<FieldId>::new(del_obkv),
|
|
||||||
KvReader::<FieldId>::new(add_obkv),
|
|
||||||
&mut value_buffer,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// write them into the sorter.
|
|
||||||
let obkv = KvReader::<FieldId>::new(value);
|
|
||||||
for (field_id, value) in obkv.iter() {
|
|
||||||
key_buffer.truncate(mem::size_of::<u32>());
|
|
||||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// update script_language_docids deletions.
|
|
||||||
for (script, languages_frequency) in del_script_language_word_count {
|
|
||||||
for (language, _) in languages_frequency {
|
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
|
||||||
entry.0.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// update script_language_docids additions.
|
|
||||||
for (script, languages_frequency) in add_script_language_word_count {
|
|
||||||
for (language, _) in languages_frequency {
|
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
|
||||||
entry.1.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
|
||||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Check if any searchable fields of a document changed.
|
|
||||||
fn searchable_fields_changed(
|
|
||||||
obkv: &KvReader<FieldId>,
|
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
|
||||||
) -> bool {
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
|
||||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
|
||||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
|
||||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
|
||||||
// if both fields are None, check the next field.
|
|
||||||
(None, None) => (),
|
|
||||||
// if both contains a value and values are the same, check the next field.
|
|
||||||
(Some(del), Some(add)) if del == add => (),
|
|
||||||
// otherwise the fields are different, return true.
|
|
||||||
_otherwise => return true,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Factorize tokenizer building.
|
|
||||||
fn tokenizer_builder<'a>(
|
|
||||||
stop_words: Option<&'a fst::Set<&[u8]>>,
|
|
||||||
allowed_separators: Option<&'a [&str]>,
|
|
||||||
dictionary: Option<&'a [&str]>,
|
|
||||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
|
||||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
if let Some(stop_words) = stop_words {
|
if let Some(stop_words) = stop_words {
|
||||||
tokenizer_builder.stop_words(stop_words);
|
tokenizer_builder.stop_words(stop_words);
|
||||||
@@ -193,144 +61,130 @@ fn tokenizer_builder<'a>(
|
|||||||
if let Some(separators) = allowed_separators {
|
if let Some(separators) = allowed_separators {
|
||||||
tokenizer_builder.separators(separators);
|
tokenizer_builder.separators(separators);
|
||||||
}
|
}
|
||||||
|
let tokenizer = tokenizer_builder.build();
|
||||||
|
|
||||||
if let Some(script_language) = script_language {
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
tokenizer_builder.allow_list(&script_language);
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
}
|
let document_id = key
|
||||||
|
.try_into()
|
||||||
|
.map(u32::from_be_bytes)
|
||||||
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
|
let obkv = KvReader::<FieldId>::new(value);
|
||||||
|
|
||||||
tokenizer_builder
|
documents_ids.push(document_id);
|
||||||
}
|
buffers.key_buffer.clear();
|
||||||
|
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
|
||||||
/// Extract words maped with their positions of a document,
|
let mut script_language_word_count = HashMap::new();
|
||||||
/// ensuring no Language detection mistakes was made.
|
|
||||||
fn lang_safe_tokens_from_document<'a>(
|
|
||||||
obkv: &KvReader<FieldId>,
|
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
|
||||||
tokenizer: &Tokenizer,
|
|
||||||
stop_words: Option<&fst::Set<&[u8]>>,
|
|
||||||
allowed_separators: Option<&[&str]>,
|
|
||||||
dictionary: Option<&[&str]>,
|
|
||||||
max_positions_per_attributes: u32,
|
|
||||||
del_add: DelAdd,
|
|
||||||
buffers: &'a mut Buffers,
|
|
||||||
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
|
||||||
let mut script_language_word_count = HashMap::new();
|
|
||||||
|
|
||||||
tokens_from_document(
|
extract_tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
&tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
del_add,
|
&mut buffers,
|
||||||
buffers,
|
&mut script_language_word_count,
|
||||||
&mut script_language_word_count,
|
&mut docid_word_positions_sorter,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// if we detect a potetial mistake in the language detection,
|
// if we detect a potetial mistake in the language detection,
|
||||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||||
if script_language_word_count
|
if script_language_word_count
|
||||||
.values()
|
.values()
|
||||||
.map(Vec::as_slice)
|
.map(Vec::as_slice)
|
||||||
.any(potential_language_detection_error)
|
.any(potential_language_detection_error)
|
||||||
{
|
{
|
||||||
// build an allow list with the most frequent detected languages in the document.
|
// build an allow list with the most frequent detected languages in the document.
|
||||||
let script_language: HashMap<_, _> =
|
let script_language: HashMap<_, _> =
|
||||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||||
|
|
||||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||||
// then we don't rerun the extraction.
|
// then we don't rerun the extraction.
|
||||||
if !script_language.is_empty() {
|
if !script_language.is_empty() {
|
||||||
// build a new temporary tokenizer including the allow list.
|
// build a new temporary tokenizer including the allow list.
|
||||||
let mut builder = tokenizer_builder(
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
stop_words,
|
if let Some(stop_words) = stop_words {
|
||||||
dictionary,
|
tokenizer_builder.stop_words(stop_words);
|
||||||
allowed_separators,
|
}
|
||||||
Some(&script_language),
|
tokenizer_builder.allow_list(&script_language);
|
||||||
);
|
let tokenizer = tokenizer_builder.build();
|
||||||
let tokenizer = builder.build();
|
|
||||||
|
|
||||||
script_language_word_count.clear();
|
script_language_word_count.clear();
|
||||||
|
|
||||||
// rerun the extraction.
|
// rerun the extraction.
|
||||||
tokens_from_document(
|
extract_tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
&tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
del_add,
|
&mut buffers,
|
||||||
buffers,
|
&mut script_language_word_count,
|
||||||
&mut script_language_word_count,
|
&mut docid_word_positions_sorter,
|
||||||
)?;
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (script, languages_frequency) in script_language_word_count {
|
||||||
|
for (language, _) in languages_frequency {
|
||||||
|
let entry = script_language_docids
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(RoaringBitmap::new);
|
||||||
|
entry.push(document_id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((&buffers.obkv_buffer, script_language_word_count))
|
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||||
|
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract words maped with their positions of a document.
|
fn extract_tokens_from_document(
|
||||||
fn tokens_from_document<'a>(
|
|
||||||
obkv: &KvReader<FieldId>,
|
obkv: &KvReader<FieldId>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
tokenizer: &Tokenizer,
|
tokenizer: &Tokenizer,
|
||||||
max_positions_per_attributes: u32,
|
max_positions_per_attributes: u32,
|
||||||
del_add: DelAdd,
|
buffers: &mut Buffers,
|
||||||
buffers: &'a mut Buffers,
|
|
||||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
||||||
) -> Result<&'a [u8]> {
|
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
buffers.obkv_buffer.clear();
|
) -> Result<()> {
|
||||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
// if field is searchable.
|
|
||||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
// extract deletion or addition only.
|
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
buffers.field_buffer.clear();
|
||||||
// parse json.
|
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||||
let value =
|
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
// prepare writting destination.
|
for (index, token) in tokens {
|
||||||
buffers.obkv_positions_buffer.clear();
|
// if a language has been detected for the token, we update the counter.
|
||||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
if let Some(language) = token.language {
|
||||||
|
let script = token.script;
|
||||||
// convert json into an unique string.
|
let entry =
|
||||||
buffers.field_buffer.clear();
|
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
||||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
match entry.iter_mut().find(|(l, _)| *l == language) {
|
||||||
// create an iterator of token with their positions.
|
Some((_, n)) => *n += 1,
|
||||||
let tokens = process_tokens(tokenizer.tokenize(field))
|
None => entry.push((language, 1)),
|
||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
|
||||||
|
|
||||||
for (index, token) in tokens {
|
|
||||||
// if a language has been detected for the token, we update the counter.
|
|
||||||
if let Some(language) = token.language {
|
|
||||||
let script = token.script;
|
|
||||||
let entry =
|
|
||||||
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
|
||||||
match entry.iter_mut().find(|(l, _)| *l == language) {
|
|
||||||
Some((_, n)) => *n += 1,
|
|
||||||
None => entry.push((language, 1)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// keep a word only if it is not empty and fit in a LMDB key.
|
|
||||||
let token = token.lemma().trim();
|
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
|
||||||
let position: u16 = index
|
|
||||||
.try_into()
|
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
|
||||||
writer.insert(position, token.as_bytes())?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
let token = token.lemma().trim();
|
||||||
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
|
buffers.key_buffer.truncate(mem::size_of::<u32>());
|
||||||
|
buffers.key_buffer.extend_from_slice(token.as_bytes());
|
||||||
|
|
||||||
// write positions into document.
|
let position: u16 = index
|
||||||
let positions = writer.into_inner()?;
|
.try_into()
|
||||||
document_writer.insert(field_id, positions)?;
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
|
let position = absolute_from_relative_position(field_id, position);
|
||||||
|
docid_word_positions_sorter
|
||||||
|
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Transform a JSON value into a string that can be indexed.
|
/// Transform a JSON value into a string that can be indexed.
|
||||||
@@ -433,10 +287,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)

#[derive(Default)]
struct Buffers {
+// the key buffer is the concatenation of the internal document id with the field id.
+// The buffer has to be completelly cleared between documents,
+// and the field id part must be cleared between each field.
+key_buffer: Vec<u8>,
// the field buffer for each fields desserialization, and must be cleared between each field.
field_buffer: String,
-// buffer used to store the value data containing an obkv.
-obkv_buffer: Vec<u8>,
-// buffer used to store the value data containing an obkv of tokens with their positions.
-obkv_positions_buffer: Vec<u8>,
}
@@ -1,15 +1,14 @@
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

use heed::{BytesDecode, BytesEncode};

use super::helpers::{
-create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
+create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
};
-use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;

/// Extracts the facet number and the documents ids where this facet number appear.
@@ -18,39 +17,30 @@ use crate::Result;
/// documents ids from the given chunk of docid facet number positions.
#[logging_timer::time]
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
-fid_docid_facet_number: grenad::Reader<R>,
+docid_fid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
+merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);

-let mut buffer = Vec::new();
-let mut cursor = fid_docid_facet_number.into_cursor()?;
-while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
+let mut cursor = docid_fid_facet_number.into_cursor()?;
+while let Some((key_bytes, _)) = cursor.move_on_next()? {
let (field_id, document_id, number) =
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();

let key = FacetGroupKey { field_id, level: 0, left_bound: number };
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
-buffer.clear();
-let mut obkv = KvWriterDelAdd::new(&mut buffer);
-for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
-obkv.insert(deladd_key, document_id.to_ne_bytes())?;
-}
-obkv.finish()?;
-
-facet_number_docids_sorter.insert(key_bytes, &buffer)?;
+facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
}

sorter_into_reader(facet_number_docids_sorter, indexer)
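In both versions above, the sorter plus its merge function end up grouping every document id that shares a (field_id, facet number) level-0 key into one bitmap. A toy in-memory model of that outcome (types simplified; not the grenad-based implementation):

use std::collections::BTreeMap;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;

// Toy model: one bitmap of document ids per (field_id, facet number) pair.
fn group_facet_numbers(entries: &[(u16, f64, u32)]) -> BTreeMap<(u16, OrderedFloat<f64>), RoaringBitmap> {
    let mut grouped = BTreeMap::new();
    for &(field_id, number, docid) in entries {
        grouped
            .entry((field_id, OrderedFloat(number)))
            .or_insert_with(RoaringBitmap::new)
            .insert(docid);
    }
    grouped
}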
@@ -1,14 +1,13 @@
use std::fs::File;
-use std::{io, str};
+use std::io::{self, BufReader};

use heed::BytesEncode;

use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
-use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
-use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
-use crate::{FieldId, Result};
+use crate::update::index_documents::merge_cbo_roaring_bitmaps;
+use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};

/// Extracts the facet string and the documents ids where this facet string appear.
///
@@ -18,23 +17,22 @@ use crate::{FieldId, Result};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
-merge_deladd_cbo_roaring_bitmaps,
+merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);

-let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
-while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
+while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);

@@ -42,17 +40,21 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);

-let normalized_value = str::from_utf8(normalized_value_bytes)?;
-let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
-let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
+let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;

-buffer.clear();
-let mut obkv = KvWriterDelAdd::new(&mut buffer);
-for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
-obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+let normalised_truncated_value: String;
+if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
+normalised_truncated_value = normalised_value
+.char_indices()
+.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+.map(|(_, c)| c)
+.collect();
+normalised_value = normalised_truncated_value.as_str();
}
-obkv.finish()?;
-facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
+let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
+let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
+// document id is encoded in native-endian because of the CBO roaring bitmap codec
+facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
}

sorter_into_reader(facet_string_docids_sorter, indexer)
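The added code path above truncates the normalized facet string before it becomes an LMDB key. Because the cut is made with char_indices, it always lands on a character boundary and the result stays valid UTF-8. The same pattern in isolation (the byte limit parameter stands in for MAX_FACET_VALUE_LENGTH):

// Illustrative: truncate a normalized facet value to at most `max_len` bytes
// without splitting a UTF-8 character, as the hunk above does.
fn truncate_facet_value(normalized: &str, max_len: usize) -> String {
    normalized
        .char_indices()
        .take_while(|(idx, _)| *idx < max_len)
        .map(|(_, c)| c)
        .collect()
}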
@@ -1,39 +1,27 @@
-use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};
use std::mem::size_of;
-use std::result::Result as StdResult;

-use grenad::Sorter;
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
-use itertools::EitherOrBoth;
-use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::{from_slice, Value};
-use FilterableValues::{Empty, Null, Values};

use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes;
-use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
-use crate::{
-CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
-};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};

-/// The length of the elements that are always in the buffer when inserting new values.
-const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
-
/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
-pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
-pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
-pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
-pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
-pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
+pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
+pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
+pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
+pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
+pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
}

/// Extracts the facet values of each faceted field of each document.
@@ -70,150 +58,71 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
// The tuples represents the Del and Add side for a bitmap
|
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
||||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
|
||||||
|
|
||||||
// We create two buffer for mutable ref issues with closures.
|
|
||||||
let mut numbers_key_buffer = Vec::new();
|
|
||||||
let mut strings_key_buffer = Vec::new();
|
|
||||||
|
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
if faceted_fields.contains(&field_id) {
|
if faceted_fields.contains(&field_id) {
|
||||||
numbers_key_buffer.clear();
|
key_buffer.clear();
|
||||||
strings_key_buffer.clear();
|
|
||||||
|
|
||||||
// Set key to the field_id
|
// Set key to the field_id
|
||||||
// Note: this encoding is consistent with FieldIdCodec
|
// Note: this encoding is consistent with FieldIdCodec
|
||||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
|
|
||||||
|
// Here, we know already that the document must be added to the “field id exists” database
|
||||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||||
let document = BEU32::from(document).get();
|
let document = BEU32::from(document).get();
|
||||||
|
|
||||||
|
facet_exists_docids.entry(field_id).or_default().insert(document);
|
||||||
|
|
||||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
key_buffer.extend_from_slice(docid_bytes);
|
||||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
|
||||||
|
|
||||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
|
||||||
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
|
||||||
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
|
||||||
None => None,
|
|
||||||
};
|
|
||||||
|
|
||||||
// We insert the document id on the Del and the Add side if the field exists.
|
match extract_facet_values(
|
||||||
let (ref mut del_exists, ref mut add_exists) =
|
&value,
|
||||||
facet_exists_docids.entry(field_id).or_default();
|
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
|
||||||
let (ref mut del_is_null, ref mut add_is_null) =
|
) {
|
||||||
facet_is_null_docids.entry(field_id).or_default();
|
FilterableValues::Null => {
|
||||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
||||||
facet_is_empty_docids.entry(field_id).or_default();
|
}
|
||||||
|
FilterableValues::Empty => {
|
||||||
|
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
||||||
|
}
|
||||||
|
FilterableValues::Values { numbers, strings } => {
|
||||||
|
// insert facet numbers in sorter
|
||||||
|
for number in numbers {
|
||||||
|
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||||
|
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||||
|
key_buffer.extend_from_slice(&value_bytes);
|
||||||
|
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||||
|
|
||||||
if del_value.is_some() {
|
fid_docid_facet_numbers_sorter
|
||||||
del_exists.insert(document);
|
.insert(&key_buffer, ().as_bytes())?;
|
||||||
}
|
}
|
||||||
if add_value.is_some() {
|
}
|
||||||
add_exists.insert(document);
|
|
||||||
}
|
|
||||||
|
|
||||||
let geo_support =
|
// insert normalized and original facet string in sorter
|
||||||
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
for (normalized, original) in
|
||||||
let del_filterable_values =
|
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
||||||
del_value.map(|value| extract_facet_values(&value, geo_support));
|
{
|
||||||
let add_filterable_values =
|
let normalized_truncated_value: String = normalized
|
||||||
add_value.map(|value| extract_facet_values(&value, geo_support));
|
.char_indices()
|
||||||
|
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||||
|
.map(|(_, c)| c)
|
||||||
|
.collect();
|
||||||
|
|
||||||
// Those closures are just here to simplify things a bit.
|
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
||||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
||||||
insert_numbers_diff(
|
fid_docid_facet_strings_sorter
|
||||||
&mut fid_docid_facet_numbers_sorter,
|
.insert(&key_buffer, original.as_bytes())?;
|
||||||
&mut numbers_key_buffer,
|
|
||||||
del_numbers,
|
|
||||||
add_numbers,
|
|
||||||
)
|
|
||||||
};
|
|
||||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
|
||||||
insert_strings_diff(
|
|
||||||
&mut fid_docid_facet_strings_sorter,
|
|
||||||
&mut strings_key_buffer,
|
|
||||||
del_strings,
|
|
||||||
add_strings,
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
match (del_filterable_values, add_filterable_values) {
|
|
||||||
(None, None) => (),
|
|
||||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
|
||||||
Null => {
|
|
||||||
del_is_null.insert(document);
|
|
||||||
}
|
|
||||||
Empty => {
|
|
||||||
del_is_empty.insert(document);
|
|
||||||
}
|
|
||||||
Values { numbers, strings } => {
|
|
||||||
insert_numbers_diff(numbers, vec![])?;
|
|
||||||
insert_strings_diff(strings, vec![])?;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
|
||||||
Null => {
|
|
||||||
add_is_null.insert(document);
|
|
||||||
}
|
|
||||||
Empty => {
|
|
||||||
add_is_empty.insert(document);
|
|
||||||
}
|
|
||||||
Values { numbers, strings } => {
|
|
||||||
insert_numbers_diff(vec![], numbers)?;
|
|
||||||
insert_strings_diff(vec![], strings)?;
|
|
||||||
}
|
|
||||||
},
|
|
||||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
|
||||||
match (del_filterable_values, add_filterable_values) {
|
|
||||||
(Null, Null) | (Empty, Empty) => (),
|
|
||||||
(Null, Empty) => {
|
|
||||||
del_is_null.insert(document);
|
|
||||||
add_is_empty.insert(document);
|
|
||||||
}
|
|
||||||
(Empty, Null) => {
|
|
||||||
del_is_empty.insert(document);
|
|
||||||
add_is_null.insert(document);
|
|
||||||
}
|
|
||||||
(Null, Values { numbers, strings }) => {
|
|
||||||
insert_numbers_diff(vec![], numbers)?;
|
|
||||||
insert_strings_diff(vec![], strings)?;
|
|
||||||
del_is_null.insert(document);
|
|
||||||
}
|
|
||||||
(Empty, Values { numbers, strings }) => {
|
|
||||||
insert_numbers_diff(vec![], numbers)?;
|
|
||||||
insert_strings_diff(vec![], strings)?;
|
|
||||||
del_is_empty.insert(document);
|
|
||||||
}
|
|
||||||
(Values { numbers, strings }, Null) => {
|
|
||||||
add_is_null.insert(document);
|
|
||||||
insert_numbers_diff(numbers, vec![])?;
|
|
||||||
insert_strings_diff(strings, vec![])?;
|
|
||||||
}
|
|
||||||
(Values { numbers, strings }, Empty) => {
|
|
||||||
add_is_empty.insert(document);
|
|
||||||
insert_numbers_diff(numbers, vec![])?;
|
|
||||||
insert_strings_diff(strings, vec![])?;
|
|
||||||
}
|
|
||||||
(
|
|
||||||
Values { numbers: del_numbers, strings: del_strings },
|
|
||||||
Values { numbers: add_numbers, strings: add_strings },
|
|
||||||
) => {
|
|
||||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
|
||||||
insert_strings_diff(del_strings, add_strings)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -221,15 +130,14 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
let mut facet_exists_docids_writer = create_writer(
|
let mut facet_exists_docids_writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
for (fid, bitmap) in facet_exists_docids.into_iter() {
|
||||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||||
}
|
}
|
||||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||||
|
|
||||||
@@ -238,9 +146,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
||||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||||
}
|
}
|
||||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||||
|
|
||||||
@@ -249,156 +157,21 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
||||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
||||||
}
|
}
|
||||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||||
|
|
||||||
Ok(ExtractedFacetValues {
|
Ok(ExtractedFacetValues {
|
||||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
|
||||||
fn deladd_obkv_cbo_roaring_bitmaps(
|
|
||||||
buffer: &mut Vec<u8>,
|
|
||||||
del_bitmap: &RoaringBitmap,
|
|
||||||
add_bitmap: &RoaringBitmap,
|
|
||||||
) -> io::Result<()> {
|
|
||||||
buffer.clear();
|
|
||||||
let mut obkv = KvWriterDelAdd::new(buffer);
|
|
||||||
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
|
||||||
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
|
-obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
-obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
-obkv.finish()
-}
-
-/// Truncates a string to the biggest valid LMDB key size.
-fn truncate_string(s: String) -> String {
-s.char_indices()
-.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-.map(|(_, c)| c)
-.collect()
-}
-
-/// Computes the diff between both Del and Add numbers and
-/// only inserts the parts that differ in the sorter.
-fn insert_numbers_diff<MF>(
-fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
-key_buffer: &mut Vec<u8>,
-mut del_numbers: Vec<f64>,
-mut add_numbers: Vec<f64>,
-) -> Result<()>
-where
-MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
-{
-// We sort and dedup the float numbers
-del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
-add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
-del_numbers.dedup_by_key(|f| OrderedFloat(*f));
-add_numbers.dedup_by_key(|f| OrderedFloat(*f));
-
-let merged_numbers_iter = itertools::merge_join_by(
-del_numbers.into_iter().map(OrderedFloat),
-add_numbers.into_iter().map(OrderedFloat),
-|del, add| del.cmp(add),
-);
-
-// insert facet numbers in sorter
-for eob in merged_numbers_iter {
-key_buffer.truncate(TRUNCATE_SIZE);
-match eob {
-EitherOrBoth::Both(_, _) => (), // no need to touch anything
-EitherOrBoth::Left(OrderedFloat(number)) => {
-if let Some(value_bytes) = f64_into_bytes(number) {
-key_buffer.extend_from_slice(&value_bytes);
-key_buffer.extend_from_slice(&number.to_be_bytes());
-
-// We insert only the Del part of the Obkv to inform
-// that we only want to remove all those numbers.
-let mut obkv = KvWriterDelAdd::memory();
-obkv.insert(DelAdd::Deletion, ().as_bytes())?;
-let bytes = obkv.into_inner()?;
-fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
-}
-}
-EitherOrBoth::Right(OrderedFloat(number)) => {
-if let Some(value_bytes) = f64_into_bytes(number) {
-key_buffer.extend_from_slice(&value_bytes);
-key_buffer.extend_from_slice(&number.to_be_bytes());
-
-// We insert only the Del part of the Obkv to inform
-// that we only want to remove all those numbers.
-let mut obkv = KvWriterDelAdd::memory();
-obkv.insert(DelAdd::Addition, ().as_bytes())?;
-let bytes = obkv.into_inner()?;
-fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
-}
-}
-}
-}
-
-Ok(())
-}
-
-/// Computes the diff between both Del and Add strings and
-/// only inserts the parts that differ in the sorter.
-fn insert_strings_diff<MF>(
-fid_docid_facet_strings_sorter: &mut Sorter<MF>,
-key_buffer: &mut Vec<u8>,
-mut del_strings: Vec<(String, String)>,
-mut add_strings: Vec<(String, String)>,
-) -> Result<()>
-where
-MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
-{
-// We sort and dedup the normalized and original strings
-del_strings.sort_unstable();
-add_strings.sort_unstable();
-del_strings.dedup();
-add_strings.dedup();
-
-let merged_strings_iter = itertools::merge_join_by(
-del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
-add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
-|del, add| del.cmp(add),
-);
-
-// insert normalized and original facet string in sorter
-for eob in merged_strings_iter {
-key_buffer.truncate(TRUNCATE_SIZE);
-match eob {
-EitherOrBoth::Both(_, _) => (), // no need to touch anything
-EitherOrBoth::Left((normalized, original)) => {
-let truncated = truncate_string(normalized);
-key_buffer.extend_from_slice(truncated.as_bytes());
-
-let mut obkv = KvWriterDelAdd::memory();
-obkv.insert(DelAdd::Deletion, original)?;
-let bytes = obkv.into_inner()?;
-fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
-}
-EitherOrBoth::Right((normalized, original)) => {
-let truncated = truncate_string(normalized);
-key_buffer.extend_from_slice(truncated.as_bytes());
-
-let mut obkv = KvWriterDelAdd::memory();
-obkv.insert(DelAdd::Addition, original)?;
-let bytes = obkv.into_inner()?;
-fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
-}
-}
-}
-
-Ok(())
-}
-
/// Represent what a document field contains.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
@@ -409,7 +182,6 @@ enum FilterableValues {
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}

-/// Extracts the facet values of a JSON field.
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,
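The removed helpers above all follow the same pattern: sort and dedup the Del and Add sides, merge-join them, and only write an obkv entry for values present on a single side. A minimal standalone sketch of that pattern, assuming the `itertools` crate and using plain strings in place of facet values (`diff_sides` is an illustrative name, not a function from the diff):

```rust
use itertools::{merge_join_by, EitherOrBoth};

/// Classifies which side each value belongs to after a Del/Add merge join.
/// Values present on both sides are skipped, mirroring `EitherOrBoth::Both(_, _) => ()`.
fn diff_sides(mut del: Vec<String>, mut add: Vec<String>) -> Vec<(String, &'static str)> {
    del.sort_unstable();
    del.dedup();
    add.sort_unstable();
    add.dedup();

    merge_join_by(del, add, |d, a| d.cmp(a))
        .filter_map(|eob| match eob {
            EitherOrBoth::Both(_, _) => None,               // unchanged: nothing to write
            EitherOrBoth::Left(v) => Some((v, "deletion")),  // only in the old version
            EitherOrBoth::Right(v) => Some((v, "addition")), // only in the new version
        })
        .collect()
}

fn main() {
    let del = vec!["blue".to_string(), "red".to_string()];
    let add = vec!["green".to_string(), "red".to_string()];
    // "red" appears on both sides and is skipped; "blue" is a deletion, "green" an addition.
    println!("{:?}", diff_sides(del, add));
}
```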
@@ -1,17 +1,16 @@
+use std::collections::HashMap;
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

-use obkv::KvReaderU16;
+use grenad::Sorter;

use super::helpers::{
-create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
-GrenadParameters,
+create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+try_split_array_at, GrenadParameters, MergeFn,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::Result;
+use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};

-const MAX_COUNTED_WORDS: usize = 30;

/// Extracts the field id word count and the documents ids where
/// this field id with this amount of words appear.
@@ -22,7 +21,7 @@ const MAX_COUNTED_WORDS: usize = 30;
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();
@@ -36,21 +35,63 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
max_memory,
);

-let mut key_buffer = Vec::new();
+// This map is assumed to not consume a lot of memory.
+let mut document_fid_wordcount = HashMap::new();
+let mut current_document_id = None;

let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
-let (document_id_bytes, fid_bytes) = try_split_array_at(key)
+let (document_id_bytes, _word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);

-let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
-if word_count <= MAX_COUNTED_WORDS {
-key_buffer.clear();
-key_buffer.extend_from_slice(fid_bytes);
-key_buffer.push(word_count as u8);
-fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+let curr_document_id = *current_document_id.get_or_insert(document_id);
+if curr_document_id != document_id {
+drain_document_fid_wordcount_into_sorter(
+&mut fid_word_count_docids_sorter,
+&mut document_fid_wordcount,
+curr_document_id,
+)?;
+current_document_id = Some(document_id);
}

+for position in read_u32_ne_bytes(value) {
+let (field_id, _) = relative_from_absolute_position(position);
+
+let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
+*value += 1;
+}
+}
+
+if let Some(document_id) = current_document_id {
+// We must make sure that don't lose the current document field id
+// word count map if we break because we reached the end of the chunk.
+drain_document_fid_wordcount_into_sorter(
+&mut fid_word_count_docids_sorter,
+&mut document_fid_wordcount,
+document_id,
+)?;
}

sorter_into_reader(fid_word_count_docids_sorter, indexer)
}
+
+fn drain_document_fid_wordcount_into_sorter(
+fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
+document_fid_wordcount: &mut HashMap<FieldId, u32>,
+document_id: DocumentId,
+) -> Result<()> {
+let mut key_buffer = Vec::new();
+
+for (fid, count) in document_fid_wordcount.drain() {
+if count <= 30 {
+key_buffer.clear();
+key_buffer.extend_from_slice(&fid.to_be_bytes());
+key_buffer.push(count as u8);
+
+fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+}
+}
+
+Ok(())
+}
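Both sides of the hunk above build the same sorter key: the field id in big-endian bytes followed by a single byte holding the capped word count. A small sketch of that key layout, assuming `FieldId` is a `u16` as elsewhere in the crate (`word_count_key` is an illustrative name, not a function from the diff):

```rust
type FieldId = u16;

/// Builds the fid_word_count_docids sorter key: 2 big-endian bytes of field id
/// followed by one byte for the word count (only counts up to 30 are indexed).
fn word_count_key(fid: FieldId, word_count: usize) -> Option<Vec<u8>> {
    if word_count > 30 {
        return None; // longer fields are not indexed by exact word count
    }
    let mut key = Vec::with_capacity(3);
    key.extend_from_slice(&fid.to_be_bytes());
    key.push(word_count as u8);
    Some(key)
}

fn main() {
    assert_eq!(word_count_key(1, 3), Some(vec![0, 1, 3]));
    assert_eq!(word_count_key(1, 31), None);
    println!("ok");
}
```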
@@ -1,5 +1,5 @@
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

use concat_arrays::concat_arrays;
use serde_json::Value;
@@ -18,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
indexer: GrenadParameters,
primary_key_id: FieldId,
(lat_fid, lng_fid): (FieldId, FieldId),
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let mut writer = create_writer(
@@ -1,6 +1,6 @@
use std::convert::TryFrom;
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

use bytemuck::cast_slice;
use serde_json::{from_slice, Value};
@@ -18,7 +18,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
indexer: GrenadParameters,
primary_key_id: FieldId,
vectors_fid: FieldId,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let mut writer = create_writer(
@@ -1,20 +1,18 @@
-use std::collections::{BTreeSet, HashSet};
+use std::collections::HashSet;
use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};
+use std::iter::FromIterator;

-use heed::BytesDecode;
-use obkv::KvReaderU16;
+use roaring::RoaringBitmap;

use super::helpers::{
-create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
-try_split_array_at, writer_into_reader, GrenadParameters,
+create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
+try_split_array_at, GrenadParameters,
};
use crate::error::SerializationError;
-use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
-use crate::update::MergeFn;
-use crate::{DocumentId, FieldId, Result};
+use crate::update::index_documents::helpers::read_u32_ne_bytes;
+use crate::{relative_from_absolute_position, FieldId, Result};

/// Extracts the word and the documents ids where this word appear.
///
@@ -28,148 +26,65 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

-let mut word_fid_docids_sorter = create_sorter(
-grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
-indexer.chunk_compression_type,
-indexer.chunk_compression_level,
-indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
-);
-let mut key_buffer = Vec::new();
-let mut del_words = BTreeSet::new();
-let mut add_words = BTreeSet::new();
-let mut cursor = docid_word_positions.into_cursor()?;
-while let Some((key, value)) = cursor.move_on_next()? {
-let (document_id_bytes, fid_bytes) = try_split_array_at(key)
-.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-let (fid_bytes, _) = try_split_array_at(fid_bytes)
-.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-let document_id = u32::from_be_bytes(document_id_bytes);
-let fid = u16::from_be_bytes(fid_bytes);
-
-let del_add_reader = KvReaderDelAdd::new(&value);
-// extract all unique words to remove.
-if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
-for (_pos, word) in KvReaderU16::new(&deletion).iter() {
-del_words.insert(word.to_vec());
-}
-}
-
-// extract all unique additional words.
-if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
-for (_pos, word) in KvReaderU16::new(&addition).iter() {
-add_words.insert(word.to_vec());
-}
-}
-
-words_into_sorter(
-document_id,
-fid,
-&mut key_buffer,
-&del_words,
-&add_words,
-&mut word_fid_docids_sorter,
-)?;
-
-del_words.clear();
-add_words.clear();
-}
-
let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
+merge_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
+max_memory.map(|x| x / 2),
);

let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
+merge_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
+max_memory.map(|x| x / 2),
);

-let mut word_fid_docids_writer = create_writer(
-indexer.chunk_compression_type,
-indexer.chunk_compression_level,
-tempfile::tempfile()?,
-);
-
-let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
-// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
-while let Some((key, value)) = iter.next()? {
-// only keep the value if their is a change to apply in the DB.
-if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
-word_fid_docids_writer.insert(key, value)?;
-}
-
-let (word, fid) = StrBEU16Codec::bytes_decode(key)
+let mut value_buffer = Vec::new();
+let mut cursor = docid_word_positions.into_cursor()?;
+while let Some((key, positions)) = cursor.move_on_next()? {
+let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+let document_id = u32::from_be_bytes(document_id_bytes);

-// every words contained in an attribute set to exact must be pushed in the exact_words list.
-if exact_attributes.contains(&fid) {
-exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
+let bitmap = RoaringBitmap::from_iter(Some(document_id));
+serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+
+// If there are no exact attributes, we do not need to iterate over positions.
+if exact_attributes.is_empty() {
+word_docids_sorter.insert(word_bytes, &value_buffer)?;
} else {
-word_docids_sorter.insert(word.as_bytes(), &value)?;
+let mut added_to_exact = false;
+let mut added_to_word_docids = false;
+for position in read_u32_ne_bytes(positions) {
+// as soon as we know that this word had been to both readers, we don't need to
+// iterate over the positions.
+if added_to_exact && added_to_word_docids {
+break;
+}
+let (fid, _) = relative_from_absolute_position(position);
+if exact_attributes.contains(&fid) && !added_to_exact {
+exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
+added_to_exact = true;
+} else if !added_to_word_docids {
+word_docids_sorter.insert(word_bytes, &value_buffer)?;
+added_to_word_docids = true;
+}
+}
}
}

Ok((
sorter_into_reader(word_docids_sorter, indexer)?,
sorter_into_reader(exact_word_docids_sorter, indexer)?,
-writer_into_reader(word_fid_docids_writer)?,
))
}
-
-fn words_into_sorter(
-document_id: DocumentId,
-fid: FieldId,
-key_buffer: &mut Vec<u8>,
-del_words: &BTreeSet<Vec<u8>>,
-add_words: &BTreeSet<Vec<u8>>,
-word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
-) -> Result<()> {
-puffin::profile_function!();
-
-use itertools::merge_join_by;
-use itertools::EitherOrBoth::{Both, Left, Right};
-
-let mut buffer = Vec::new();
-for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
-buffer.clear();
-let mut value_writer = KvWriterDelAdd::new(&mut buffer);
-let word_bytes = match eob {
-Left(word_bytes) => {
-value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
-word_bytes
-}
-Right(word_bytes) => {
-value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
-word_bytes
-}
-Both(word_bytes, _) => {
-value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
-value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
-word_bytes
-}
-};
-
-key_buffer.clear();
-key_buffer.extend_from_slice(&word_bytes);
-key_buffer.push(0);
-key_buffer.extend_from_slice(&fid.to_be_bytes());
-word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
-}
-
-Ok(())
-}
@@ -0,0 +1,51 @@
+use std::fs::File;
+use std::io::{self, BufReader};
+
+use super::helpers::{
+create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+try_split_array_at, GrenadParameters,
+};
+use crate::error::SerializationError;
+use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::{relative_from_absolute_position, DocumentId, Result};
+
+/// Extracts the word, field id, and the documents ids where this word appear at this field id.
+#[logging_timer::time]
+pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
+docid_word_positions: grenad::Reader<R>,
+indexer: GrenadParameters,
+) -> Result<grenad::Reader<BufReader<File>>> {
+puffin::profile_function!();
+
+let max_memory = indexer.max_memory_by_thread();
+
+let mut word_fid_docids_sorter = create_sorter(
+grenad::SortAlgorithm::Unstable,
+merge_cbo_roaring_bitmaps,
+indexer.chunk_compression_type,
+indexer.chunk_compression_level,
+indexer.max_nb_chunks,
+max_memory,
+);
+
+let mut key_buffer = Vec::new();
+let mut cursor = docid_word_positions.into_cursor()?;
+while let Some((key, value)) = cursor.move_on_next()? {
+let (document_id_bytes, word_bytes) = try_split_array_at(key)
+.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+let document_id = DocumentId::from_be_bytes(document_id_bytes);
+
+for position in read_u32_ne_bytes(value) {
+key_buffer.clear();
+key_buffer.extend_from_slice(word_bytes);
+key_buffer.push(0);
+let (fid, _) = relative_from_absolute_position(position);
+key_buffer.extend_from_slice(&fid.to_be_bytes());
+word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+}
+}
+
+let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
+
+Ok(word_fid_docids_reader)
+}
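The new extractor above keys its sorter on the word bytes, a 0 separator, and the field id in big-endian form, with the document id as the value. A minimal sketch of that key layout, assuming the field id is a `u16` as in the rest of the crate (`word_fid_key` is an illustrative name, not a function from the diff):

```rust
/// Builds a word_fid_docids-style sorter key: UTF-8 word bytes, a 0 separator byte,
/// then the field id as two big-endian bytes.
fn word_fid_key(word: &str, fid: u16) -> Vec<u8> {
    let mut key = Vec::with_capacity(word.len() + 3);
    key.extend_from_slice(word.as_bytes());
    key.push(0);
    key.extend_from_slice(&fid.to_be_bytes());
    key
}

fn main() {
    // "hello" in field 2 -> [104, 101, 108, 108, 111, 0, 0, 2]
    assert_eq!(word_fid_key("hello", 2), vec![104, 101, 108, 108, 111, 0, 0, 2]);
    println!("ok");
}
```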
@@ -1,17 +1,16 @@
|
|||||||
use std::collections::{BTreeMap, VecDeque};
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::{BinaryHeap, HashMap};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::{cmp, io};
|
use std::io::BufReader;
|
||||||
|
use std::{cmp, io, mem, str, vec};
|
||||||
use obkv::KvReaderU16;
|
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
writer_into_reader, GrenadParameters, MergeFn,
|
try_split_array_at, GrenadParameters, MergeFn,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
use crate::proximity::{positions_proximity, MAX_DISTANCE};
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
|
||||||
use crate::{DocumentId, Result};
|
use crate::{DocumentId, Result};
|
||||||
|
|
||||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||||
@@ -22,143 +21,63 @@ use crate::{DocumentId, Result};
|
|||||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
let mut word_pair_proximity_docids_sorter = create_sorter(
|
||||||
.into_iter()
|
grenad::SortAlgorithm::Unstable,
|
||||||
.map(|_| {
|
merge_cbo_roaring_bitmaps,
|
||||||
create_sorter(
|
indexer.chunk_compression_type,
|
||||||
grenad::SortAlgorithm::Unstable,
|
indexer.chunk_compression_level,
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
indexer.max_nb_chunks,
|
||||||
indexer.chunk_compression_type,
|
max_memory.map(|m| m / 2),
|
||||||
indexer.chunk_compression_level,
|
);
|
||||||
indexer.max_nb_chunks,
|
|
||||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
// This map is assumed to not consume a lot of memory.
|
||||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
let mut document_word_positions_heap = BinaryHeap::new();
|
||||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
|
||||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
|
||||||
let mut del_word_pair_proximity = BTreeMap::new();
|
|
||||||
let mut add_word_pair_proximity = BTreeMap::new();
|
|
||||||
let mut current_document_id = None;
|
let mut current_document_id = None;
|
||||||
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
let word = str::from_utf8(word_bytes)?;
|
||||||
|
|
||||||
// if we change document, we fill the sorter
|
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
||||||
if current_document_id.map_or(false, |id| id != document_id) {
|
if curr_document_id != document_id {
|
||||||
puffin::profile_scope!("Document into sorter");
|
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||||
|
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
current_document_id.unwrap(),
|
curr_document_id,
|
||||||
&del_word_pair_proximity,
|
document_word_positions_heap,
|
||||||
&add_word_pair_proximity,
|
&mut word_pair_proximity_docids_sorter,
|
||||||
&mut word_pair_proximity_docids_sorters,
|
|
||||||
)?;
|
)?;
|
||||||
del_word_pair_proximity.clear();
|
current_document_id = Some(document_id);
|
||||||
add_word_pair_proximity.clear();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
current_document_id = Some(document_id);
|
let word = word.to_string();
|
||||||
|
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
|
||||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
positions.sort_unstable();
|
||||||
|| {
|
let mut iter = positions.into_iter();
|
||||||
// deletions
|
if let Some(position) = iter.next() {
|
||||||
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
|
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
|
||||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
}
|
||||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
|
||||||
while del_word_positions.get(0).map_or(false, |(_w, p)| {
|
|
||||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
|
||||||
}) {
|
|
||||||
word_positions_into_word_pair_proximity(
|
|
||||||
&mut del_word_positions,
|
|
||||||
&mut del_word_pair_proximity,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// insert the new word.
|
|
||||||
let word = std::str::from_utf8(word)?;
|
|
||||||
del_word_positions.push_back((word.to_string(), position));
|
|
||||||
}
|
|
||||||
|
|
||||||
while !del_word_positions.is_empty() {
|
|
||||||
word_positions_into_word_pair_proximity(
|
|
||||||
&mut del_word_positions,
|
|
||||||
&mut del_word_pair_proximity,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
|| {
|
|
||||||
// additions
|
|
||||||
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
|
|
||||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
|
||||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
|
||||||
while add_word_positions.get(0).map_or(false, |(_w, p)| {
|
|
||||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
|
||||||
}) {
|
|
||||||
word_positions_into_word_pair_proximity(
|
|
||||||
&mut add_word_positions,
|
|
||||||
&mut add_word_pair_proximity,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// insert the new word.
|
|
||||||
let word = std::str::from_utf8(word)?;
|
|
||||||
add_word_positions.push_back((word.to_string(), position));
|
|
||||||
}
|
|
||||||
|
|
||||||
while !add_word_positions.is_empty() {
|
|
||||||
word_positions_into_word_pair_proximity(
|
|
||||||
&mut add_word_positions,
|
|
||||||
&mut add_word_pair_proximity,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
del?;
|
|
||||||
add?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
if let Some(document_id) = current_document_id {
|
||||||
puffin::profile_scope!("Final document into sorter");
|
// We must make sure that don't lose the current document field id
|
||||||
|
// word count map if we break because we reached the end of the chunk.
|
||||||
|
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
document_id,
|
document_id,
|
||||||
&del_word_pair_proximity,
|
document_word_positions_heap,
|
||||||
&add_word_pair_proximity,
|
&mut word_pair_proximity_docids_sorter,
|
||||||
&mut word_pair_proximity_docids_sorters,
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
{
|
|
||||||
puffin::profile_scope!("sorter_into_reader");
|
|
||||||
let mut writer = create_writer(
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
for sorter in word_pair_proximity_docids_sorters {
|
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
|
||||||
sorter.write_into_stream_writer(&mut writer)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
writer_into_reader(writer)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||||
@@ -167,66 +86,96 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
/// close to each other.
|
/// close to each other.
|
||||||
fn document_word_positions_into_sorter(
|
fn document_word_positions_into_sorter(
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
|
||||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
|
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
use itertools::merge_join_by;
|
let mut word_pair_proximity = HashMap::new();
|
||||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
let mut ordered_peeked_word_positions = Vec::new();
|
||||||
|
while !word_positions_heap.is_empty() {
|
||||||
|
while let Some(peeked_word_position) = word_positions_heap.pop() {
|
||||||
|
ordered_peeked_word_positions.push(peeked_word_position);
|
||||||
|
if ordered_peeked_word_positions.len() == 7 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
|
||||||
|
for PeekedWordPosition { word, position, .. } in tail {
|
||||||
|
let prox = positions_proximity(head.position, *position);
|
||||||
|
if prox > 0 && prox < MAX_DISTANCE {
|
||||||
|
word_pair_proximity
|
||||||
|
.entry((head.word.clone(), word.clone()))
|
||||||
|
.and_modify(|p| {
|
||||||
|
*p = cmp::min(*p, prox);
|
||||||
|
})
|
||||||
|
.or_insert(prox);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Push the tail in the heap.
|
||||||
|
let tail_iter = ordered_peeked_word_positions.drain(1..);
|
||||||
|
word_positions_heap.extend(tail_iter);
|
||||||
|
|
||||||
|
// Advance the head and push it in the heap.
|
||||||
|
if let Some(mut head) = ordered_peeked_word_positions.pop() {
|
||||||
|
if let Some(next_position) = head.iter.next() {
|
||||||
|
let prox = positions_proximity(head.position, next_position);
|
||||||
|
|
||||||
|
if prox > 0 && prox < MAX_DISTANCE {
|
||||||
|
word_pair_proximity
|
||||||
|
.entry((head.word.clone(), head.word.clone()))
|
||||||
|
.and_modify(|p| {
|
||||||
|
*p = cmp::min(*p, prox);
|
||||||
|
})
|
||||||
|
.or_insert(prox);
|
||||||
|
}
|
||||||
|
|
||||||
|
word_positions_heap.push(PeekedWordPosition {
|
||||||
|
word: head.word,
|
||||||
|
position: next_position,
|
||||||
|
iter: head.iter,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
for eob in
|
for ((w1, w2), prox) in word_pair_proximity {
|
||||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
|
||||||
d.cmp(a)
|
|
||||||
})
|
|
||||||
{
|
|
||||||
buffer.clear();
|
|
||||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
|
||||||
let ((w1, w2), prox) = match eob {
|
|
||||||
Left(key_value) => {
|
|
||||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key_value
|
|
||||||
}
|
|
||||||
Right(key_value) => {
|
|
||||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key_value
|
|
||||||
}
|
|
||||||
Both(key_value, _) => {
|
|
||||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
|
||||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key_value
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.push(*prox as u8);
|
key_buffer.push(prox as u8);
|
||||||
key_buffer.extend_from_slice(w1.as_bytes());
|
key_buffer.extend_from_slice(w1.as_bytes());
|
||||||
key_buffer.push(0);
|
key_buffer.push(0);
|
||||||
key_buffer.extend_from_slice(w2.as_bytes());
|
key_buffer.extend_from_slice(w2.as_bytes());
|
||||||
|
|
||||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn word_positions_into_word_pair_proximity(
|
struct PeekedWordPosition<I> {
|
||||||
word_positions: &mut VecDeque<(String, u16)>,
|
word: String,
|
||||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
position: u32,
|
||||||
) -> Result<()> {
|
iter: I,
|
||||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
}
|
||||||
for (word, position) in word_positions.iter() {
|
|
||||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
impl<I> Ord for PeekedWordPosition<I> {
|
||||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
word_pair_proximity
|
self.position.cmp(&other.position).reverse()
|
||||||
.entry((head_word.clone(), word.clone()))
|
}
|
||||||
.and_modify(|p| {
|
}
|
||||||
*p = cmp::min(*p, prox);
|
|
||||||
})
|
impl<I> PartialOrd for PeekedWordPosition<I> {
|
||||||
.or_insert(prox);
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
}
|
Some(self.cmp(other))
|
||||||
}
|
}
|
||||||
Ok(())
|
}
|
||||||
|
|
||||||
|
impl<I> Eq for PeekedWordPosition<I> {}
|
||||||
|
|
||||||
|
impl<I> PartialEq for PeekedWordPosition<I> {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.position == other.position
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,18 +1,13 @@
|
|||||||
use std::collections::BTreeSet;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use obkv::KvReaderU16;
|
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
||||||
GrenadParameters,
|
try_split_array_at, GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
|
||||||
use crate::update::MergeFn;
|
|
||||||
use crate::{bucketed_position, DocumentId, Result};
|
|
||||||
|
|
||||||
/// Extracts the word positions and the documents ids where this word appear.
|
/// Extracts the word positions and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
@@ -22,117 +17,39 @@ use crate::{bucketed_position, DocumentId, Result};
|
|||||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_position_docids_sorter = create_sorter(
|
let mut word_position_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
|
||||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
|
||||||
let mut current_document_id: Option<u32> = None;
|
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
if current_document_id.map_or(false, |id| document_id != id) {
|
for position in read_u32_ne_bytes(value) {
|
||||||
words_position_into_sorter(
|
key_buffer.clear();
|
||||||
current_document_id.unwrap(),
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
&mut key_buffer,
|
key_buffer.push(0);
|
||||||
&del_word_positions,
|
let (_, position) = relative_from_absolute_position(position);
|
||||||
&add_word_positions,
|
let position = bucketed_position(position);
|
||||||
&mut word_position_docids_sorter,
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
)?;
|
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
del_word_positions.clear();
|
|
||||||
add_word_positions.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
current_document_id = Some(document_id);
|
|
||||||
|
|
||||||
let del_add_reader = KvReaderDelAdd::new(&value);
|
|
||||||
// extract all unique words to remove.
|
|
||||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
|
||||||
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
|
||||||
let position = bucketed_position(position);
|
|
||||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// extract all unique additional words.
|
|
||||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
|
||||||
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
|
||||||
let position = bucketed_position(position);
|
|
||||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
|
||||||
words_position_into_sorter(
|
|
||||||
document_id,
|
|
||||||
&mut key_buffer,
|
|
||||||
&del_word_positions,
|
|
||||||
&add_word_positions,
|
|
||||||
&mut word_position_docids_sorter,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO remove noop DelAdd OBKV
|
|
||||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||||
|
|
||||||
Ok(word_position_docids_reader)
|
Ok(word_position_docids_reader)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn words_position_into_sorter(
|
|
||||||
document_id: DocumentId,
|
|
||||||
key_buffer: &mut Vec<u8>,
|
|
||||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
|
||||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
|
||||||
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
|
||||||
) -> Result<()> {
|
|
||||||
puffin::profile_function!();
|
|
||||||
|
|
||||||
use itertools::merge_join_by;
|
|
||||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
|
||||||
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
|
||||||
{
|
|
||||||
buffer.clear();
|
|
||||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
|
||||||
let (position, word_bytes) = match eob {
|
|
||||||
Left(key) => {
|
|
||||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key
|
|
||||||
}
|
|
||||||
Right(key) => {
|
|
||||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key
|
|
||||||
}
|
|
||||||
Both(key, _) => {
|
|
||||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
|
||||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
|
||||||
key
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
key_buffer.clear();
|
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
|
||||||
key_buffer.push(0);
|
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
|
||||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -6,11 +6,13 @@ mod extract_fid_word_count_docids;
|
|||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
mod extract_vector_points;
|
mod extract_vector_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
|
mod extract_word_fid_docids;
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
use crossbeam_channel::Sender;
|
use crossbeam_channel::Sender;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
@@ -24,11 +26,12 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
|||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_vector_points::extract_vector_points;
|
use self::extract_vector_points::extract_vector_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
|
use self::extract_word_fid_docids::extract_word_fid_docids;
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
|
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
||||||
MergeableReader,
|
GrenadParameters, MergeFn, MergeableReader,
|
||||||
};
|
};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result};
|
||||||
@@ -37,8 +40,8 @@ use crate::{FieldId, Result};
|
|||||||
/// Send data in grenad file over provided Sender.
|
/// Send data in grenad file over provided Sender.
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) fn data_from_obkv_documents(
|
pub(crate) fn data_from_obkv_documents(
|
||||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
searchable_fields: Option<HashSet<FieldId>>,
|
searchable_fields: Option<HashSet<FieldId>>,
|
||||||
@@ -91,9 +94,9 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let (
|
let (
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
(
|
(
|
||||||
fid_docid_facet_numbers_chunks,
|
docid_fid_facet_numbers_chunks,
|
||||||
(
|
(
|
||||||
fid_docid_facet_strings_chunks,
|
docid_fid_facet_strings_chunks,
|
||||||
(
|
(
|
||||||
facet_is_null_docids_chunks,
|
facet_is_null_docids_chunks,
|
||||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||||
@@ -150,7 +153,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@@ -160,37 +163,34 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-pair-proximity-docids",
|
"word-pair-proximity-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_fid_word_count_docids,
|
extract_fid_word_count_docids,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
TypedChunk::FieldIdWordCountDocids,
|
TypedChunk::FieldIdWordcountDocids,
|
||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<
|
spawn_extraction_task::<
|
||||||
_,
|
_,
|
||||||
_,
|
_,
|
||||||
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
|
Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
|
||||||
>(
|
>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_roaring_bitmaps,
|
||||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
|
||||||
TypedChunk::WordDocids {
|
word_docids_reader,
|
||||||
word_docids_reader,
|
exact_word_docids_reader,
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@@ -199,9 +199,18 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
|
docid_word_positions_chunks,
|
||||||
|
indexer,
|
||||||
|
lmdb_writer_sx.clone(),
|
||||||
|
extract_word_fid_docids,
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
|
TypedChunk::WordFidDocids,
|
||||||
|
"word-fid-docids",
|
||||||
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
fid_docid_facet_strings_chunks,
|
docid_fid_facet_strings_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_facet_string_docids,
|
extract_facet_string_docids,
|
||||||
@@ -210,8 +219,8 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-facet-string-docids",
|
"field-id-facet-string-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||||
fid_docid_facet_numbers_chunks,
|
docid_fid_facet_numbers_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx,
|
lmdb_writer_sx,
|
||||||
extract_facet_number_docids,
|
extract_facet_number_docids,
|
||||||
@@ -265,7 +274,7 @@ fn spawn_extraction_task<FE, FS, M>(
|
|||||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||||
/// - documents
|
/// - documents
|
||||||
fn send_original_documents_data(
|
fn send_original_documents_data(
|
||||||
original_documents_chunk: Result<grenad::Reader<File>>,
|
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
vectors_field_id: Option<FieldId>,
|
vectors_field_id: Option<FieldId>,
|
||||||
@@ -307,7 +316,7 @@ fn send_original_documents_data(
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[allow(clippy::type_complexity)]
|
#[allow(clippy::type_complexity)]
|
||||||
fn send_and_extract_flattened_documents_data(
|
fn send_and_extract_flattened_documents_data(
|
||||||
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
@@ -324,7 +333,10 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(
|
(
|
||||||
grenad::Reader<CursorClonableMmap>,
|
grenad::Reader<CursorClonableMmap>,
|
||||||
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
|
(
|
||||||
|
grenad::Reader<BufReader<File>>,
|
||||||
|
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||||
|
),
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
)> {
|
)> {
|
||||||
@@ -344,7 +356,7 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
||||||
@@ -372,8 +384,8 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
let ExtractedFacetValues {
|
let ExtractedFacetValues {
|
||||||
fid_docid_facet_numbers_chunk,
|
docid_fid_facet_numbers_chunk,
|
||||||
fid_docid_facet_strings_chunk,
|
docid_fid_facet_strings_chunk,
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
fid_facet_is_empty_docids_chunk,
|
fid_facet_is_empty_docids_chunk,
|
||||||
fid_facet_exists_docids_chunk,
|
fid_facet_exists_docids_chunk,
|
||||||
@@ -384,26 +396,26 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// send fid_docid_facet_numbers_chunk to DB writer
|
// send docid_fid_facet_numbers_chunk to DB writer
|
||||||
let fid_docid_facet_numbers_chunk =
|
let docid_fid_facet_numbers_chunk =
|
||||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||||
fid_docid_facet_numbers_chunk.clone(),
|
docid_fid_facet_numbers_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
// send fid_docid_facet_strings_chunk to DB writer
|
// send docid_fid_facet_strings_chunk to DB writer
|
||||||
let fid_docid_facet_strings_chunk =
|
let docid_fid_facet_strings_chunk =
|
||||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||||
fid_docid_facet_strings_chunk.clone(),
|
docid_fid_facet_strings_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
fid_docid_facet_numbers_chunk,
|
docid_fid_facet_numbers_chunk,
|
||||||
(
|
(
|
||||||
fid_docid_facet_strings_chunk,
|
docid_fid_facet_strings_chunk,
|
||||||
(
|
(
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||||
@@ -413,5 +425,5 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Seek};
use std::io::{self, BufReader, BufWriter, Seek};
use std::time::Instant;

use grenad::{CompressionType, Sorter};
@@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
typ: grenad::CompressionType,
level: Option<u32>,
file: R,
) -> grenad::Writer<R> {
) -> grenad::Writer<BufWriter<R>> {
let mut builder = grenad::Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
builder.compression_level(level);
}
builder.build(file)
builder.build(BufWriter::new(file))
}

pub fn create_sorter(
@@ -53,8 +53,7 @@ pub fn create_sorter(
pub fn sorter_into_reader(
sorter: grenad::Sorter<MergeFn>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
@@ -65,16 +64,18 @@ pub fn sorter_into_reader(
writer_into_reader(writer)
}

pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
pub fn writer_into_reader(
let mut file = writer.into_inner()?;
writer: grenad::Writer<BufWriter<File>>,
) -> Result<grenad::Reader<BufReader<File>>> {
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
file.rewind()?;
grenad::Reader::new(file).map_err(Into::into)
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
}

pub unsafe fn as_cloneable_grenad(
reader: &grenad::Reader<File>,
reader: &grenad::Reader<BufReader<File>>,
) -> Result<grenad::Reader<CursorClonableMmap>> {
let file = reader.get_ref();
let file = reader.get_ref().get_ref();
let mmap = memmap2::Mmap::map(file)?;
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
let reader = grenad::Reader::new(cursor)?;
@@ -90,8 +91,8 @@ where
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
}

impl MergeableReader for Vec<grenad::Reader<File>> {
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
type Output = grenad::Reader<File>;
type Output = grenad::Reader<BufReader<File>>;

fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut merger = MergerBuilder::new(merge_fn);
@@ -100,8 +101,8 @@ impl MergeableReader for Vec<grenad::Reader<File>> {
}
}

impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>);
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);

fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
@@ -114,22 +115,6 @@ impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
}
}

impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);

fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
let mut m3 = MergerBuilder::new(merge_fn);
for (r1, r2, r3) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
m3.push(r3)?;
}
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
}
}

struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);

impl<R: io::Read + io::Seek> MergerBuilder<R> {
@@ -142,7 +127,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
Ok(())
}

fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
let merger = self.0.build();
let mut writer = create_writer(
params.chunk_compression_type,
@@ -193,7 +178,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
reader: grenad::Reader<R>,
indexer: GrenadParameters,
documents_chunk_size: usize,
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
let mut continue_reading = true;
let mut cursor = reader.into_cursor()?;

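The new `writer_into_reader` signature in this hunk relies on the standard-library `BufWriter` contract: `into_inner()` flushes and hands back the wrapped writer, and a flush failure is unwrapped with `into_error()`. The following minimal sketch is not part of either compared branch and the function name is made up for illustration; it only shows that std pattern in isolation:

```rust
// Illustrative sketch (not from the diff): recover the inner File from a
// BufWriter before re-reading it, the way the new writer_into_reader does.
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Seek};

fn buffered_writer_into_reader(writer: BufWriter<File>) -> io::Result<BufReader<File>> {
    // into_inner() flushes the buffer; a failed flush is converted back into
    // the plain io::Error it wraps via into_error().
    let mut file = writer.into_inner().map_err(|err| err.into_error())?;
    // Rewind so the reader starts from the beginning of what was just written.
    file.rewind()?;
    Ok(BufReader::new(file))
}
```

Keeping the `BufWriter`/`BufReader` wrappers inside these helpers means the grenad chunks are written and re-read through buffered I/O while the conversion back to a raw `File` (needed for the mmap in `as_cloneable_grenad`) stays in one place.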
@@ -6,13 +6,11 @@ use std::result::Result as StdResult;
use roaring::RoaringBitmap;

use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::transform::Operation;
use crate::Result;

pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;

#[allow(unused)]
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
@@ -77,123 +75,57 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
Ok(obkvs.last().unwrap().clone())
}

pub fn merge_two_del_add_obkvs(
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
base: obkv::KvReaderU16,
update: obkv::KvReaderU16,
merge_additions: bool,
buffer: &mut Vec<u8>,
) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};

buffer.clear();

let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
match eob {
Left((k, v)) => {
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
if merge_additions {
writer.insert(k, v).unwrap()
} else {
// If merge_additions is false, recreate an obkv keeping the deletions only.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(v);

if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}
Right((k, v)) => writer.insert(k, v).unwrap(),
Both((k, base), (_, update)) => {
// merge deletions and additions.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(base);
let update_reader = KvReaderDelAdd::new(update);

// keep newest deletion.
if let Some(deletion) = update_reader
.get(DelAdd::Deletion)
.or_else(|| base_reader.get(DelAdd::Deletion))
{
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
}

// keep base addition only if merge_additions is true.
let base_addition =
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
// keep newest addition.
// TODO use or_else
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
value_writer.insert(DelAdd::Addition, addition).unwrap();
}

value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}

writer.finish().unwrap();
}

/// Merge all the obkvs from the newest to the oldest.
/// Merge all the obks in the order we see them.
fn inner_merge_del_add_obkvs<'a>(
pub fn merge_obkvs_and_operations<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
merge_additions: bool,
) -> Result<Cow<'a, [u8]>> {
// pop the newest operation from the list.
// [add, add, delete, add, add]
let (newest, obkvs) = obkvs.split_last().unwrap();
// we can ignore everything that happened before the last delete.
// keep the operation type for the returned value.
let starting_position =
let newest_operation_type = newest[0];
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);

// treat the newest obkv as the starting point of the merge.
// [add, add, delete]
let mut acc_operation_type = newest_operation_type;
// if the last operation was a deletion then we simply return the deletion
let mut acc = newest[1..].to_vec();
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
let mut buffer = Vec::new();
{
// reverse iter from the most recent to the oldest.
return Ok(obkvs[obkvs.len() - 1].clone());
for current in obkvs.into_iter().rev() {
// if in the previous iteration there was a complete deletion,
// stop the merge process.
if acc_operation_type == Operation::Deletion as u8 {
break;
}

let newest = obkv::KvReader::new(&acc);
let oldest = obkv::KvReader::new(&current[1..]);
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);

// we want the result of the merge into our accumulator.
std::mem::swap(&mut acc, &mut buffer);
acc_operation_type = current[0];
}
let mut buffer = Vec::new();

acc.insert(0, newest_operation_type);
// (add, add, delete) [add, add]
Ok(Cow::from(acc))
// in the other case, no deletion will be encountered during the merge
let mut ret =
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current[1..]);
merge_two_obkvs(first, second, &mut buffer);

// we want the result of the merge into our accumulator
std::mem::swap(&mut acc, &mut buffer);
acc
});

ret.insert(0, Operation::Addition as u8);
Ok(Cow::from(ret))
}

/// Merge all the obkvs from the newest to the oldest.
pub fn obkvs_merge_additions_and_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
inner_merge_del_add_obkvs(obkvs, true)
}

/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
inner_merge_del_add_obkvs(obkvs, false)
}

/// Do a union of all the CboRoaringBitmaps in the values.
pub fn merge_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
@@ -206,36 +138,3 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
Ok(Cow::from(vec))
}
}

/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// Retrieve the bitmaps from both sides
let mut del_bitmaps_bytes = Vec::new();
let mut add_bitmaps_bytes = Vec::new();
for value in values {
let obkv = KvReaderDelAdd::new(value);
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
del_bitmaps_bytes.push(bitmap_bytes);
}
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
add_bitmaps_bytes.push(bitmap_bytes);
}
}

let mut output_deladd_obkv = KvWriterDelAdd::memory();
let mut buffer = Vec::new();
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
buffer.clear();
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
}
}
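The `merge_obkvs_and_operations` side of the hunk above encodes each queued document version as an operation byte (Deletion or Addition) followed by the obkv payload, and its comments describe the strategy: ignore everything before the last deletion, then fold the remaining additions in order. A standalone sketch of that idea follows; the types and the `merge_versions` name are hypothetical, and the real code merges obkvs field by field rather than concatenating payloads:

```rust
// Simplified illustration of the operation-byte merge strategy (not the real
// implementation). Each queued version is one byte of operation kind followed
// by the document payload.
#[derive(Clone, Copy, PartialEq)]
enum Op {
    Deletion = 0,
    Addition = 1,
}

/// Merge queued versions of one document in the order they were seen.
fn merge_versions(versions: &[Vec<u8>]) -> Vec<u8> {
    assert!(!versions.is_empty(), "there is always at least one queued version");

    // Everything before the last deletion can be ignored.
    let start = versions
        .iter()
        .rposition(|v| v[0] == Op::Deletion as u8)
        .unwrap_or(0);

    // If the last operation is a deletion, the merged result is that deletion.
    if start == versions.len() - 1 && versions[start][0] == Op::Deletion as u8 {
        return versions[start].clone();
    }

    // Otherwise fold the remaining additions together; the real code gives the
    // later version priority per field, here we simply concatenate payloads.
    let mut merged = vec![Op::Addition as u8];
    for version in &versions[start..] {
        if version[0] == Op::Addition as u8 {
            merged.extend_from_slice(&version[1..]);
        }
    }
    merged
}
```

The unit tests at the end of the transform.rs hunk further down exercise exactly these cases: a single addition, a deletion followed by an addition, and an addition followed by a deletion.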
@@ -14,8 +14,7 @@ pub use grenad_helpers::{
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
serialize_roaring_bitmap, MergeFn,
};

@@ -45,7 +44,6 @@ where
Some((head, tail))
}

#[allow(unused)]
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
}
@@ -38,7 +38,7 @@ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{CboRoaringBitmapCodec, Index, Result};
use crate::{Index, Result, RoaringBitmapCodec};

static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5;
@@ -406,23 +406,13 @@ where
}

let typed_chunk = match result? {
TypedChunk::WordDocids {
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
exact_word_docids = Some(cloneable_chunk);
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPairProximityDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -434,6 +424,11 @@ where
word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::WordFidDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordFidDocids(chunk)
}
otherwise => otherwise,
};

@@ -475,14 +470,13 @@ where
let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;

// TODO: reactivate prefix DB with diff-indexing
self.execute_prefix_databases(
// self.execute_prefix_databases(
word_docids,
// word_docids,
exact_word_docids,
// exact_word_docids,
word_pair_proximity_docids,
// word_pair_proximity_docids,
word_position_docids,
// word_position_docids,
word_fid_docids,
// word_fid_docids,
)?;
// )?;

Ok(all_documents_ids.len())
}
@@ -696,8 +690,8 @@ where
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn,
reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_docids_db: Database<Str, RoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
@@ -1499,6 +1493,12 @@ mod tests {
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);

let rtxn = index.read_txn().unwrap();

@@ -1522,6 +1522,12 @@ mod tests {

db_snap!(index, facet_id_string_docids, @"");
db_snap!(index, field_id_docid_facet_strings, @"");
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);

let rtxn = index.read_txn().unwrap();

@@ -1548,6 +1554,12 @@ mod tests {
3 2 second second
3 3 third third
"###);
db_snap!(index, string_faceted_documents_ids, @r###"
0 []
1 []
2 []
3 [0, 1, 2, 3, ]
"###);

let rtxn = index.read_txn().unwrap();

@@ -7,20 +7,18 @@ use std::io::{Read, Seek};
use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools;
use obkv::{KvReader, KvReaderU16, KvWriter};
use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap;
use serde_json::Value;
use smartstring::SmartString;

use super::helpers::{
create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
obkvs_merge_additions_and_deletions, MergeFn,
};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::del_add::into_del_add_obkv;
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -108,8 +106,8 @@ impl<'a, 'i> Transform<'a, 'i> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
};

// We initialize the sorter with the user indexing settings.
@@ -225,21 +223,19 @@ impl<'a, 'i> Transform<'a, 'i> {
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
let docid = match external_documents_ids.get(entry.key()) {
// If the document was already in the db we mark it as a replaced document.
Some(docid) => {
// It'll be deleted later.
// If it was already in the list of replaced documents it means it was deleted
if let Some(docid) = external_documents_ids.get(entry.key()) {
// by the remove_document method. We should starts as if it never existed.
// If it was already in the list of replaced documents it means it was deleted
if self.replaced_documents_ids.insert(docid) {
// by the remove_document method. We should starts as if it never existed.
original_docid = Some(docid);
if self.replaced_documents_ids.insert(docid) {
}
original_docid = Some(docid);

docid
}
None => self
}
.available_documents_ids
let docid = self
.next()
.available_documents_ids
.ok_or(UserError::DocumentLimitReached)?,
.next()
};
.ok_or(UserError::DocumentLimitReached)?;
entry.insert(docid as u64);
docid
}
@@ -267,28 +263,16 @@ impl<'a, 'i> Transform<'a, 'i> {
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later.
let keep_original_version =
self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
document_sorter_buffer.extend_from_slice(base_obkv);
KvReaderU16::new(base_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
document_sorter_buffer.extend_from_slice(&flattened_obkv);
KvReaderU16::new(&flattened_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -304,12 +288,7 @@ impl<'a, 'i> Transform<'a, 'i> {

document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
document_sorter_buffer.extend_from_slice(&obkv_buffer);
KvReaderU16::new(&obkv_buffer),
false,
true,
&mut document_sorter_buffer,
)?;
// We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;

@@ -317,12 +296,7 @@ impl<'a, 'i> Transform<'a, 'i> {
Some(flattened_obkv) => {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
into_del_add_obkv(
document_sorter_buffer.extend_from_slice(&flattened_obkv);
KvReaderU16::new(&flattened_obkv),
false,
true,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -380,25 +354,19 @@ impl<'a, 'i> Transform<'a, 'i> {
let external_documents_ids = self.index.external_documents_ids(wtxn)?;

let mut documents_deleted = 0;
let mut document_sorter_buffer = Vec::new();
for to_remove in to_remove {
if should_abort() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}

// Check if the document has been added in the current indexing process.
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
let deleted_from_current = match self
.new_external_documents_ids_builder
.entry((*to_remove).into())
{
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
Entry::Occupied(entry) => {
let doc_id = *entry.get() as u32;
document_sorter_buffer.clear();
self.original_sorter
document_sorter_buffer.push(Operation::Deletion as u8);
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
self.flattened_sorter
self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;

// we must NOT update the list of replaced_documents_ids
// Either:
@@ -407,69 +375,21 @@ impl<'a, 'i> Transform<'a, 'i> {
// we're removing it there is nothing to do.
self.new_documents_ids.remove(doc_id);
entry.remove_entry();
true
}
Entry::Vacant(_) => false,
Entry::Vacant(entry) => {
};
// If the document was already in the db we mark it as a `to_delete` document.
// It'll be deleted later. We don't need to push anything to the sorters.
// If the document was already in the db we mark it as a `to_delete` document.
if let Some(docid) = external_documents_ids.get(entry.key()) {
// Then we push the document in sorters in deletion mode.
self.replaced_documents_ids.insert(docid);
let deleted_from_db = match external_documents_ids.get(&to_remove) {
} else {
Some(docid) => {
// if the document is nowehere to be found, there is nothing to do and we must NOT
self.replaced_documents_ids.insert(docid);
// increment the count of documents_deleted
continue;
// fetch the obkv document
let original_key = BEU32::new(docid);
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::ByteSlice>()
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;

// push it as to delete in the original_sorter
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;

// flatten it and push it as to delete in the flattened_sorter
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}

true
}
None => false,
};

// increase counter only if the document existed somewhere before.
documents_deleted += 1;
if deleted_from_current || deleted_from_db {
documents_deleted += 1;
}
}

Ok(documents_deleted)
@@ -669,7 +589,9 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;

while let Some((key, val)) = iter.next()? {
// skip first byte corresponding to the operation type (Deletion or Addition).
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];

// send a callback to show at which step we are
@@ -709,7 +631,9 @@ impl<'a, 'i> Transform<'a, 'i> {
// We get rids of the `Operation` byte and skip the deleted documents as well.
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
while let Some((key, val)) = iter.next()? {
// skip first byte corresponding to the operation type (Deletion or Addition).
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];
writer.insert(key, val)?;
}
@@ -735,8 +659,10 @@ impl<'a, 'i> Transform<'a, 'i> {
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
original_documents,
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
flattened_documents,
flattened_documents: flattened_documents
.into_inner()
.map_err(|err| err.into_error())?,
})
}

@@ -787,7 +713,6 @@ impl<'a, 'i> Transform<'a, 'i> {
);

let mut obkv_buffer = Vec::new();
let mut document_sorter_buffer = Vec::new();
for result in self.index.all_documents(wtxn)? {
let (docid, obkv) = result?;

@@ -802,9 +727,7 @@ impl<'a, 'i> Transform<'a, 'i> {
}

let buffer = obkv_writer.into_inner()?;
document_sorter_buffer.clear();
original_writer.insert(docid.to_be_bytes(), &buffer)?;
into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;

// Once we have the document. We're going to flatten it
// and insert it in the flattened sorter.
@@ -839,9 +762,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
writer.insert(fid, &value)?;
}
document_sorter_buffer.clear();
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
}

// Once we have written all the documents, we extract
@@ -860,8 +781,10 @@ impl<'a, 'i> Transform<'a, 'i> {
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,
original_documents,
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
flattened_documents,
flattened_documents: flattened_documents
.into_inner()
.map_err(|err| err.into_error())?,
};

let new_facets = output.compute_real_facets(wtxn, self.index)?;
@@ -905,86 +828,38 @@ mod test {

#[test]
fn merge_obkvs() {
let mut additive_doc_0 = Vec::new();
let mut doc_0 = Vec::new();
let mut deletive_doc_0 = Vec::new();
let mut kv_writer = KvWriter::new(&mut doc_0);
let mut del_add_doc_0 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
kv_writer.finish().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
doc_0.insert(0, Operation::Addition as u8);
additive_doc_0.insert(0, Operation::Addition as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
deletive_doc_0.insert(0, Operation::Deletion as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
del_add_doc_0.insert(0, Operation::Addition as u8);

let mut additive_doc_1 = Vec::new();
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
let mut kv_writer = KvWriter::memory();
assert_eq!(*ret, doc_0);
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
additive_doc_1.insert(0, Operation::Addition as u8);

let mut additive_doc_0_1 = Vec::new();
let ret = merge_obkvs_and_operations(
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
additive_doc_0_1.insert(0, Operation::Addition as u8);

let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
.unwrap();
assert_eq!(*ret, additive_doc_0);

let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, del_add_doc_0);
assert_eq!(*ret, doc_0);

let ret = obkvs_merge_additions_and_deletions(
let ret = merge_obkvs_and_operations(
&[],
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
)
.unwrap();
assert_eq!(*ret, deletive_doc_0);
assert_eq!(*ret, [Operation::Deletion as u8]);

let ret = obkvs_merge_additions_and_deletions(
let ret = merge_obkvs_and_operations(
&[],
&[
Cow::from(additive_doc_1.as_slice()),
Cow::from([Operation::Addition as u8, 1].as_slice()),
Cow::from(deletive_doc_0.as_slice()),
Cow::from([Operation::Deletion as u8].as_slice()),
Cow::from(additive_doc_0.as_slice()),
Cow::from(doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, del_add_doc_0);
assert_eq!(*ret, doc_0);

let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0_1);

let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0);

let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[
Cow::from(deletive_doc_0.as_slice()),
Cow::from(additive_doc_1.as_slice()),
Cow::from(additive_doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, del_add_doc_0);
}
}
@@ -1,7 +1,8 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use bytemuck::allocation::pod_collect_to_vec;
|
use bytemuck::allocation::pod_collect_to_vec;
|
||||||
use charabia::{Language, Script};
|
use charabia::{Language, Script};
|
||||||
@@ -10,13 +11,14 @@ use heed::types::ByteSlice;
|
|||||||
use heed::RwTxn;
|
use heed::RwTxn;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
|
use super::helpers::{
|
||||||
|
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
|
||||||
|
};
|
||||||
use super::{ClonableMmap, MergeFn};
|
use super::{ClonableMmap, MergeFn};
|
||||||
use crate::distance::NDotProductPoint;
|
use crate::distance::NDotProductPoint;
|
||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::index::Hnsw;
|
use crate::index::Hnsw;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
|
||||||
use crate::update::facet::FacetsUpdate;
|
use crate::update::facet::FacetsUpdate;
|
||||||
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
||||||
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
|
||||||
@@ -25,23 +27,23 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||||
Documents(grenad::Reader<CursorClonableMmap>),
|
Documents(grenad::Reader<CursorClonableMmap>),
|
||||||
FieldIdWordCountDocids(grenad::Reader<File>),
|
FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
|
||||||
NewDocumentsIds(RoaringBitmap),
|
NewDocumentsIds(RoaringBitmap),
|
||||||
WordDocids {
|
WordDocids {
|
||||||
word_docids_reader: grenad::Reader<File>,
|
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
exact_word_docids_reader: grenad::Reader<File>,
|
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||||
word_fid_docids_reader: grenad::Reader<File>,
|
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordFidDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetExistsDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetIsNullDocids(grenad::Reader<File>),
|
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||||
VectorPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
VectorPoints(grenad::Reader<BufReader<File>>),
|
||||||
|
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TypedChunk {
|
impl TypedChunk {
|
||||||
@@ -56,25 +58,23 @@ impl TypedChunk {
|
|||||||
TypedChunk::Documents(grenad) => {
|
TypedChunk::Documents(grenad) => {
|
||||||
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
format!("Documents {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdWordCountDocids(grenad) => {
|
TypedChunk::FieldIdWordcountDocids(grenad) => {
|
||||||
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::NewDocumentsIds(grenad) => {
|
TypedChunk::NewDocumentsIds(grenad) => {
|
||||||
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids {
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
|
||||||
word_docids_reader,
|
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
} => format!(
|
|
||||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
|
|
||||||
word_docids_reader.len(),
|
word_docids_reader.len(),
|
||||||
exact_word_docids_reader.len(),
|
exact_word_docids_reader.len()
|
||||||
word_fid_docids_reader.len()
|
|
||||||
),
|
),
|
||||||
TypedChunk::WordPositionDocids(grenad) => {
|
TypedChunk::WordPositionDocids(grenad) => {
|
||||||
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
|
TypedChunk::WordFidDocids(grenad) => {
|
||||||
|
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
|
}
|
||||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
@@ -99,8 +99,8 @@ impl TypedChunk {
|
|||||||
TypedChunk::VectorPoints(grenad) => {
|
TypedChunk::VectorPoints(grenad) => {
|
||||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
TypedChunk::ScriptLanguageDocids(grenad) => {
|
||||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -124,33 +124,29 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
|
index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
fid_word_count_docids_iter,
|
fid_word_count_docids_iter,
|
||||||
&index.field_id_word_count_docids,
|
&index.field_id_word_count_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
deladd_serialize_add_side,
|
|value, _buffer| Ok(value),
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
TypedChunk::NewDocumentsIds(documents_ids) => {
|
||||||
return Ok((documents_ids, is_merged_database))
|
return Ok((documents_ids, is_merged_database))
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids {
|
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
||||||
word_docids_reader,
|
|
||||||
exact_word_docids_reader,
|
|
||||||
word_fid_docids_reader,
|
|
||||||
} => {
|
|
||||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
word_docids_iter.clone(),
|
word_docids_iter.clone(),
|
||||||
&index.word_docids,
|
&index.word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
deladd_serialize_add_side,
|
|value, _buffer| Ok(value),
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
merge_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||||
@@ -159,18 +155,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.exact_word_docids,
|
&index.exact_word_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
deladd_serialize_add_side,
|
|value, _buffer| Ok(value),
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
merge_roaring_bitmaps,
|
||||||
)?;
|
|
||||||
|
|
||||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
|
||||||
append_entries_into_database(
|
|
||||||
word_fid_docids_iter,
|
|
||||||
&index.word_fid_docids,
|
|
||||||
wtxn,
|
|
||||||
index_is_empty,
|
|
||||||
deladd_serialize_add_side,
|
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// create fst from word docids
|
// create fst from word docids
|
||||||
@@ -191,8 +177,19 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
&index.word_position_docids,
|
&index.word_position_docids,
|
||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
deladd_serialize_add_side,
|
|value, _buffer| Ok(value),
|
||||||
merge_deladd_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
|
)?;
|
||||||
|
is_merged_database = true;
|
||||||
|
}
|
||||||
|
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
||||||
|
append_entries_into_database(
|
||||||
|
word_fid_docids_iter,
|
||||||
|
&index.word_fid_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
@@ -212,8 +209,8 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_exists_docids,
                 wtxn,
                 index_is_empty,
-                deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -223,8 +220,8 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_is_null_docids,
                 wtxn,
                 index_is_empty,
-                deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -234,8 +231,8 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_is_empty_docids,
                 wtxn,
                 index_is_empty,
-                deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -245,8 +242,8 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.word_pair_proximity_docids,
                 wtxn,
                 index_is_empty,
-                deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -318,7 +315,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 let found = vector.len();
                 let expected = *expected_dimensions.get_or_insert(found);
                 if expected != found {
-                    return Err(UserError::InvalidVectorDimensions { expected, found }.into());
+                    return Err(UserError::InvalidVectorDimensions { expected, found })?;
                 }

                 points.push(NDotProductPoint::new(vector));
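The only change in this hunk swaps an explicit `.into()` for the `?` operator. Both spell the same `From` conversion into the function's error type; here is a tiny self-contained sketch of that equivalence, using placeholder `UserError`/`Error` types rather than milli's own.

#[derive(Debug)]
struct UserError;

#[derive(Debug)]
struct Error;

impl From<UserError> for Error {
    fn from(_: UserError) -> Error {
        Error
    }
}

// `?` applied to an `Err` runs the same `From` conversion that `.into()`
// spells out, so the two returns below are interchangeable.
fn with_question_mark() -> Result<(), Error> {
    return Err(UserError)?;
}

fn with_into() -> Result<(), Error> {
    return Err(UserError.into());
}

fn main() {
    assert!(with_question_mark().is_err());
    assert!(with_into().is_err());
}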
@@ -342,25 +339,22 @@ pub(crate) fn write_typed_chunk_into_index(
             log::debug!("There are {} entries in the HNSW so far", hnsw_length);
             index.put_vector_hnsw(wtxn, &new_hnsw)?;
         }
-        TypedChunk::ScriptLanguageDocids(sl_map) => {
-            for (key, (deletion, addition)) in sl_map {
-                let mut db_key_exists = false;
+        TypedChunk::ScriptLanguageDocids(hash_pair) => {
+            let mut buffer = Vec::new();
+            for (key, value) in hash_pair {
+                buffer.clear();
                 let final_value = match index.script_language_docids.get(wtxn, &key)? {
                     Some(db_values) => {
-                        db_key_exists = true;
-                        (db_values - deletion) | addition
+                        let mut db_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
+                        let mut new_value_buffer = Vec::new();
+                        serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
+                        merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
+                        RoaringBitmap::deserialize_from(&buffer[..])?
                     }
-                    None => addition,
+                    None => value,
                 };
-                if final_value.is_empty() {
-                    // If the database entry exists, delete it.
-                    if db_key_exists == true {
-                        index.script_language_docids.delete(wtxn, &key)?;
-                    }
-                } else {
-                    index.script_language_docids.put(wtxn, &key, &final_value)?;
-                }
+                index.script_language_docids.put(wtxn, &key, &final_value)?;
             }
         }
     }
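The two sides of the ScriptLanguageDocids arm update a stored bitmap differently: the left-hand side applies a deletion/addition pair to the current value and removes entries that end up empty, while the right-hand side only unions the incoming bitmap into whatever is stored. Below is a small stand-alone sketch of the two strategies, assuming the roaring crate as a dependency; the sample document ids are made up.

use roaring::RoaringBitmap;

fn main() {
    let db_values: RoaringBitmap = (0..10u32).collect();
    let deletion: RoaringBitmap = (0..3u32).collect();
    let addition: RoaringBitmap = (20..23u32).collect();

    // Left-hand side of the diff: apply the DelAdd pair to the stored value,
    // so document ids can both leave and enter the bitmap.
    let mut deladd_result = &db_values - &deletion;
    deladd_result |= &addition;

    // Right-hand side of the diff: only union the incoming value in,
    // so ids are never removed on this code path.
    let union_result = &db_values | &addition;

    assert!(!deladd_result.contains(1) && deladd_result.contains(20));
    assert!(union_result.contains(1) && union_result.contains(20));
}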
@@ -385,28 +379,20 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok(builder.into_set())
 }

-/// A function that extracts and returns the Add side of a DelAdd obkv.
-/// This is useful when there is no previous value in the database and
-/// therefore we don't need to do a diff with what's already there.
-///
-/// If there is no Add side we currently write an empty buffer
-/// which is a valid CboRoaringBitmap.
-fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
-    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
+    let new_value = RoaringBitmap::deserialize_from(new_value)?;
+    let db_value = RoaringBitmap::deserialize_from(db_value)?;
+    let value = new_value | db_value;
+    Ok(serialize_roaring_bitmap(&value, buffer)?)
 }

-/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
-///
-/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
-/// the second one is the CboRoaringBitmap to merge into.
-fn merge_deladd_cbo_roaring_bitmaps(
-    deladd_obkv: &[u8],
-    previous: &[u8],
+fn merge_cbo_roaring_bitmaps(
+    new_value: &[u8],
+    db_value: &[u8],
     buffer: &mut Vec<u8>,
 ) -> Result<()> {
-    Ok(CboRoaringBitmapCodec::merge_deladd_into(
-        KvReaderDelAdd::new(deladd_obkv),
-        previous,
+    Ok(CboRoaringBitmapCodec::merge_into(
+        &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
         buffer,
     )?)
 }
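For readers who have not met the left-hand side's DelAdd helpers: a DelAdd obkv carries an optional Deletion payload and an optional Addition payload, and `deladd_serialize_add_side` keeps only the Addition bytes, falling back to an empty slice, which is still a valid (empty) CboRoaringBitmap. The stand-alone model below illustrates that behaviour; the `DelAddPair` struct is a simplification, not milli's obkv-based `KvReaderDelAdd`.

/// Simplified stand-in for a DelAdd obkv: two optional payloads.
struct DelAddPair<'a> {
    deletion: Option<&'a [u8]>,
    addition: Option<&'a [u8]>,
}

/// Keep only the Addition side, like `deladd_serialize_add_side` on the
/// left-hand side of this diff; the empty-slice fallback is harmless because
/// an empty buffer is a valid serialized bitmap.
fn serialize_add_side<'a>(pair: &DelAddPair<'a>) -> &'a [u8] {
    pair.addition.unwrap_or_default()
}

fn main() {
    let old_bytes: &[u8] = b"old-bitmap-bytes";
    let new_bytes: &[u8] = b"new-bitmap-bytes";

    let both = DelAddPair { deletion: Some(old_bytes), addition: Some(new_bytes) };
    assert!(both.deletion.is_some());
    assert_eq!(serialize_add_side(&both), new_bytes);

    let delete_only = DelAddPair { deletion: Some(old_bytes), addition: None };
    assert!(serialize_add_side(&delete_only).is_empty());
}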
@@ -469,7 +455,6 @@ where
     R: io::Read + io::Seek,
     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
-    K: for<'a> heed::BytesDecode<'a>,
 {
     puffin::profile_function!(format!("number of entries: {}", data.len()));

@@ -490,12 +475,6 @@ where
     let mut cursor = data.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
         if valid_lmdb_key(key) {
-            debug_assert!(
-                K::bytes_decode(key).is_some(),
-                "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
-                key.len(),
-                &key
-            );
             buffer.clear();
             let value = serialize_value(value, &mut buffer)?;
             unsafe { database.append(key, value)? };
@@ -21,7 +21,6 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;

 mod available_documents_ids;
 mod clear_documents;
-pub(crate) mod del_add;
 mod delete_documents;
 pub(crate) mod facet;
 mod index_documents;
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::collections::HashSet;
-use std::io::BufReader;
+use std::io::{BufReader, BufWriter};

 use grenad::CompressionType;
 use heed::types::ByteSlice;
@@ -119,9 +119,9 @@ pub fn insert_into_database(
 pub fn write_into_lmdb_database_without_merging(
     wtxn: &mut heed::RwTxn,
     database: heed::PolyDatabase,
-    writer: grenad::Writer<std::fs::File>,
+    writer: grenad::Writer<BufWriter<std::fs::File>>,
 ) -> Result<()> {
-    let file = writer.into_inner()?;
+    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
     let reader = grenad::Reader::new(BufReader::new(file))?;
     if database.is_empty(wtxn)? {
         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
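The right-hand side of this hunk now receives a grenad writer wrapped in a `BufWriter`, so recovering the underlying file takes two steps: grenad's `into_inner` to finish the writer, then `BufWriter::into_inner`, whose error type must be unwrapped with `into_error`. The sketch below shows that second step using only the standard library, with a `Vec<u8>` standing in for the file.

use std::io::{BufWriter, Write};

fn main() -> std::io::Result<()> {
    // Stand-in for the buffered output: any `Write` implementor works.
    let mut writer = BufWriter::new(Vec::new());
    writer.write_all(b"some serialized entries")?;

    // `BufWriter::into_inner` flushes and returns the wrapped writer, but its
    // error type is `IntoInnerError`, hence the `into_error` conversion seen
    // on the right-hand side of this diff.
    let inner: Vec<u8> = writer.into_inner().map_err(|err| err.into_error())?;
    assert!(!inner.is_empty());
    Ok(())
}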
Some files were not shown because too many files have changed in this diff.