Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-18 20:30:47 +00:00)

Compare commits: diff-index ... prototype- (170 commits)
Commits included in this comparison (SHA1):

620fee35f9, cbaa54cafd, 1bccf2079e, 1b2ea6cf19, 1ad1fcc8c8, 87610a5f98, 2544bc1416, ff522c919d, 1c39459cf4, bf0651f23c,
5b20e625f3, bc51d6157a, 1b4ff991c0, 4b64c33aa2, 12323d610e, 44e9033b3a, 4d864f0702, b10c060bf7, e507ef5932, c71b1d33ae,
0fc446c62f, 0fb6acefc3, b1d1355b69, f19332466e, 03ddb4f310, c855cc2721, da0503ef80, 94206b0055, b40253bf18, d8bf3f3fc2,
9d59e8011a, dad78cbf8d, 4e91707a06, de10f20732, be395c7944, 9fedd8101a, 54d07a8da3, 58690dfb19, abf424ebfc, dfab6293c9,
fdf3f7f627, 6260cff65f, 8e0d9c9a5e, ae4ec8ea55, 652ac3052d, 9a2dccc3bc, a35988550c, e78281785c, 3c15881818, 73c06d31d9,
290e773d23, fa6c7f65ca, 113527f466, c534a1b687, 2263dff02b, d651b3ef01, 762b0b47e6, 01d5eedf2f, 073f89db79, 8370fbc92b,
85f42fbc03, c6b3c18c85, bafeb892a7, 8fb221dae3, 5be569e3e2, 946c762d28, cda6ca1ee6, 696fcf4d18, 476e4d3dbe, 576fa9c6da,
77dcbff6b2, 544440c363, a3dae4db9b, ba90a5ec0e, b26dc9aabe, 66abac9364, 59f88c14b3, 14832cb324, 04ec293024, f67ff3a738,
560e8f5613, 2d3f15f82c, 40186bf403, 87e3d27878, 6bcf8b4f8c, 46aa75abdb, 2597bbd107, e2bc054604, fcd3a1434d, a82dee21e0,
bc45c1206d, 6ae4100f07, 0c47defeee, 313b16bec2, 1dd97578a8, f5ef69293b, 1c5705c164, 66c2c82a18, 28a8d0ccda, 96be85396d,
df9e5c8651, b541d48847, 8ccf32d1a0, db1ca21231, 11ea5acff9, 8d77736a67, 748b333161, 17b647dfe5, 62ea81bef6, f28f09ae2f,
eae9eab181, cf8dad1ca0, dd619913da, 9b55ff16e9, e761db582f, d8c649b3cd, 5e0485d8dd, 27eec21415, 62cc97ba70, fed59cc1d5,
2b3adef796, 956cfc5487, 12fc878640, 0a2e8b92a9, c7a3f80de6, 029d4de043, 549f1bcccf, 689ec7c7ad, 3655d4bdca, 055ca3935b,
1b8871a585, bf8fac6676, f2a9e1ebbb, c45c6cf54c, 513e61e9a3, 90a626bf80, 0d4acf2daa, 58db8d85ec, 62dfd09dc6, 656dadabea,
c5f7893fbb, 8cf2ccf168, 0913373a5e, 1a7f1282af, bc747aac3a, be92376ab3, cf7e355735, 5f09d89ad1, 6ecb26a3f8, 76c6f554d6,
f343ef5f2f, 96982a768a, fca78fbc46, 67a678cfb6, d1331d8abf, 19ba129165, d4da06ff47, 3e0471edae, 432df03c4c, 11958016dd,
63c250a04d, 06d8cd5b72, c0f2724c2d, d772073dfa, 8fe8ddea79, 8a95bf28e5, 43989fe2e4, c668a29ed5, b10eeb0e41, 4a8515e9fc
.github/ISSUE_TEMPLATE/sprint_issue.md (vendored, 8 lines changed)
@@ -7,19 +7,17 @@ assignees: ''

---

Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_)
Related product team resources: [PRD]() (_internal only_)
Related product discussion:
Related spec: WIP

## Motivation

<!---Copy/paste the information in the roadmap resources or briefly detail the product motivation. Ask product team if any hesitation.-->
<!---Copy/paste the information in PRD or briefly detail the product motivation. Ask product team if any hesitation.-->

## Usage

<!---Write a quick description of the usage if the usage has already been defined-->
Refer to the final spec to know the details and the final decisions about the usage.
<!---Link to the public part of the PRD, or to the related product discussion for experimental features-->

## TODO
@@ -8,11 +8,11 @@ env:

jobs:
run-benchmarks-on-comment:
if: startsWith(github.event.comment.body, '/benchmark')
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal

@@ -27,14 +27,25 @@ jobs:

reaction-type: "eyes"
repo-token: ${{ env.GH_TOKEN }}
- uses: xt0rted/pull-request-comment-branch@v2
id: comment-branch
with:
repo_token: ${{ env.GH_TOKEN }}
- uses: actions/checkout@v3
if: success()
with:
fetch-depth: 0 # fetch full history to be able to get main commit sha
ref: ${{ steps.comment-branch.outputs.head_ref }}
# Set variables
- name: Set current branch name
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
run: echo "name=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_OUTPUT
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
run: echo "name=$(git rev-parse --abbrev-ref HEAD | tr '/' '_')" >> $GITHUB_OUTPUT
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash

@@ -76,9 +87,11 @@ jobs:

env:
GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
run: |
export base=$(git log --pretty=%p -n 1)
set -x
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
echo 'Here are your benchmarks diff 👊' >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
gh pr comment ${{ steps.current_branch.outputs.name }} --body-file body.txt
Cargo.lock (generated, 60 lines changed)
@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
||||
|
||||
[[package]]
|
||||
name = "benchmarks"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
@ -1206,7 +1206,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "dump"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"big_s",
|
||||
@ -1417,7 +1417,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "file-store"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"faux",
|
||||
"tempfile",
|
||||
@ -1439,7 +1439,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "filter-parser"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"nom",
|
||||
@ -1459,7 +1459,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "flatten-serde-json"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@ -1577,7 +1577,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fuzzers"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"clap",
|
||||
@ -1663,12 +1663,13 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||
|
||||
[[package]]
|
||||
name = "grenad"
|
||||
version = "0.4.4"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
|
||||
checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"rayon",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
@ -1891,7 +1892,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "index-scheduler"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"big_s",
|
||||
@ -2088,7 +2089,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "json-depth-checker"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@ -2500,7 +2501,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "meili-snap"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"md5",
|
||||
@ -2509,7 +2510,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"actix-cors",
|
||||
"actix-http",
|
||||
@ -2564,7 +2565,6 @@ dependencies = [
|
||||
"platform-dirs",
|
||||
"prometheus",
|
||||
"puffin",
|
||||
"puffin_http",
|
||||
"rand",
|
||||
"rayon",
|
||||
"regex",
|
||||
@ -2600,7 +2600,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-auth"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"base64 0.21.2",
|
||||
"enum-iterator",
|
||||
@ -2619,7 +2619,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-types"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"actix-web",
|
||||
"anyhow",
|
||||
@ -2673,7 +2673,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "milli"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"big_s",
|
||||
"bimap",
|
||||
@ -2867,9 +2867,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "obkv"
|
||||
version = "0.2.0"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
|
||||
checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
@ -2996,7 +2996,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
|
||||
|
||||
[[package]]
|
||||
name = "permissive-json-pointer"
|
||||
version = "1.4.0"
|
||||
version = "1.4.1"
|
||||
dependencies = [
|
||||
"big_s",
|
||||
"serde_json",
|
||||
@ -3194,7 +3194,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"rustix 0.36.15",
|
||||
"rustix 0.36.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3237,18 +3237,6 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "puffin_http"
|
||||
version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13bffc600c35913d282ae1e96a6ffcdf36dc7a7cdb9310e0ba15914d258c8193"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"crossbeam-channel",
|
||||
"log",
|
||||
"puffin",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.32"
|
||||
@ -3479,9 +3467,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.15"
|
||||
version = "0.36.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c37f1bd5ef1b5422177b7646cba67430579cfe2ace80f284fee876bca52ad941"
|
||||
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"errno",
|
||||
@ -4444,9 +4432,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "webpki"
|
||||
version = "0.22.1"
|
||||
version = "0.22.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
|
||||
checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"untrusted",
|
||||
|
@@ -18,7 +18,7 @@ members = [
]

[workspace.package]
version = "1.4.0"
version = "1.4.1"
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "Meilisearch HTTP server"
homepage = "https://meilisearch.com"
@@ -1,14 +1,14 @@

# Profiling Meilisearch

Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options.
Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options [in Puffin Viewer](https://github.com/embarkstudios/puffin#ui).



## Profiling the Indexing Process

When you enable the `profile-with-puffin` feature of Meilisearch, a Puffin HTTP server will run on Meilisearch and listen on the default _0.0.0.0:8585_ address. This server will record a "frame" whenever it executes the `IndexScheduler::tick` method.
When you enable [the `exportPuffinReports` experimental feature](https://www.meilisearch.com/docs/learn/experimental/overview) of Meilisearch, Puffin reports with the `.puffin` extension will be automatically exported to disk. When this option is enabled, the engine will automatically create a "frame" whenever it executes the `IndexScheduler::tick` method.

Once your Meilisearch is running and awaits new indexation operations, you must [install and run the `puffin_viewer` tool](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) to see the profiling results. I advise you to run the viewer with the `RUST_LOG=puffin_http::client=debug` environment variable to see the client trying to connect to your server.
[Puffin Viewer](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) is used to analyze the reports. Those reports show areas where Meilisearch spent time during indexing.

Another piece of advice on the Puffin viewer UI interface is to consider the _Merge children with same ID_ option. It can hide the exact actual timings at which events were sent. Please turn it off when you see strange gaps on the Flamegraph. It can help.
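The export mechanism the new paragraph describes is implemented further down in this diff, in `index-scheduler/src/lib.rs`: the scheduler owns a `puffin::GlobalFrameView`, closes a frame after each `tick`, and writes it to a `.puffin` file. The sketch below condenses that mechanism; the function name, the scope label, and the `anyhow` error handling are illustrative and not part of the actual code.

```rust
use std::fs::File;

use puffin::{FrameView, GlobalFrameView};
use time::OffsetDateTime;

// A stripped-down version of the export loop: record scopes for one unit of
// work, then write the accumulated frame to a `.puffin` file on disk.
fn export_one_frame(frame_view: &GlobalFrameView) -> anyhow::Result<()> {
    // Scopes are only recorded when profiling is turned on.
    puffin::set_scopes_on(true);

    {
        // Everything profiled inside this scope ends up in the current frame.
        puffin::profile_scope!("indexing-tick");
        // ... do the work you want to measure ...
    }

    // Close the current frame so the sink (the `GlobalFrameView`) receives it.
    puffin::GlobalProfiler::lock().new_frame();

    let mut view = frame_view.lock();
    if !view.is_empty() {
        let now = OffsetDateTime::now_utc();
        let mut file = File::create(format!("{}.puffin", now))?;
        view.save_to_writer(&mut file)?;
        // Start from a clean slate before measuring the next frame.
        *view = FrameView::default();
    }
    Ok(())
}
```

The files produced this way can then be opened from Puffin Viewer's _File_ menu, as described above.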
@@ -25,6 +25,12 @@

<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>

---

### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!

---

Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.

<p align="center" name="demo">
@ -6,9 +6,7 @@ use std::path::Path;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use milli::heed::{EnvOpenOptions, RwTxn};
|
||||
use milli::update::{
|
||||
DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
|
||||
};
|
||||
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use milli::Index;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand_chacha::rand_core::SeedableRng;
|
||||
@ -266,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -613,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -875,22 +853,41 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
}
|
||||
|
||||
fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
let indexer_config = IndexerConfig::default();
|
||||
for ids in document_ids_to_delete {
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
|
||||
// Since what we have is an iterator, it would be better to delete in chunks
|
||||
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
|
||||
external_documents_ids
|
||||
.find_external_id_of(&wtxn, ids)
|
||||
.unwrap()
|
||||
.only_external_ids()
|
||||
.collect();
|
||||
let ids = external_to_internal.unwrap();
|
||||
let config = IndexDocumentsConfig::default();
|
||||
|
||||
let mut builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
|
||||
.unwrap();
|
||||
(builder, _) = builder.remove_documents(ids).unwrap();
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
fn indexing_movies_in_three_batches(c: &mut Criterion) {
|
||||
let mut group = c.benchmark_group("indexing");
|
||||
group.sample_size(BENCHMARK_ITERATION);
|
||||
@ -1112,17 +1109,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
@ -1338,17 +1325,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
|
||||
(index, document_ids_to_delete)
|
||||
},
|
||||
move |(index, document_ids_to_delete)| {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
for ids in document_ids_to_delete {
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_documents(&ids);
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index.prepare_for_closing().wait();
|
||||
delete_documents_from_ids(index, document_ids_to_delete)
|
||||
},
|
||||
)
|
||||
});
|
||||
|
@ -526,12 +526,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -541,12 +541,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -571,12 +571,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -617,12 +617,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -632,12 +632,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -647,12 +647,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
|
@ -1,24 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: spells.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -1,38 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: products.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {
|
||||
"android": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"iphone": [
|
||||
"phone",
|
||||
"smartphone"
|
||||
],
|
||||
"phone": [
|
||||
"android",
|
||||
"iphone",
|
||||
"smartphone"
|
||||
]
|
||||
},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
---
|
||||
source: dump/src/reader/mod.rs
|
||||
expression: movies.settings().unwrap()
|
||||
---
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"genres",
|
||||
"id"
|
||||
],
|
||||
"rankingRules": [
|
||||
"typo",
|
||||
"words",
|
||||
"proximity",
|
||||
"attribute",
|
||||
"exactness",
|
||||
"release_date:asc"
|
||||
],
|
||||
"stopWords": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null
|
||||
}
|
@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
|
||||
pub type Unchecked = settings::Unchecked;
|
||||
|
||||
pub type Task = updates::UpdateEntry;
|
||||
pub type Kind = updates::UpdateMeta;
|
||||
|
||||
// everything related to the errors
|
||||
pub type ResponseError = errors::ResponseError;
|
||||
@ -107,8 +108,11 @@ impl V2Reader {
|
||||
pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
|
||||
Ok(self.index_uuid.iter().map(|index| -> Result<_> {
|
||||
V2IndexReader::new(
|
||||
index.uid.clone(),
|
||||
&self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
|
||||
index,
|
||||
BufReader::new(
|
||||
File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
|
||||
),
|
||||
)
|
||||
}))
|
||||
}
|
||||
@ -143,16 +147,41 @@ pub struct V2IndexReader {
|
||||
}
|
||||
|
||||
impl V2IndexReader {
|
||||
pub fn new(name: String, path: &Path) -> Result<Self> {
|
||||
pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
|
||||
let meta = File::open(path.join("meta.json"))?;
|
||||
let meta: DumpMeta = serde_json::from_reader(meta)?;
|
||||
|
||||
let mut created_at = None;
|
||||
let mut updated_at = None;
|
||||
|
||||
for line in tasks.lines() {
|
||||
let task: Task = serde_json::from_str(&line?)?;
|
||||
if !(task.uuid == index_uuid.uuid && task.is_finished()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let new_created_at = match task.update.meta() {
|
||||
Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
|
||||
_ => None,
|
||||
};
|
||||
let new_updated_at = task.update.finished_at();
|
||||
|
||||
if created_at.is_none() || created_at > new_created_at {
|
||||
created_at = new_created_at;
|
||||
}
|
||||
|
||||
if updated_at.is_none() || updated_at < new_updated_at {
|
||||
updated_at = new_updated_at;
|
||||
}
|
||||
}
|
||||
|
||||
let current_time = OffsetDateTime::now_utc();
|
||||
|
||||
let metadata = IndexMetadata {
|
||||
uid: name,
|
||||
uid: index_uuid.uid.clone(),
|
||||
primary_key: meta.primary_key,
|
||||
// FIXME: Iterate over the whole task queue to find the creation and last update date.
|
||||
created_at: OffsetDateTime::now_utc(),
|
||||
updated_at: OffsetDateTime::now_utc(),
|
||||
created_at: created_at.unwrap_or(current_time),
|
||||
updated_at: updated_at.unwrap_or(current_time),
|
||||
};
|
||||
|
||||
let ret = V2IndexReader {
|
||||
@ -248,12 +277,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.688964637Z",
|
||||
"updatedAt": "2022-10-09T20:27:23.951017769Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -263,12 +292,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:22.197788495Z",
|
||||
"updatedAt": "2022-10-09T20:28:01.93111053Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -293,12 +322,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2022-10-09T20:27:24.242683494Z",
|
||||
"updatedAt": "2022-10-09T20:27:24.312809641Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -340,12 +369,12 @@ pub(crate) mod test {
|
||||
assert!(indexes.is_empty());
|
||||
|
||||
// products
|
||||
insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(products.metadata(), @r###"
|
||||
{
|
||||
"uid": "products",
|
||||
"primaryKey": "sku",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.595257Z",
|
||||
"updatedAt": "2023-01-30T16:25:58.70348Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -355,12 +384,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
|
||||
|
||||
// movies
|
||||
insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(movies.metadata(), @r###"
|
||||
{
|
||||
"uid": "movies",
|
||||
"primaryKey": "id",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:56.192178Z",
|
||||
"updatedAt": "2023-01-30T16:25:56.455714Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
@ -370,12 +399,12 @@ pub(crate) mod test {
|
||||
meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
|
||||
|
||||
// spells
|
||||
insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
|
||||
insta::assert_json_snapshot!(spells.metadata(), @r###"
|
||||
{
|
||||
"uid": "dnd_spells",
|
||||
"primaryKey": "index",
|
||||
"createdAt": "[now]",
|
||||
"updatedAt": "[now]"
|
||||
"createdAt": "2023-01-30T16:25:58.876405Z",
|
||||
"updatedAt": "2023-01-30T16:25:59.079906Z"
|
||||
}
|
||||
"###);
|
||||
|
||||
|
@ -227,4 +227,14 @@ impl UpdateStatus {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finished_at(&self) -> Option<OffsetDateTime> {
|
||||
match self {
|
||||
UpdateStatus::Processing(_) => None,
|
||||
UpdateStatus::Enqueued(_) => None,
|
||||
UpdateStatus::Processed(u) => Some(u.processed_at),
|
||||
UpdateStatus::Aborted(_) => None,
|
||||
UpdateStatus::Failed(u) => Some(u.failed_at),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -19,18 +19,18 @@ one indexing operation.
|
||||
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::ffi::OsStr;
|
||||
use std::fmt;
|
||||
use std::fs::{self, File};
|
||||
use std::io::BufWriter;
|
||||
|
||||
use dump::IndexMetadata;
|
||||
use log::{debug, error, info};
|
||||
use log::{debug, error, info, trace};
|
||||
use meilisearch_types::error::Code;
|
||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::heed::CompactionOption;
|
||||
use meilisearch_types::milli::update::{
|
||||
DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
Settings as MilliSettings,
|
||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||
};
|
||||
use meilisearch_types::milli::{self, Filter, BEU32};
|
||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||
@ -43,7 +43,7 @@ use uuid::Uuid;
|
||||
|
||||
use crate::autobatcher::{self, BatchKind};
|
||||
use crate::utils::{self, swap_index_uid_in_task};
|
||||
use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
|
||||
use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
|
||||
|
||||
/// Represents a combination of tasks that can all be processed at the same time.
|
||||
///
|
||||
@ -199,6 +199,29 @@ impl Batch {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Batch {
|
||||
/// A textual representation of the batch, used when debugging the profiling reports.
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let index_uid = self.index_uid();
|
||||
let tasks = self.ids();
|
||||
match self {
|
||||
Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
|
||||
Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
|
||||
Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
|
||||
Batch::Dump(_) => f.write_str("Dump")?,
|
||||
Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
|
||||
Batch::IndexCreation { .. } => f.write_str("IndexCreation")?,
|
||||
Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?,
|
||||
Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?,
|
||||
Batch::IndexSwap { .. } => f.write_str("IndexSwap")?,
|
||||
};
|
||||
match index_uid {
|
||||
Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")),
|
||||
None => f.write_fmt(format_args!(" from tasks: {tasks:?}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexOperation {
|
||||
pub fn index_uid(&self) -> &str {
|
||||
match self {
|
||||
@ -213,6 +236,30 @@ impl IndexOperation {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for IndexOperation {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
IndexOperation::DocumentOperation { .. } => {
|
||||
f.write_str("IndexOperation::DocumentOperation")
|
||||
}
|
||||
IndexOperation::DocumentDeletion { .. } => {
|
||||
f.write_str("IndexOperation::DocumentDeletion")
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
||||
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
||||
}
|
||||
IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"),
|
||||
IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"),
|
||||
IndexOperation::DocumentClearAndSetting { .. } => {
|
||||
f.write_str("IndexOperation::DocumentClearAndSetting")
|
||||
}
|
||||
IndexOperation::SettingsAndDocumentOperation { .. } => {
|
||||
f.write_str("IndexOperation::SettingsAndDocumentOperation")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexScheduler {
|
||||
/// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`].
|
||||
///
|
||||
@ -581,7 +628,7 @@ impl IndexScheduler {
|
||||
self.breakpoint(crate::Breakpoint::InsideProcessBatch);
|
||||
}
|
||||
|
||||
puffin::profile_function!(format!("{:?}", batch));
|
||||
puffin::profile_function!(batch.to_string());
|
||||
|
||||
match batch {
|
||||
Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
|
||||
@ -848,7 +895,7 @@ impl IndexScheduler {
|
||||
})?;
|
||||
|
||||
// 4. Dump experimental feature settings
|
||||
let features = self.features()?.runtime_features();
|
||||
let features = self.features().runtime_features();
|
||||
dump.create_experimental_features(features)?;
|
||||
|
||||
let dump_uid = started_at.format(format_description!(
|
||||
@ -1143,7 +1190,7 @@ impl IndexScheduler {
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
@ -1190,7 +1237,8 @@ impl IndexScheduler {
|
||||
let (new_builder, user_result) =
|
||||
builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
let provided_ids =
|
||||
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
||||
task.details
|
||||
@ -1201,23 +1249,11 @@ impl IndexScheduler {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match user_result {
|
||||
Ok(count) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(count),
|
||||
});
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(0),
|
||||
});
|
||||
task.error = Some(milli::Error::from(e).into());
|
||||
}
|
||||
}
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(count),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1232,7 +1268,7 @@ impl IndexScheduler {
|
||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
builder.reset_primary_key();
|
||||
builder.execute(
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.clone().get(),
|
||||
)?;
|
||||
}
|
||||
@ -1240,21 +1276,42 @@ impl IndexScheduler {
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
|
||||
let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
|
||||
documents.iter().flatten().for_each(|id| {
|
||||
builder.delete_external_id(id);
|
||||
});
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
let must_stop_processing = self.must_stop_processing.clone();
|
||||
|
||||
let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
index_wtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
let document_ids = documents.iter().flatten().cloned().collect();
|
||||
|
||||
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
|
||||
for (task, documents) in tasks.iter_mut().zip(documents) {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids: documents.len(),
|
||||
deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
|
||||
deleted_documents: Some(count.min(documents.len() as u64)),
|
||||
});
|
||||
}
|
||||
|
||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||
let addition = builder.execute()?;
|
||||
info!("document deletion done: {:?}", addition);
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
@ -1266,7 +1323,13 @@ impl IndexScheduler {
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
|
||||
let deleted_documents = delete_document_by_filter(
|
||||
index_wtxn,
|
||||
filter,
|
||||
self.index_mapper.indexer_config(),
|
||||
self.must_stop_processing.clone(),
|
||||
index,
|
||||
);
|
||||
let original_filter = if let Some(Details::DocumentDeletionByFilter {
|
||||
original_filter,
|
||||
deleted_documents: _,
|
||||
@ -1500,6 +1563,8 @@ impl IndexScheduler {
|
||||
fn delete_document_by_filter<'a>(
|
||||
wtxn: &mut RwTxn<'a, '_>,
|
||||
filter: &serde_json::Value,
|
||||
indexer_config: &IndexerConfig,
|
||||
must_stop_processing: MustStopProcessing,
|
||||
index: &'a Index,
|
||||
) -> Result<u64> {
|
||||
let filter = Filter::from_json(filter)?;
|
||||
@ -1510,9 +1575,41 @@ fn delete_document_by_filter<'a>(
|
||||
}
|
||||
e => e.into(),
|
||||
})?;
|
||||
let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
|
||||
delete_operation.delete_documents(&candidates);
|
||||
delete_operation.execute().map(|result| result.deleted_documents)?
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
|
||||
// Since what we have is an iterator, it would be better to delete in chunks
|
||||
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
|
||||
external_documents_ids
|
||||
.find_external_id_of(wtxn, candidates)?
|
||||
.only_external_ids()
|
||||
.collect();
|
||||
let document_ids = match external_to_internal {
|
||||
Ok(external_ids) => external_ids,
|
||||
Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
|
||||
};
|
||||
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
wtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| debug!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
|
||||
let _ = builder.execute()?;
|
||||
|
||||
count
|
||||
} else {
|
||||
0
|
||||
})
|
||||
|
@ -1,6 +1,8 @@
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
|
||||
use meilisearch_types::heed::types::{SerdeJson, Str};
|
||||
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
|
||||
use meilisearch_types::heed::{Database, Env, RwTxn};
|
||||
|
||||
use crate::error::FeatureNotEnabledError;
|
||||
use crate::Result;
|
||||
@ -9,20 +11,19 @@ const EXPERIMENTAL_FEATURES: &str = "experimental-features";
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct FeatureData {
|
||||
runtime: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
|
||||
instance: InstanceTogglableFeatures,
|
||||
persisted: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
|
||||
runtime: Arc<RwLock<RuntimeTogglableFeatures>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RoFeatures {
|
||||
runtime: RuntimeTogglableFeatures,
|
||||
instance: InstanceTogglableFeatures,
|
||||
}
|
||||
|
||||
impl RoFeatures {
|
||||
fn new(txn: RoTxn<'_>, data: &FeatureData) -> Result<Self> {
|
||||
let runtime = data.runtime_features(txn)?;
|
||||
Ok(Self { runtime, instance: data.instance })
|
||||
fn new(data: &FeatureData) -> Self {
|
||||
let runtime = data.runtime_features();
|
||||
Self { runtime }
|
||||
}
|
||||
|
||||
pub fn runtime_features(&self) -> RuntimeTogglableFeatures {
|
||||
@ -43,13 +44,13 @@ impl RoFeatures {
|
||||
}
|
||||
|
||||
pub fn check_metrics(&self) -> Result<()> {
|
||||
if self.instance.metrics {
|
||||
if self.runtime.metrics {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Getting metrics",
|
||||
feature: "metrics",
|
||||
issue_link: "https://github.com/meilisearch/meilisearch/discussions/3518",
|
||||
issue_link: "https://github.com/meilisearch/product/discussions/625",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
@ -67,15 +68,36 @@ impl RoFeatures {
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_puffin(&self) -> Result<()> {
|
||||
if self.runtime.export_puffin_reports {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Outputting Puffin reports to disk",
|
||||
feature: "export puffin reports",
|
||||
issue_link: "https://github.com/meilisearch/product/discussions/693",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FeatureData {
|
||||
pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result<Self> {
|
||||
let mut wtxn = env.write_txn()?;
|
||||
let runtime_features = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
|
||||
let runtime_features_db = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
|
||||
wtxn.commit()?;
|
||||
|
||||
Ok(Self { runtime: runtime_features, instance: instance_features })
|
||||
let txn = env.read_txn()?;
|
||||
let persisted_features: RuntimeTogglableFeatures =
|
||||
runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default();
|
||||
let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures {
|
||||
metrics: instance_features.metrics || persisted_features.metrics,
|
||||
..persisted_features
|
||||
}));
|
||||
|
||||
Ok(Self { persisted: runtime_features_db, runtime })
|
||||
}
|
||||
|
||||
pub fn put_runtime_features(
|
||||
@ -83,16 +105,25 @@ impl FeatureData {
|
||||
mut wtxn: RwTxn,
|
||||
features: RuntimeTogglableFeatures,
|
||||
) -> Result<()> {
|
||||
self.runtime.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
|
||||
self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
|
||||
wtxn.commit()?;
|
||||
|
||||
// safe to unwrap, the lock will only fail if:
|
||||
// 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
|
||||
// 2. there's a panic while the lock is held -> it is only used for an assignment here.
|
||||
let mut toggled_features = self.runtime.write().unwrap();
|
||||
*toggled_features = features;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn runtime_features(&self, txn: RoTxn) -> Result<RuntimeTogglableFeatures> {
|
||||
Ok(self.runtime.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default())
|
||||
fn runtime_features(&self) -> RuntimeTogglableFeatures {
|
||||
// sound to unwrap, the lock will only fail if:
|
||||
// 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
|
||||
// 2. there's a panic while the lock is held -> it is only used for copying the data here
|
||||
*self.runtime.read().unwrap()
|
||||
}
|
||||
|
||||
pub fn features(&self, txn: RoTxn) -> Result<RoFeatures> {
|
||||
RoFeatures::new(txn, self)
|
||||
pub fn features(&self) -> RoFeatures {
|
||||
RoFeatures::new(self)
|
||||
}
|
||||
}
|
||||
|
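The `FeatureData` rework above moves runtime feature flags from an LMDB read on every call to an in-memory `Arc<RwLock<_>>` that is seeded from the persisted database at startup and updated on every write. A minimal, self-contained sketch of that pattern follows; the `Features` struct is a simplified stand-in for `RuntimeTogglableFeatures`, and the LMDB persistence step is omitted.

```rust
use std::sync::{Arc, RwLock};

// Stand-in for `RuntimeTogglableFeatures`: small, `Copy`, cheap to hand out by value.
#[derive(Debug, Default, Clone, Copy)]
struct Features {
    metrics: bool,
    export_puffin_reports: bool,
}

#[derive(Clone)]
struct FeatureData {
    // In the real code this sits next to the LMDB database that persists the flags.
    runtime: Arc<RwLock<Features>>,
}

impl FeatureData {
    fn new(persisted: Features, instance_metrics: bool) -> Self {
        // The instance-level flag is OR-ed with the persisted value, mirroring
        // `metrics: instance_features.metrics || persisted_features.metrics` above.
        let seeded = Features { metrics: instance_metrics || persisted.metrics, ..persisted };
        FeatureData { runtime: Arc::new(RwLock::new(seeded)) }
    }

    fn put_runtime_features(&self, features: Features) {
        // In Meilisearch the new value is written to LMDB first, then mirrored here.
        *self.runtime.write().unwrap() = features;
    }

    fn runtime_features(&self) -> Features {
        // `Features` is `Copy`, so readers get a snapshot and never hold the lock.
        *self.runtime.read().unwrap()
    }
}

fn main() {
    let data = FeatureData::new(Features::default(), /* --experimental-enable-metrics */ true);
    assert!(data.runtime_features().metrics);

    let mut updated = data.runtime_features();
    updated.export_puffin_reports = true;
    data.put_runtime_features(updated);
    assert!(data.runtime_features().export_puffin_reports);
}
```

Handing out copies instead of transactions is what allows `IndexScheduler::features()` to drop its `Result` return type in the hunks that follow.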
@ -30,6 +30,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
|
||||
index_mapper,
|
||||
features: _,
|
||||
max_number_of_tasks: _,
|
||||
puffin_frame: _,
|
||||
wake_up: _,
|
||||
dumps_path: _,
|
||||
snapshots_path: _,
|
||||
|
@ -33,6 +33,7 @@ pub type Result<T> = std::result::Result<T, Error>;
|
||||
pub type TaskId = u32;
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@ -52,6 +53,7 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
|
||||
use meilisearch_types::milli::update::IndexerConfig;
|
||||
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
|
||||
use puffin::FrameView;
|
||||
use roaring::RoaringBitmap;
|
||||
use synchronoise::SignalEvent;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
@ -314,6 +316,9 @@ pub struct IndexScheduler {
|
||||
/// the finished tasks automatically.
|
||||
pub(crate) max_number_of_tasks: usize,
|
||||
|
||||
/// A frame to output the indexation profiling files to disk.
|
||||
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
|
||||
|
||||
/// The path used to create the dumps.
|
||||
pub(crate) dumps_path: PathBuf,
|
||||
|
||||
@ -364,6 +369,7 @@ impl IndexScheduler {
|
||||
wake_up: self.wake_up.clone(),
|
||||
autobatching_enabled: self.autobatching_enabled,
|
||||
max_number_of_tasks: self.max_number_of_tasks,
|
||||
puffin_frame: self.puffin_frame.clone(),
|
||||
snapshots_path: self.snapshots_path.clone(),
|
||||
dumps_path: self.dumps_path.clone(),
|
||||
auth_path: self.auth_path.clone(),
|
||||
@ -457,6 +463,7 @@ impl IndexScheduler {
|
||||
env,
|
||||
// we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
|
||||
wake_up: Arc::new(SignalEvent::auto(true)),
|
||||
puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
|
||||
autobatching_enabled: options.autobatching_enabled,
|
||||
max_number_of_tasks: options.max_number_of_tasks,
|
||||
dumps_path: options.dumps_path,
|
||||
@ -572,17 +579,46 @@ impl IndexScheduler {
|
||||
run.wake_up.wait();
|
||||
|
||||
loop {
|
||||
let puffin_enabled = run.features().check_puffin().is_ok();
|
||||
puffin::set_scopes_on(puffin_enabled);
|
||||
puffin::GlobalProfiler::lock().new_frame();
|
||||
|
||||
match run.tick() {
|
||||
Ok(TickOutcome::TickAgain(_)) => (),
|
||||
Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(),
|
||||
Err(e) => {
|
||||
log::error!("{}", e);
|
||||
log::error!("{e}");
|
||||
// Wait one second when an irrecoverable error occurs.
|
||||
if !e.is_recoverable() {
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Let's write the previous frame to disk but only if
|
||||
// the user wanted to profile with puffin.
|
||||
if puffin_enabled {
|
||||
let mut frame_view = run.puffin_frame.lock();
|
||||
if !frame_view.is_empty() {
|
||||
let now = OffsetDateTime::now_utc();
|
||||
let mut file = match File::create(format!("{}.puffin", now)) {
|
||||
Ok(file) => file,
|
||||
Err(e) => {
|
||||
log::error!("{e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if let Err(e) = frame_view.save_to_writer(&mut file) {
|
||||
log::error!("{e}");
|
||||
}
|
||||
if let Err(e) = file.sync_all() {
|
||||
log::error!("{e}");
|
||||
}
|
||||
// We erase this frame view as it is no more useful. We want to
|
||||
// measure the new frames now that we exported the previous ones.
|
||||
*frame_view = FrameView::default();
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
@ -1062,8 +1098,6 @@ impl IndexScheduler {
|
||||
self.breakpoint(Breakpoint::Start);
|
||||
}
|
||||
|
||||
puffin::GlobalProfiler::lock().new_frame();
|
||||
|
||||
self.cleanup_task_queue()?;
|
||||
|
||||
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
|
||||
@ -1259,9 +1293,8 @@ impl IndexScheduler {
|
||||
Ok(IndexStats { is_indexing, inner_stats: index_stats })
|
||||
}
|
||||
|
||||
pub fn features(&self) -> Result<RoFeatures> {
|
||||
let rtxn = self.read_txn()?;
|
||||
self.features.features(rtxn)
|
||||
pub fn features(&self) -> RoFeatures {
|
||||
self.features.features()
|
||||
}
|
||||
|
||||
pub fn put_runtime_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {
|
||||
|
@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
|
||||
UserError::SerdeJson(_)
|
||||
| UserError::InvalidLmdbOpenOptions
|
||||
| UserError::DocumentLimitReached
|
||||
| UserError::AccessingSoftDeletedDocument { .. }
|
||||
| UserError::UnknownInternalDocumentId { .. } => Code::Internal,
|
||||
UserError::InvalidStoreFile => Code::InvalidStoreFile,
|
||||
UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
|
||||
|
@ -5,6 +5,8 @@ use serde::{Deserialize, Serialize};
|
||||
pub struct RuntimeTogglableFeatures {
|
||||
pub score_details: bool,
|
||||
pub vector_store: bool,
|
||||
pub metrics: bool,
|
||||
pub export_puffin_reports: bool,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy)]
|
||||
|
@ -69,8 +69,7 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
|
||||
pin-project-lite = "0.2.9"
|
||||
platform-dirs = "0.3.0"
|
||||
prometheus = { version = "0.13.3", features = ["process"] }
|
||||
puffin = "0.16.0"
|
||||
puffin_http = { version = "0.13.0", optional = true }
|
||||
puffin = { version = "0.16.0", features = ["serialization"] }
|
||||
rand = "0.8.5"
|
||||
rayon = "1.7.0"
|
||||
regex = "1.7.3"
|
||||
@ -135,7 +134,6 @@ zip = { version = "0.6.4", optional = true }
|
||||
[features]
|
||||
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
|
||||
analytics = ["segment"]
|
||||
profile-with-puffin = ["dep:puffin_http"]
|
||||
mini-dashboard = [
|
||||
"actix-web-static-files",
|
||||
"static-files",
|
||||
|
@ -114,10 +114,7 @@ pub fn create_app(
|
||||
.configure(routes::configure)
|
||||
.configure(|s| dashboard(s, enable_dashboard));
|
||||
|
||||
let app = app.wrap(actix_web::middleware::Condition::new(
|
||||
opt.experimental_enable_metrics,
|
||||
middleware::RouteMetrics,
|
||||
));
|
||||
let app = app.wrap(middleware::RouteMetrics);
|
||||
app.wrap(
|
||||
Cors::default()
|
||||
.send_wildcard()
|
||||
@ -365,7 +362,7 @@ fn import_dump(
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
},
|
||||
|indexing_step| log::debug!("update: {:?}", indexing_step),
|
||||
|indexing_step| log::trace!("update: {:?}", indexing_step),
|
||||
|| false,
|
||||
)?;
|
||||
|
||||
|
@ -30,10 +30,6 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let (opt, config_read_from) = Opt::try_build()?;
|
||||
|
||||
#[cfg(feature = "profile-with-puffin")]
|
||||
let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
|
||||
puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));
|
||||
|
||||
anyhow::ensure!(
|
||||
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
|
||||
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
|
||||
|
@@ -3,8 +3,10 @@
use std::future::{ready, Ready};

use actix_web::dev::{self, Service, ServiceRequest, ServiceResponse, Transform};
use actix_web::web::Data;
use actix_web::Error;
use futures_util::future::LocalBoxFuture;
use index_scheduler::IndexScheduler;
use prometheus::HistogramTimer;

pub struct RouteMetrics;
@@ -47,19 +49,27 @@ where

    fn call(&self, req: ServiceRequest) -> Self::Future {
        let mut histogram_timer: Option<HistogramTimer> = None;
        let request_path = req.path();
        let is_registered_resource = req.resource_map().has_resource(request_path);
        if is_registered_resource {
            let request_method = req.method().to_string();
            histogram_timer = Some(
                crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS

        // calling unwrap here is safe because index scheduler is added to app data while creating actix app.
        // also, the tests will fail if this is not present.
        let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
        let features = index_scheduler.features();

        if features.check_metrics().is_ok() {
            let request_path = req.path();
            let is_registered_resource = req.resource_map().has_resource(request_path);
            if is_registered_resource {
                let request_method = req.method().to_string();
                histogram_timer = Some(
                    crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
                        .with_label_values(&[&request_method, request_path])
                        .start_timer(),
                );
                crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
                    .with_label_values(&[&request_method, request_path])
                    .start_timer(),
            );
            crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
                .with_label_values(&[&request_method, request_path])
                    .inc();
            }
                .inc();
        }
        };

        let fut = self.service.call(req);
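A minimal sketch (not part of the diff) of the gating logic the new middleware applies: metrics are only recorded when the runtime `metrics` feature check passes and the path belongs to a registered route; otherwise the request passes through untouched. `FeatureFlags`, `is_registered`, and the `println!` are hypothetical stand-ins for the index scheduler features, the actix resource map lookup, and the prometheus histogram/counter.

// Hypothetical stand-ins to show the control flow of the new middleware.
struct FeatureFlags {
    metrics: bool,
}

impl FeatureFlags {
    fn check_metrics(&self) -> Result<(), &'static str> {
        if self.metrics { Ok(()) } else { Err("metrics feature disabled") }
    }
}

fn maybe_record(features: &FeatureFlags, is_registered: bool, method: &str, path: &str) {
    // Mirrors the hunk above: the runtime feature gate comes first,
    // then the "is this a real route?" check, then the actual recording.
    if features.check_metrics().is_ok() && is_registered {
        println!("record {method} {path}"); // stand-in for the prometheus histogram and counter
    }
}

fn main() {
    let features = FeatureFlags { metrics: true };
    maybe_record(&features, true, "GET", "/indexes/:index_uid/search");
}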
@@ -29,12 +29,12 @@ async fn get_features(
    >,
    req: HttpRequest,
    analytics: Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
    let features = index_scheduler.features()?;
) -> HttpResponse {
    let features = index_scheduler.features();

    analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
    debug!("returns: {:?}", features.runtime_features());
    Ok(HttpResponse::Ok().json(features.runtime_features()))
    HttpResponse::Ok().json(features.runtime_features())
}

#[derive(Debug, Deserr)]
@@ -44,6 +44,10 @@ pub struct RuntimeTogglableFeatures {
    pub score_details: Option<bool>,
    #[deserr(default)]
    pub vector_store: Option<bool>,
    #[deserr(default)]
    pub metrics: Option<bool>,
    #[deserr(default)]
    pub export_puffin_reports: Option<bool>,
}

async fn patch_features(
@@ -55,26 +59,36 @@ async fn patch_features(
    req: HttpRequest,
    analytics: Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
    let features = index_scheduler.features()?;
    let features = index_scheduler.features();

    let old_features = features.runtime_features();

    let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
        score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
        vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
        metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
        export_puffin_reports: new_features
            .0
            .export_puffin_reports
            .unwrap_or(old_features.export_puffin_reports),
    };

    // explicitly destructure for analytics rather than using the `Serialize` implementation, because
    // the it renames to camelCase, which we don't want for analytics.
    // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future.
    let meilisearch_types::features::RuntimeTogglableFeatures { score_details, vector_store } =
        new_features;
    let meilisearch_types::features::RuntimeTogglableFeatures {
        score_details,
        vector_store,
        metrics,
        export_puffin_reports,
    } = new_features;

    analytics.publish(
        "Experimental features Updated".to_string(),
        json!({
            "score_details": score_details,
            "vector_store": vector_store,
            "metrics": metrics,
            "export_puffin_reports": export_puffin_reports,
        }),
        Some(&req),
    );
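A small illustration (not from the diff) of the merge rule `patch_features` applies: every field of the PATCH body is an `Option<bool>`, and `None` keeps the previously stored value, so a request that only touches `metrics` leaves the other flags untouched. The struct names below are local to the example and stand in for the real types in `meilisearch_types::features`.

// Local example types; the real ones live in meilisearch_types::features.
#[derive(Debug, Clone, Copy, Default, PartialEq)]
struct Runtime {
    score_details: bool,
    vector_store: bool,
    metrics: bool,
    export_puffin_reports: bool,
}

#[derive(Debug, Default)]
struct Patch {
    score_details: Option<bool>,
    vector_store: Option<bool>,
    metrics: Option<bool>,
    export_puffin_reports: Option<bool>,
}

fn merge(old: Runtime, patch: Patch) -> Runtime {
    // Same rule as the hunk above: `unwrap_or` falls back to the stored value.
    Runtime {
        score_details: patch.score_details.unwrap_or(old.score_details),
        vector_store: patch.vector_store.unwrap_or(old.vector_store),
        metrics: patch.metrics.unwrap_or(old.metrics),
        export_puffin_reports: patch.export_puffin_reports.unwrap_or(old.export_puffin_reports),
    }
}

fn main() {
    let old = Runtime { vector_store: true, ..Default::default() };
    let new = merge(old, Patch { metrics: Some(true), ..Default::default() });
    assert_eq!(new, Runtime { vector_store: true, metrics: true, ..Default::default() });
}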
@@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
    let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();

    let internal_id = index
        .external_documents_ids(&txn)?
        .get(doc_id.as_bytes())
        .external_documents_ids()
        .get(&txn, doc_id)?
        .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;

    let document = index
@@ -68,7 +68,7 @@ pub async fn search(
    }

    let index = index_scheduler.index(&index_uid)?;
    let features = index_scheduler.features()?;
    let features = index_scheduler.features();
    let search_result = tokio::task::spawn_blocking(move || {
        perform_facet_search(&index, search_query, facet_query, facet_name, features)
    })
@@ -157,7 +157,7 @@ pub async fn search_with_url_query(
    let mut aggregate = SearchAggregator::from_query(&query, &req);

    let index = index_scheduler.index(&index_uid)?;
    let features = index_scheduler.features()?;
    let features = index_scheduler.features();
    let search_result =
        tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
    if let Ok(ref search_result) = search_result {
@@ -192,7 +192,7 @@ pub async fn search_with_post(

    let index = index_scheduler.index(&index_uid)?;

    let features = index_scheduler.features()?;
    let features = index_scheduler.features();
    let search_result =
        tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
    if let Ok(ref search_result) = search_result {
@@ -19,7 +19,7 @@ pub async fn get_metrics(
    index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
    auth_controller: Data<AuthController>,
) -> Result<HttpResponse, ResponseError> {
    index_scheduler.features()?.check_metrics()?;
    index_scheduler.features().check_metrics()?;
    let auth_filters = index_scheduler.filters();
    if !auth_filters.all_indexes_authorized() {
        let mut error = ResponseError::from(AuthenticationError::InvalidToken);
@@ -41,7 +41,7 @@ pub async fn multi_search_with_post(
    let queries = params.into_inner().queries;

    let mut multi_aggregate = MultiSearchAggregator::from_queries(&queries, &req);
    let features = index_scheduler.features()?;
    let features = index_scheduler.features();

    // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
    // so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code
@@ -2,10 +2,12 @@ use std::collections::{HashMap, HashSet};

use ::time::format_description::well_known::Rfc3339;
use maplit::{hashmap, hashset};
use meilisearch::Opt;
use once_cell::sync::Lazy;
use tempfile::TempDir;
use time::{Duration, OffsetDateTime};

use crate::common::{Server, Value};
use crate::common::{default_settings, Server, Value};
use crate::json;

pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
@@ -195,7 +197,9 @@ async fn access_authorized_master_key() {

#[actix_rt::test]
async fn access_authorized_restricted_index() {
    let mut server = Server::new_auth().await;
    let dir = TempDir::new().unwrap();
    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
    let mut server = Server::new_auth_with_options(enable_metrics, dir).await;
    for ((method, route), actions) in AUTHORIZATIONS.iter() {
        for action in actions {
            // create a new API key letting only the needed action.
@@ -202,6 +202,10 @@ impl Server {
    pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
        self.service.patch("/experimental-features", value).await
    }

    pub async fn get_metrics(&self) -> (Value, StatusCode) {
        self.service.get("/metrics").await
    }
}

pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
@@ -221,7 +225,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
            skip_index_budget: true,
            ..Parser::parse_from(None as Option<&str>)
        },
        experimental_enable_metrics: true,
        experimental_enable_metrics: false,
        ..Parser::parse_from(None as Option<&str>)
    }
}
@@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
        "canceledBy": null,
        "details": {
            "providedIds": 0,
            "deletedDocuments": 4,
            "deletedDocuments": 2,
            "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
        },
        "error": null,
@@ -1,4 +1,7 @@
use crate::common::Server;
use meilisearch::Opt;
use tempfile::TempDir;

use crate::common::{default_settings, Server};
use crate::json;

/// Feature name to test against.
@@ -16,7 +19,9 @@ async fn experimental_features() {
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": false
      "vectorStore": false,
      "metrics": false,
      "exportPuffinReports": false
    }
    "###);

@@ -26,7 +31,9 @@ async fn experimental_features() {
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": true
      "vectorStore": true,
      "metrics": false,
      "exportPuffinReports": false
    }
    "###);

@@ -36,7 +43,9 @@ async fn experimental_features() {
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": true
      "vectorStore": true,
      "metrics": false,
      "exportPuffinReports": false
    }
    "###);

@@ -47,7 +56,9 @@ async fn experimental_features() {
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": true
      "vectorStore": true,
      "metrics": false,
      "exportPuffinReports": false
    }
    "###);

@@ -58,11 +69,73 @@ async fn experimental_features() {
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": true
      "vectorStore": true,
      "metrics": false,
      "exportPuffinReports": false
    }
    "###);
}

#[actix_rt::test]
async fn experimental_feature_metrics() {
    // instance flag for metrics enables metrics at startup
    let dir = TempDir::new().unwrap();
    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
    let server = Server::new_with_options(enable_metrics).await.unwrap();

    let (response, code) = server.get_features().await;

    meili_snap::snapshot!(code, @"200 OK");
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "scoreDetails": false,
      "vectorStore": false,
      "metrics": true,
      "exportPuffinReports": false
    }
    "###);

    let (response, code) = server.get_metrics().await;
    meili_snap::snapshot!(code, @"200 OK");

    // metrics are not returned in json format
    // so the test server will return null
    meili_snap::snapshot!(response, @"null");

    // disabling metrics results in invalid request
    let (response, code) = server.set_features(json!({"metrics": false})).await;
    meili_snap::snapshot!(code, @"200 OK");
    meili_snap::snapshot!(response["metrics"], @"false");

    let (response, code) = server.get_metrics().await;
    meili_snap::snapshot!(code, @"400 Bad Request");
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "message": "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625",
      "code": "feature_not_enabled",
      "type": "invalid_request",
      "link": "https://docs.meilisearch.com/errors#feature_not_enabled"
    }
    "###);

    // enabling metrics via HTTP results in valid request
    let (response, code) = server.set_features(json!({"metrics": true})).await;
    meili_snap::snapshot!(code, @"200 OK");
    meili_snap::snapshot!(response["metrics"], @"true");

    let (response, code) = server.get_metrics().await;
    meili_snap::snapshot!(code, @"200 OK");
    meili_snap::snapshot!(response, @"null");

    // startup without flag respects persisted metrics value
    let disable_metrics =
        Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) };
    let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap();
    let (response, code) = server_no_flag.get_metrics().await;
    meili_snap::snapshot!(code, @"200 OK");
    meili_snap::snapshot!(response, @"null");
}

#[actix_rt::test]
async fn errors() {
    let server = Server::new().await;
@@ -73,7 +146,7 @@ async fn errors() {
    meili_snap::snapshot!(code, @"400 Bad Request");
    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`",
      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`",
      "code": "bad_request",
      "type": "invalid_request",
      "link": "https://docs.meilisearch.com/errors#bad_request"
meilisearch/tests/search/distinct.rs (new file, 63 lines)
@@ -0,0 +1,63 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;

use crate::common::{Server, Value};
use crate::json;

pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
    json!([
        {"productId": 1, "shopId": 1},
        {"productId": 2, "shopId": 1},
        {"productId": 3, "shopId": 2},
        {"productId": 4, "shopId": 2},
        {"productId": 5, "shopId": 3},
        {"productId": 6, "shopId": 3},
        {"productId": 7, "shopId": 4},
        {"productId": 8, "shopId": 4},
        {"productId": 9, "shopId": 5},
        {"productId": 10, "shopId": 5}
    ])
});

pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";

/// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test]
async fn distinct_search_with_offset_no_ranking() {
    let server = Server::new().await;
    let index = server.index("test");

    let documents = DOCUMENTS.clone();
    index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
    index.wait_task(1).await;

    fn get_hits(Value(response): Value) -> Vec<i64> {
        let hits_array = response["hits"].as_array().unwrap();
        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
    }

    let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"2");
    snapshot!(format!("{:?}", hits), @"[1, 2]");

    let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"2");
    snapshot!(format!("{:?}", hits), @"[3, 4]");

    let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"1");
    snapshot!(format!("{:?}", hits), @"[5]");

    let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"0");
}
@@ -1,6 +1,7 @@
// This modules contains all the test concerning search. Each particular feature of the search
// should be tested in its own module to isolate tests and keep the tests readable.

mod distinct;
mod errors;
mod facet_search;
mod formatted;
@@ -816,7 +817,7 @@ async fn experimental_feature_score_details() {
            },
            "proximity": {
              "order": 2,
              "score": 0.875
              "score": 0.75
            },
            "attribute": {
              "order": 3,
@@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.4.4", default-features = false, features = [
    "tempfile",
grenad = { version = "0.4.5", default-features = false, features = [
    "rayon", "tempfile"
] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
    "lmdb", "read-txn-no-tls"
@@ -1,4 +1,5 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};

use obkv::KvReader;
@@ -19,14 +20,14 @@ use crate::FieldId;
pub struct EnrichedDocumentsBatchReader<R> {
    documents: DocumentsBatchReader<R>,
    primary_key: String,
    external_ids: grenad::ReaderCursor<File>,
    external_ids: grenad::ReaderCursor<BufReader<File>>,
}

impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
    pub fn new(
        documents: DocumentsBatchReader<R>,
        primary_key: String,
        external_ids: grenad::Reader<File>,
        external_ids: grenad::Reader<BufReader<File>>,
    ) -> Result<Self, Error> {
        if documents.documents_count() as u64 == external_ids.len() {
            Ok(EnrichedDocumentsBatchReader {
@@ -75,7 +76,7 @@ pub struct EnrichedDocument<'a> {
pub struct EnrichedDocumentsBatchCursor<R> {
    documents: DocumentsBatchCursor<R>,
    primary_key: String,
    external_ids: grenad::ReaderCursor<File>,
    external_ids: grenad::ReaderCursor<BufReader<File>>,
}

impl<R> EnrichedDocumentsBatchCursor<R> {
@@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {

#[derive(Error, Debug)]
pub enum UserError {
    #[error("A soft deleted internal document id have been used: `{document_id}`.")]
    AccessingSoftDeletedDocument { document_id: DocumentId },
    #[error("A document cannot contain more than 65,535 fields.")]
    AttributeLimitReached,
    #[error(transparent)]
@ -1,159 +1,146 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::{fmt, str};
|
||||
|
||||
use fst::map::IndexedValue;
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use heed::types::{OwnedType, Str};
|
||||
use heed::{Database, RoIter, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
const DELETED_ID: u64 = u64::MAX;
|
||||
use crate::{DocumentId, BEU32};
|
||||
|
||||
pub struct ExternalDocumentsIds<'a> {
|
||||
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
|
||||
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
|
||||
soft_deleted_docids: RoaringBitmap,
|
||||
pub enum DocumentOperationKind {
|
||||
Create,
|
||||
Delete,
|
||||
}
|
||||
|
||||
impl<'a> ExternalDocumentsIds<'a> {
|
||||
pub fn new(
|
||||
hard: fst::Map<Cow<'a, [u8]>>,
|
||||
soft: fst::Map<Cow<'a, [u8]>>,
|
||||
soft_deleted_docids: RoaringBitmap,
|
||||
) -> ExternalDocumentsIds<'a> {
|
||||
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
|
||||
}
|
||||
pub struct DocumentOperation {
|
||||
pub external_id: String,
|
||||
pub internal_id: DocumentId,
|
||||
pub kind: DocumentOperationKind,
|
||||
}
|
||||
|
||||
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
|
||||
ExternalDocumentsIds {
|
||||
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
||||
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
|
||||
soft_deleted_docids: self.soft_deleted_docids,
|
||||
}
|
||||
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
|
||||
|
||||
impl ExternalDocumentsIds {
|
||||
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
|
||||
ExternalDocumentsIds(db)
|
||||
}
|
||||
|
||||
/// Returns `true` if hard and soft external documents lists are empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.hard.is_empty() && self.soft.is_empty()
|
||||
pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
|
||||
self.0.is_empty(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
|
||||
let external_id = external_id.as_ref();
|
||||
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
|
||||
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
|
||||
Some(id.try_into().unwrap())
|
||||
}
|
||||
_otherwise => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
|
||||
/// don't contain any soft deleted document id.
|
||||
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, docids)) = iter.next() {
|
||||
// prefer selecting the ids from soft, always
|
||||
let id = indexed_last_value(docids).unwrap();
|
||||
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
|
||||
new_hard_builder.insert(external_id, id)?;
|
||||
}
|
||||
}
|
||||
drop(iter);
|
||||
|
||||
// Delete soft map completely
|
||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||
// We save the new map as the new hard map.
|
||||
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
|
||||
let union_op = self.soft.op().add(other).r#union();
|
||||
|
||||
let mut new_soft_builder = fst::MapBuilder::memory();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let id = indexed_last_value(marked_docids).unwrap();
|
||||
new_soft_builder.insert(external_id, id)?;
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
// We save the new map as the new soft map.
|
||||
self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
|
||||
self.merge_soft_into_hard()
|
||||
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
|
||||
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
|
||||
}
|
||||
|
||||
/// An helper function to debug this type, returns an `HashMap` of both,
|
||||
/// soft and hard fst maps, combined.
|
||||
pub fn to_hash_map(&self) -> HashMap<String, u32> {
|
||||
let mut map = HashMap::new();
|
||||
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
let mut iter = union_op.into_stream();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let id = indexed_last_value(marked_docids).unwrap();
|
||||
if id != DELETED_ID {
|
||||
let external_id = str::from_utf8(external_id).unwrap();
|
||||
map.insert(external_id.to_owned(), id.try_into().unwrap());
|
||||
}
|
||||
pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
|
||||
let mut map = HashMap::default();
|
||||
for result in self.0.iter(rtxn)? {
|
||||
let (external, internal) = result?;
|
||||
map.insert(external.to_owned(), internal.get());
|
||||
}
|
||||
|
||||
map
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
/// Return an fst of the combined hard and soft deleted ID.
|
||||
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
|
||||
if self.soft.is_empty() {
|
||||
return Ok(Cow::Borrowed(&self.hard));
|
||||
}
|
||||
let union_op = self.hard.op().add(&self.soft).r#union();
|
||||
|
||||
let mut iter = union_op.into_stream();
|
||||
let mut new_hard_builder = fst::MapBuilder::memory();
|
||||
while let Some((external_id, marked_docids)) = iter.next() {
|
||||
let value = indexed_last_value(marked_docids).unwrap();
|
||||
if value != DELETED_ID {
|
||||
new_hard_builder.insert(external_id, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
drop(iter);
|
||||
|
||||
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
|
||||
/// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
|
||||
/// these internal ids and their external id.
|
||||
///
|
||||
/// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
|
||||
/// where the returned values can be:
|
||||
/// - `Ok((external_id, internal_id))`: if a mapping was found
|
||||
/// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
|
||||
/// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
|
||||
/// the entire fst.
|
||||
pub fn find_external_id_of<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
internal_ids: RoaringBitmap,
|
||||
) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
|
||||
self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
|
||||
}
|
||||
|
||||
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
|
||||
if self.soft.len() >= self.hard.len() / 2 {
|
||||
self.hard = self.to_fst()?.into_owned();
|
||||
self.soft = fst::Map::default().map_data(Cow::Owned)?;
|
||||
/// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
|
||||
///
|
||||
/// If the list contains multiple operations on the same external id, then the result is unspecified.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If attempting to delete a document that doesn't exist
|
||||
/// - If attempting to create a document that already exists
|
||||
pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
|
||||
for DocumentOperation { external_id, internal_id, kind } in operations {
|
||||
match kind {
|
||||
DocumentOperationKind::Create => {
|
||||
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
|
||||
}
|
||||
DocumentOperationKind::Delete => {
|
||||
if !self.0.delete(wtxn, &external_id)? {
|
||||
panic!("Attempting to delete a non-existing document")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ExternalDocumentsIds<'_> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
|
||||
/// Returns an iterator over all the external ids.
|
||||
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
|
||||
self.0.iter(rtxn)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ExternalDocumentsIds<'static> {
|
||||
fn default() -> Self {
|
||||
ExternalDocumentsIds {
|
||||
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
||||
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
|
||||
soft_deleted_docids: RoaringBitmap::new(),
|
||||
/// An iterator over mappings between requested internal ids and external ids.
|
||||
///
|
||||
/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
|
||||
pub struct ExternalToInternalOwnedIterator<'t> {
|
||||
iter: RoIter<'t, Str, OwnedType<BEU32>>,
|
||||
internal_ids: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
|
||||
/// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
|
||||
type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// if all requested ids were found, we won't find any other, so short-circuit
|
||||
if self.internal_ids.is_empty() {
|
||||
return None;
|
||||
}
|
||||
loop {
|
||||
let (external, internal) = match self.iter.next() {
|
||||
Some(Ok((external, internal))) => (external, internal),
|
||||
// TODO manage this better, remove panic
|
||||
Some(Err(e)) => panic!("{}", e),
|
||||
_ => {
|
||||
// we exhausted the stream but we still have some internal ids to find
|
||||
let remaining_ids = std::mem::take(&mut self.internal_ids);
|
||||
return Some(Err(remaining_ids));
|
||||
// note: next calls to `next` will return `None` since we replaced the internal_ids
|
||||
// with the default empty bitmap
|
||||
}
|
||||
};
|
||||
let internal = internal.get();
|
||||
let was_contained = self.internal_ids.remove(internal);
|
||||
if was_contained {
|
||||
return Some(Ok((external, internal)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the value of the `IndexedValue` with the highest _index_.
|
||||
fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
|
||||
indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
|
||||
impl<'t> ExternalToInternalOwnedIterator<'t> {
|
||||
/// Returns the bitmap of internal ids whose external id are yet to be found
|
||||
pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
|
||||
&self.internal_ids
|
||||
}
|
||||
|
||||
/// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
|
||||
///
|
||||
/// Use this when you don't need the mapping between the external and the internal ids.
|
||||
pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
|
||||
self.map(|res| res.map(|(external, _internal)| external.to_owned()))
|
||||
}
|
||||
}
|
||||
|
@@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;

use crate::heed_codec::BytesDecodeOwned;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};

/// This is the limit where using a byteorder became less size efficient
/// than using a direct roaring encoding, it is also the point where we are able
@@ -99,6 +100,33 @@ impl CboRoaringBitmapCodec {

        Ok(())
    }

    /// Merges a DelAdd delta into a CboRoaringBitmap.
    pub fn merge_deladd_into<'a>(
        deladd: KvReaderDelAdd<'_>,
        previous: &[u8],
        buffer: &'a mut Vec<u8>,
    ) -> io::Result<Option<&'a [u8]>> {
        // Deserialize the bitmap that is already there
        let mut previous = Self::deserialize_from(previous)?;

        // Remove integers we no more want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Deletion) {
            previous -= Self::deserialize_from(value)?;
        }

        // Insert the new integers we want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Addition) {
            previous |= Self::deserialize_from(value)?;
        }

        if previous.is_empty() {
            return Ok(None);
        }

        Self::serialize_into(&previous, buffer);
        Ok(Some(&buffer[..]))
    }
}

impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
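To make the delta semantics above concrete, here is a minimal sketch (not from the diff) of the same merge expressed directly on `roaring::RoaringBitmap`s: deletions are subtracted first, additions are OR-ed in afterwards, and an empty result signals that the entry can be dropped entirely. The function and variable names are local to the example.

use roaring::RoaringBitmap;

/// Applies a deletion/addition delta to an existing bitmap.
/// Returns `None` when the merged bitmap ends up empty (the entry can be dropped).
fn merge_deladd(
    previous: &RoaringBitmap,
    deletion: Option<&RoaringBitmap>,
    addition: Option<&RoaringBitmap>,
) -> Option<RoaringBitmap> {
    let mut merged = previous.clone();
    if let Some(del) = deletion {
        merged -= del; // drop the ids we no longer want
    }
    if let Some(add) = addition {
        merged |= add; // insert the new ids
    }
    if merged.is_empty() { None } else { Some(merged) }
}

fn main() {
    let previous: RoaringBitmap = (1..=5).collect();
    let del: RoaringBitmap = [2u32, 3].into_iter().collect();
    let add: RoaringBitmap = [9u32].into_iter().collect();
    let merged = merge_deladd(&previous, Some(&del), Some(&add)).unwrap();
    assert_eq!(merged.iter().collect::<Vec<_>>(), vec![1, 4, 5, 9]);
}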
(File diff suppressed because it is too large.)
@@ -2,7 +2,7 @@ use std::cmp;

use crate::{relative_from_absolute_position, Position};

pub const MAX_DISTANCE: u32 = 8;
pub const MAX_DISTANCE: u32 = 4;

pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs <= rhs {
@@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
/// The documents returned by the iterator are grouped by the facet values that
/// determined their rank. For example, given the documents:
///
/// ```ignore
/// ```text
/// 0: { "colour": ["blue", "green"] }
/// 1: { "colour": ["blue", "red"] }
/// 2: { "colour": ["orange", "red"] }
@@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
/// ```
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
/// over the following elements:
/// ```ignore
/// ```text
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
/// [3] // same for "green"
/// [2] // same for "orange"
@@ -223,12 +223,9 @@ impl<'a> Filter<'a> {
impl<'a> Filter<'a> {
    pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
        // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
        let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
        let filterable_fields = index.filterable_fields(rtxn)?;

        // and finally we delete all the soft_deleted_documents, again, only once at the very end
        self.inner_evaluate(rtxn, index, &filterable_fields)
            .map(|result| result - soft_deleted_documents)
    }

    fn evaluate_operator(
@@ -46,18 +46,27 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    if let Some(distinct_fid) = distinct_fid {
        let mut excluded = RoaringBitmap::new();
        let mut results = vec![];
        let mut skip = 0;
        for docid in universe.iter() {
            if results.len() >= from + length {
            if results.len() >= length {
                break;
            }
            if excluded.contains(docid) {
                continue;
            }

            distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
            skip += 1;
            if skip <= from {
                continue;
            }

            results.push(docid);
        }

        let mut all_candidates = universe - excluded;
        all_candidates.extend(results.iter().copied());

        return Ok(BucketSortOutput {
            scores: vec![Default::default(); results.len()],
            docids: results,
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use super::ProximityCondition;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::SearchContext;
|
||||
@ -35,7 +36,7 @@ pub fn build_edges(
|
||||
}
|
||||
|
||||
let mut conditions = vec![];
|
||||
for cost in right_ngram_max..(7 + right_ngram_max) {
|
||||
for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
|
||||
conditions.push((
|
||||
cost as u32,
|
||||
conditions_interner.insert(ProximityCondition::Uninit {
|
||||
@ -47,7 +48,7 @@ pub fn build_edges(
|
||||
}
|
||||
|
||||
conditions.push((
|
||||
(7 + right_ngram_max) as u32,
|
||||
((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
|
||||
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||
));
|
||||
|
||||
|
@ -273,7 +273,7 @@ fn test_proximity_simple() {
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("the quick brown fox jumps over the lazy dog");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 5, 2, 3, 0, 1]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
@ -282,11 +282,11 @@ fn test_proximity_simple() {
|
||||
"\"the quickbrown fox jumps over the lazy dog\"",
|
||||
"\"the really quick brown fox jumps over the lazy dog\"",
|
||||
"\"the really quick brown fox jumps over the very lazy dog\"",
|
||||
"\"brown quick fox jumps over the lazy dog\"",
|
||||
"\"the quick brown fox jumps over the lazy. dog\"",
|
||||
"\"dog the quick brown fox jumps over the lazy\"",
|
||||
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
|
||||
"\"brown quick fox jumps over the lazy dog\"",
|
||||
"\"the. quick brown fox jumps over the lazy. dog\"",
|
||||
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best s");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
@ -382,9 +382,9 @@ fn test_proximity_prefix_db() {
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal of summer\"",
|
||||
"\"summer x best\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"summer x y best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
@ -396,7 +396,7 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best su");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
@ -406,10 +406,10 @@ fn test_proximity_prefix_db() {
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal of summer\"",
|
||||
"\"summer x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"summer x y best\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
"###);
|
||||
@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best win");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"winter best\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"winter x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"this is the best winter meal\"",
|
||||
"\"winter x y best\"",
|
||||
"\"winter x best\"",
|
||||
"\"winter best\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
@ -447,7 +447,7 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best wint");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
@ -457,10 +457,10 @@ fn test_proximity_prefix_db() {
|
||||
"\"winter best\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"winter x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
@ -471,7 +471,7 @@ fn test_proximity_prefix_db() {
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best wi");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
@ -481,9 +481,9 @@ fn test_proximity_prefix_db() {
|
||||
"\"winter best\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"winter x best\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
]
|
||||
"###);
|
||||
|
@ -68,8 +68,8 @@ fn test_trap_basic() {
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
@ -82,8 +82,8 @@ fn test_trap_basic() {
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
|
@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 35,
|
||||
max_rank: 57,
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 35,
|
||||
max_rank: 57,
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 35,
|
||||
max_rank: 57,
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 35,
|
||||
max_rank: 57,
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -101,8 +101,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -127,8 +127,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -153,8 +153,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -179,8 +179,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 22,
|
||||
rank: 9,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -205,8 +205,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 17,
|
||||
max_rank: 22,
|
||||
rank: 5,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -231,8 +231,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 17,
|
||||
max_rank: 22,
|
||||
rank: 5,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -3,59 +3,35 @@ source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -63,7 +39,31 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,40 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 8,
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -47,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -55,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -63,7 +55,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,40 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 8,
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -47,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -55,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -63,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -71,7 +63,15 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -3,59 +3,35 @@ source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -63,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -71,7 +47,31 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,8 +6,32 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -15,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -23,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -31,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -39,31 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,24 +6,24 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -31,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -39,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 8,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 8,
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 8,
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 15,
|
||||
rank: 4,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 50,
|
||||
max_rank: 50,
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -24,132 +24,6 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 50,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 49,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 49,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 48,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 41,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 40,
|
||||
max_rank: 50,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 43,
|
||||
max_rank: 43,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 36,
|
||||
max_rank: 36,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 31,
|
||||
max_rank: 36,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
@ -160,14 +34,126 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 21,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 17,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 16,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 16,
|
||||
max_rank: 16,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 16,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -180,8 +166,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -194,8 +180,22 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 8,
|
||||
max_rank: 8,
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 43,
|
||||
max_rank: 43,
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 43,
|
||||
max_rank: 43,
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 42,
|
||||
max_rank: 43,
|
||||
rank: 18,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 42,
|
||||
max_rank: 43,
|
||||
rank: 18,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 41,
|
||||
max_rank: 43,
|
||||
rank: 17,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 34,
|
||||
max_rank: 43,
|
||||
rank: 14,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 33,
|
||||
max_rank: 43,
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 36,
|
||||
max_rank: 36,
|
||||
rank: 16,
|
||||
max_rank: 16,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 29,
|
||||
max_rank: 29,
|
||||
rank: 13,
|
||||
max_rank: 13,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 24,
|
||||
max_rank: 29,
|
||||
rank: 10,
|
||||
max_rank: 13,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 55,
|
||||
max_rank: 57,
|
||||
rank: 23,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 54,
|
||||
max_rank: 57,
|
||||
rank: 22,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 53,
|
||||
max_rank: 57,
|
||||
rank: 21,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 52,
|
||||
max_rank: 57,
|
||||
rank: 20,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 57,
|
||||
rank: 20,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 48,
|
||||
max_rank: 57,
|
||||
rank: 19,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 47,
|
||||
max_rank: 57,
|
||||
rank: 19,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 57,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 50,
|
||||
max_rank: 50,
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 43,
|
||||
max_rank: 43,
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 38,
|
||||
max_rank: 43,
|
||||
rank: 16,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 29,
|
||||
max_rank: 29,
|
||||
rank: 13,
|
||||
max_rank: 13,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 22,
|
||||
max_rank: 22,
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 15,
|
||||
rank: 7,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 55,
|
||||
max_rank: 57,
|
||||
rank: 23,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 54,
|
||||
max_rank: 57,
|
||||
rank: 22,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 54,
|
||||
max_rank: 57,
|
||||
rank: 22,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 54,
|
||||
max_rank: 57,
|
||||
rank: 22,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 53,
|
||||
max_rank: 57,
|
||||
rank: 21,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 53,
|
||||
max_rank: 57,
|
||||
rank: 21,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 52,
|
||||
max_rank: 57,
|
||||
rank: 20,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 47,
|
||||
max_rank: 57,
|
||||
rank: 18,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 45,
|
||||
max_rank: 57,
|
||||
rank: 18,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 57,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 47,
|
||||
max_rank: 50,
|
||||
rank: 19,
|
||||
max_rank: 22,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 40,
|
||||
max_rank: 43,
|
||||
rank: 16,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 35,
|
||||
max_rank: 43,
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -222,8 +222,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 26,
|
||||
max_rank: 29,
|
||||
rank: 10,
|
||||
max_rank: 13,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -236,8 +236,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 22,
|
||||
rank: 7,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -250,8 +250,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 22,
|
||||
rank: 7,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -264,8 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 22,
|
||||
rank: 7,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 15,
|
||||
rank: 5,
|
||||
max_rank: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@ -6,88 +6,88 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 57,
|
||||
max_rank: 57,
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 56,
|
||||
max_rank: 57,
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 55,
|
||||
max_rank: 57,
|
||||
rank: 23,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 54,
|
||||
max_rank: 57,
|
||||
rank: 22,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 53,
|
||||
max_rank: 57,
|
||||
rank: 21,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 52,
|
||||
max_rank: 57,
|
||||
rank: 20,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 57,
|
||||
rank: 20,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 48,
|
||||
max_rank: 57,
|
||||
rank: 19,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 47,
|
||||
max_rank: 57,
|
||||
rank: 19,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
@ -95,7 +95,7 @@ expression: "format!(\"{document_scores:#?}\")"
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 57,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
|
@@ -259,8 +259,8 @@ fn test_ignore_stop_words() {
),
Proximity(
Rank {
rank: 7,
max_rank: 8,
rank: 3,
max_rank: 4,
},
),
Fid(
@@ -411,8 +411,8 @@ fn test_stop_words_in_phrase() {
),
Proximity(
Rank {
rank: 6,
max_rank: 8,
rank: 2,
max_rank: 4,
},
),
Fid(
@@ -277,7 +277,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

// 7 is better than 6 because of the proximity between "the" and its surrounding terms
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -289,10 +289,10 @@ fn test_words_proximity_tms_last_simple() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@@ -312,7 +312,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

// 10 is better than 9 because of the proximity between "quick" and "brown"
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 15, 16, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -326,8 +326,8 @@ fn test_words_proximity_tms_last_simple() {
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@@ -427,7 +427,7 @@ fn test_words_tms_all() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -439,10 +439,10 @@ fn test_words_tms_all() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
]
"###);
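These hunks are inline `insta` snapshot updates: the expected value after the `@` marker is rewritten when the behaviour changes, and the diff records the old and new expectations side by side. A small self-contained example of the mechanism (plain `insta`, outside milli; the crate and `cargo insta review` workflow are assumptions about the tooling, not part of this changeset):

```rust
// Cargo.toml (assumed): insta = "1"
#[test]
fn snapshot_roundtrip() {
    let document_ids = vec![9, 21, 14, 17];
    // Running the test compares against the inline value after `@`; when the
    // expectation changes, `cargo insta review` (or INSTA_UPDATE=always)
    // rewrites it in place, which is exactly what the hunks above record.
    insta::assert_snapshot!(format!("{document_ids:?}"), @"[9, 21, 14, 17]");
}
```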
@@ -4,9 +4,8 @@ use std::path::Path;

use roaring::RoaringBitmap;

use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
use crate::{make_db_snap_from_iter, obkv_to_json, Index};

#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
@@ -98,7 +97,6 @@ Create a snapshot test of the given database.
- `facet_id_string_docids`
- `documents_ids`
- `stop_words`
- `soft_deleted_documents_ids`
- `field_distribution`
- `fields_ids_map`
- `geo_faceted_documents_ids`
@ -308,12 +306,6 @@ pub fn snap_stop_words(index: &Index) -> String {
|
||||
let snap = format!("{stop_words:?}");
|
||||
snap
|
||||
}
|
||||
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();
|
||||
|
||||
display_bitmap(&soft_deleted_documents_ids)
|
||||
}
|
||||
pub fn snap_field_distributions(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let mut snap = String::new();
|
||||
@ -340,50 +332,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
|
||||
}
|
||||
pub fn snap_external_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();
|
||||
let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
|
||||
// ensure fixed order (not guaranteed by hashmap)
|
||||
let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
|
||||
external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
|
||||
|
||||
let mut snap = String::new();
|
||||
|
||||
writeln!(&mut snap, "soft:").unwrap();
|
||||
let stream_soft = soft.stream();
|
||||
let soft_external_ids = stream_soft.into_str_vec().unwrap();
|
||||
for (key, id) in soft_external_ids {
|
||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
||||
}
|
||||
writeln!(&mut snap, "hard:").unwrap();
|
||||
let stream_hard = hard.stream();
|
||||
let hard_external_ids = stream_hard.into_str_vec().unwrap();
|
||||
for (key, id) in hard_external_ids {
|
||||
writeln!(&mut snap, "docids:").unwrap();
|
||||
for (key, id) in external_ids {
|
||||
writeln!(&mut snap, "{key:<24} {id}").unwrap();
|
||||
}
|
||||
|
||||
snap
|
||||
}
|
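The rewritten `snap_external_documents_ids` above switches from dumping the soft/hard FSTs to materialising the mapping with `to_hash_map` and sorting it, so the snapshot stays deterministic. A self-contained sketch of that sort-then-print pattern, with a plain `HashMap` standing in for the index accessor:

```rust
use std::collections::HashMap;
use std::fmt::Write as _;

fn snap_docids(external_ids: HashMap<String, u32>) -> String {
    // ensure fixed order (not guaranteed by hashmap)
    let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
    external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));

    let mut snap = String::new();
    writeln!(&mut snap, "docids:").unwrap();
    for (key, id) in external_ids {
        writeln!(&mut snap, "{key:<24} {id}").unwrap();
    }
    snap
}

fn main() {
    let ids = HashMap::from([("b".to_string(), 2), ("a".to_string(), 1)]);
    print!("{}", snap_docids(ids)); // "a" always printed before "b"
}
```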
||||
pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let number_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
|
||||
let mut snap = String::new();
|
||||
for field_id in fields_ids_map.ids() {
|
||||
let string_faceted_documents_ids =
|
||||
index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
|
||||
writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
|
||||
.unwrap();
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snap_words_fst(index: &Index) -> String {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let words_fst = index.words_fst(&rtxn).unwrap();
|
||||
@ -516,9 +479,6 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, stop_words) => {{
|
||||
$crate::snapshot_tests::snap_stop_words(&$index)
|
||||
}};
|
||||
($index:ident, soft_deleted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, field_distribution) => {{
|
||||
$crate::snapshot_tests::snap_field_distributions(&$index)
|
||||
}};
|
||||
@ -531,12 +491,6 @@ macro_rules! full_snap_of_db {
|
||||
($index:ident, external_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_external_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, number_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, string_faceted_documents_ids) => {{
|
||||
$crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
|
||||
}};
|
||||
($index:ident, words_fst) => {{
|
||||
$crate::snapshot_tests::snap_words_fst(&$index)
|
||||
}};
|
||||
|
@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds {
|
||||
}
|
||||
|
||||
impl AvailableDocumentsIds {
|
||||
pub fn from_documents_ids(
|
||||
docids: &RoaringBitmap,
|
||||
soft_deleted_docids: &RoaringBitmap,
|
||||
) -> AvailableDocumentsIds {
|
||||
let used_docids = docids | soft_deleted_docids;
|
||||
|
||||
match used_docids.max() {
|
||||
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
available -= used_docids;
|
||||
available -= docids;
|
||||
|
||||
let iter = match last_id.checked_add(1) {
|
||||
Some(id) => id..=u32::max_value(),
|
||||
@ -50,7 +45,7 @@ mod tests {
|
||||
#[test]
|
||||
fn empty() {
|
||||
let base = RoaringBitmap::new();
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let right = 0..=u32::max_value();
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
@ -63,28 +58,8 @@ mod tests {
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn soft_deleted() {
|
||||
let mut base = RoaringBitmap::new();
|
||||
base.insert(0);
|
||||
base.insert(10);
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let mut soft_deleted = RoaringBitmap::new();
|
||||
soft_deleted.insert(1);
|
||||
soft_deleted.insert(11);
|
||||
soft_deleted.insert(101);
|
||||
soft_deleted.insert(406);
|
||||
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
|
||||
let right =
|
||||
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
}
|
||||
|
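With soft deletion gone, `AvailableDocumentsIds::from_documents_ids` above only needs the used `docids`: an id is available when it is absent from that bitmap, and every id above the current maximum follows. A self-contained sketch of that iteration with `roaring` (illustrative names, not milli's types):

```rust
use roaring::RoaringBitmap;

/// Every id not present in `docids`, in increasing order.
fn available_ids(docids: &RoaringBitmap) -> impl Iterator<Item = u32> {
    // Holes below (or equal to) the current maximum.
    let holes = match docids.max() {
        Some(last_id) => &RoaringBitmap::from_iter(0..=last_id) - docids,
        None => RoaringBitmap::new(),
    };
    // Ids strictly above the current maximum (empty when max() is u32::MAX).
    let tail = match docids.max().map_or(Some(0), |id| id.checked_add(1)) {
        Some(start) => start..=u32::MAX,
        None => 1..=0, // an empty range
    };
    holes.into_iter().chain(tail)
}

fn main() {
    let mut used = RoaringBitmap::new();
    used.insert(0);
    used.insert(2);
    let first: Vec<u32> = available_ids(&used).take(4).collect();
    assert_eq!(first, vec![1, 3, 4, 5]);
}
```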
@ -1,8 +1,7 @@
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
|
||||
use crate::{FieldDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'u, 'i> {
|
||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||
@ -21,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
let Index {
|
||||
env: _env,
|
||||
main: _main,
|
||||
external_documents_ids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
@ -51,36 +51,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
|
||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
self.index.delete_vector_hnsw(self.wtxn)?;
|
||||
|
||||
// We clean all the faceted documents ids.
|
||||
for field_id in faceted_fields {
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::Number,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
self.index.put_faceted_documents_ids(
|
||||
self.wtxn,
|
||||
field_id,
|
||||
FacetType::String,
|
||||
&empty_roaring,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
@ -140,7 +122,7 @@ mod tests {
|
||||
|
||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
||||
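The `ClearDocuments` hunk above keeps the exhaustive destructuring of `Index` (every database named, no `..`), which is what forces this method to be revisited whenever a new database such as `external_documents_ids` is added. A tiny sketch of that pattern with a made-up struct, not milli's real `Index`:

```rust
// Hypothetical two-database index, for illustration only.
struct Index {
    word_docids: Vec<u32>,
    exact_word_docids: Vec<u32>,
}

fn clear_documents(index: &mut Index) {
    // No `..` in the pattern: adding a field to `Index` later makes this
    // function stop compiling until the new database is cleared here too.
    let Index { word_docids, exact_word_docids } = index;
    word_docids.clear();
    exact_word_docids.clear();
}

fn main() {
    let mut index = Index { word_docids: vec![1, 2], exact_word_docids: vec![3] };
    clear_documents(&mut index);
    assert!(index.word_docids.is_empty() && index.exact_word_docids.is_empty());
}
```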
|
File diff suppressed because it is too large
@ -1,9 +1,9 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::{BytesEncode, Error, RoTxn, RwTxn};
|
||||
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
@ -12,17 +12,15 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
|
||||
/// by rebuilding the database "from scratch".
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
///
|
||||
/// Finally, the `faceted_documents_ids` value in the main database of `Index`
|
||||
/// is updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
group_size: u8,
|
||||
@ -30,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
new_data: Option<grenad::Reader<File>>,
|
||||
delta_data: Option<grenad::Reader<BufReader<File>>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
@ -38,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
@ -48,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
new_data: Some(new_data),
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
@ -63,13 +61,13 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data: None,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => index
|
||||
@ -80,12 +78,9 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
|
||||
index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
|
||||
Ok(())
|
||||
})?;
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@ -94,26 +89,19 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
pub new_data: Option<grenad::Reader<R>>,
|
||||
pub delta_data: Option<grenad::Reader<R>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(
|
||||
mut self,
|
||||
wtxn: &mut RwTxn,
|
||||
field_ids: &[u16],
|
||||
mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
handle_all_docids(wtxn, field_id, all_docids)?;
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
@ -133,20 +121,26 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// TODO the new_data is a Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
let new_data = match self.new_data.take() {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
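Both branches of `update_level0` above write the same value layout: one leading byte holding the level-0 group size (always 1), followed by the encoded docids bitmap. A self-contained sketch of that layout, using plain `roaring` serialization in place of milli's internal `CboRoaringBitmapCodec`:

```rust
use roaring::RoaringBitmap;

fn level0_value(docids: &RoaringBitmap) -> Vec<u8> {
    let mut buffer = Vec::new();
    // the group size for level 0
    buffer.push(1u8);
    // then the serialized docids bitmap
    docids.serialize_into(&mut buffer).unwrap();
    buffer
}

fn main() {
    let docids: RoaringBitmap = (0..3).collect();
    let value = level0_value(&docids);
    assert_eq!(value[0], 1);
    println!("{} bytes", value.len());
}
```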
@ -158,11 +152,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<ByteSlice, ByteSlice>();
|
||||
|
||||
let mut cursor = new_data.into_cursor()?;
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
@ -170,17 +167,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
// then we extend the buffer with the docids bitmap
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_into(
|
||||
&[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
|
||||
&mut buffer,
|
||||
)?;
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
let new_bitmap = &buffer[1..];
|
||||
// if the new bitmap is empty, let's remove it
|
||||
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||
database.delete(wtxn, key)?;
|
||||
} else {
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
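When the database is not empty, the hunk above merges the stored bitmap with the del/add pair and deletes the key outright when the result becomes empty. A self-contained sketch of that merge (the semantics assumed here for `merge_deladd_into`, expressed with plain `roaring` set operations):

```rust
use roaring::RoaringBitmap;

/// Returns the new bitmap for a facet entry, or `None` when the entry
/// should be removed from the database instead of stored empty.
fn apply_del_add(
    previous: &RoaringBitmap,
    del: &RoaringBitmap,
    add: &RoaringBitmap,
) -> Option<RoaringBitmap> {
    let merged = (previous - del) | add;
    if merged.is_empty() {
        None
    } else {
        Some(merged)
    }
}

fn main() {
    let previous: RoaringBitmap = (0..4).collect();
    let del: RoaringBitmap = (0..4).collect();
    let add = RoaringBitmap::new();
    // Everything was deleted: the caller removes the key, mirroring the
    // `database.delete(wtxn, key)?` call in the hunk above.
    assert_eq!(apply_del_add(&previous, &del, &add), None);
}
```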
||||
@ -189,16 +196,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn,
|
||||
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
|
||||
let mut all_docids = RoaringBitmap::new();
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
|
||||
for bitmap in bitmaps {
|
||||
all_docids |= bitmap;
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok((subwriters, all_docids))
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
@ -261,7 +262,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<Vec<grenad::Reader<File>>> {
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
if level == 0 {
|
||||
self.read_level_0(rtxn, field_id, handle_group)?;
|
||||
// Level 0 is already in the database
|
||||
@ -492,7 +493,6 @@ mod tests {
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -1,360 +0,0 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use heed::RwTxn;
|
||||
use log::debug;
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
///
|
||||
/// Depending on the number of removed elements and the existing size of the database, we use either
|
||||
/// a bulk delete method or an incremental delete method.
|
||||
pub struct FacetsDelete<'i, 'b> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
||||
docids_to_delete: &'b RoaringBitmap,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
}
|
||||
impl<'i, 'b> FacetsDelete<'i, 'b> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
|
||||
docids_to_delete: &'b RoaringBitmap,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
|
||||
}
|
||||
};
|
||||
Self {
|
||||
index,
|
||||
database,
|
||||
facet_type,
|
||||
affected_facet_values,
|
||||
docids_to_delete,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
for (field_id, affected_facet_values) in self.affected_facet_values {
|
||||
// This is an incorrect condition, since we assume that the length of the database is equal
|
||||
// to the number of facet values for the given field_id. It means that in some cases, we might
|
||||
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
|
||||
// really be a performance problem is when we fully delete a large ratio of all facet values for
|
||||
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
|
||||
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
|
||||
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
|
||||
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
|
||||
// Bulk delete
|
||||
let mut modified = false;
|
||||
|
||||
for facet_value in affected_facet_values {
|
||||
let key =
|
||||
FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
|
||||
let mut old = self.database.get(wtxn, &key)?.unwrap();
|
||||
let previous_len = old.bitmap.len();
|
||||
old.bitmap -= self.docids_to_delete;
|
||||
if old.bitmap.is_empty() {
|
||||
modified = true;
|
||||
self.database.delete(wtxn, &key)?;
|
||||
} else if old.bitmap.len() != previous_len {
|
||||
modified = true;
|
||||
self.database.put(wtxn, &key, &old)?;
|
||||
}
|
||||
}
|
||||
if modified {
|
||||
let builder = FacetsUpdateBulk::new_not_updating_level_0(
|
||||
self.index,
|
||||
vec![field_id],
|
||||
self.facet_type,
|
||||
);
|
||||
builder.execute(wtxn)?;
|
||||
}
|
||||
} else {
|
||||
// Incremental
|
||||
let inc = FacetsUpdateIncrementalInner {
|
||||
db: self.database,
|
||||
group_size: self.group_size,
|
||||
min_level_size: self.min_level_size,
|
||||
max_group_size: self.max_group_size,
|
||||
};
|
||||
for facet_value in affected_facet_values {
|
||||
inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
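The comment in `FacetsDelete::execute` above motivates the threshold used to pick between the two strategies: roughly a 70x worst-case slowdown was measured for the incremental path, doubled to 150x as a safety margin because the database length only approximates the number of facet values. A tiny illustrative helper (not part of milli) for that decision rule:

```rust
/// Prefer the bulk rebuild once the affected facet values make up at least
/// 1/150th of the database, i.e. (70x worst case) * (2x safety margin).
fn use_bulk_delete(affected_facet_values: u64, database_len: u64) -> bool {
    affected_facet_values >= database_len / 150
}

fn main() {
    // With 1000 entries in the DB, small deletions stay incremental and the
    // bulk rebuild kicks in from 6 affected values (1000 / 150 = 6, integer division).
    assert!(!use_bulk_delete(3, 1000));
    assert!(use_bulk_delete(100, 1000));
}
```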
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::SeedableRng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::db_snap;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::facet::test_helpers::ordered_string;
|
||||
use crate::update::{DeleteDocuments, DeletionStrategy};
|
||||
|
||||
#[test]
|
||||
fn delete_mixed_incremental_and_bulk() {
|
||||
// The point of this test is to create an index populated with documents
|
||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": i / 10,
|
||||
"colour": i / 100,
|
||||
"timestamp": i / 2,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
|
||||
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
||||
// by deleting the first 100 documents, we expect that:
|
||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
|
||||
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
|
||||
}
|
||||
|
||||
// Same test as above but working with string values for the facets
|
||||
#[test]
|
||||
fn delete_mixed_incremental_and_bulk_string() {
|
||||
// The point of this test is to create an index populated with documents
|
||||
// containing different filterable attributes. Then, we delete a bunch of documents
|
||||
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": ordered_string(i / 10),
|
||||
"colour": ordered_string(i / 100),
|
||||
"timestamp": ordered_string(i / 2),
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
|
||||
// by deleting the first 100 documents, we expect that:
|
||||
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
|
||||
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
|
||||
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
|
||||
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
|
||||
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_almost_all_incrementally_string() {
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(
|
||||
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
|
||||
);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"label": ordered_string(i / 10),
|
||||
"colour": ordered_string(i / 100),
|
||||
"timestamp": ordered_string(i / 2),
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
|
||||
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
|
||||
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
|
||||
|
||||
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
|
||||
|
||||
let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
|
||||
docids_to_delete.shuffle(&mut rng);
|
||||
for docid in docids_to_delete.into_iter().take(990) {
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_documents(&RoaringBitmap::from_iter([docid]));
|
||||
builder.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
|
||||
db_snap!(index, string_faceted_documents_ids, 2, @r###"
|
||||
0 []
|
||||
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
2 [292, 324, 358, 381, 493, 839, 852, ]
|
||||
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
|
||||
"###);
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
use std::iter::once;
|
||||
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::update::facet::test_helpers::FacetIndex;
|
||||
|
||||
// This is a simple test to get an intuition on the relative speed
|
||||
// of the incremental vs. bulk indexer.
|
||||
//
|
||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
||||
// each facet value contains only one document ID.
|
||||
//
|
||||
// In that scenario, it appears that the incremental indexer is about 70 times slower than the
|
||||
// bulk indexer.
|
||||
// #[test]
|
||||
fn benchmark_facet_indexing_delete() {
|
||||
let mut r = rand::thread_rng();
|
||||
|
||||
for i in 1..=20 {
|
||||
let size = 50_000 * i;
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..size {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
let timer = std::time::Instant::now();
|
||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!("bulk {size} : {time_spent}ms");
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let timer = std::time::Instant::now();
|
||||
//
|
||||
// delete one document
|
||||
//
|
||||
for _ in 0..nbr_doc {
|
||||
let deleted_u32 = r.gen::<u32>() % size;
|
||||
let deleted_f64 = deleted_u32 as f64;
|
||||
index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
|
||||
}
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!(" delete {nbr_doc} : {time_spent}ms");
|
||||
txn.abort().unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -1,8 +1,9 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use heed::types::{ByteSlice, DecodeIgnore};
|
||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||
use obkv::KvReader;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::facet::FacetType;
|
||||
@ -11,8 +12,9 @@ use crate::heed_codec::facet::{
|
||||
};
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
enum InsertionResult {
|
||||
InPlace,
|
||||
@ -27,27 +29,21 @@ enum DeletionResult {
|
||||
|
||||
/// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases.
///
/// The `faceted_documents_ids` value in the main database of `Index`
/// is also updated to contain the new set of faceted documents.
|
||||
pub struct FacetsUpdateIncremental<'i> {
|
||||
index: &'i Index,
|
||||
pub struct FacetsUpdateIncremental {
|
||||
inner: FacetsUpdateIncrementalInner,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateIncremental<'i> {
|
||||
impl FacetsUpdateIncremental {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
index: &Index,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
max_group_size: u8,
|
||||
) -> Self {
|
||||
FacetsUpdateIncremental {
|
||||
index,
|
||||
inner: FacetsUpdateIncrementalInner {
|
||||
db: match facet_type {
|
||||
FacetType::String => index
|
||||
@ -61,31 +57,41 @@ impl<'i> FacetsUpdateIncremental<'i> {
|
||||
max_group_size,
|
||||
min_level_size,
|
||||
},
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
|
||||
let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
|
||||
|
||||
let mut cursor = self.new_data.into_cursor()?;
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||
let mut cursor = self.delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
|
||||
.ok_or(heed::Error::Encoding)?;
|
||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
|
||||
*new_faceted_docids.entry(key.field_id).or_default() |= docids;
|
||||
let value = KvReader::new(value);
|
||||
|
||||
let docids_to_delete = value
|
||||
.get(DelAdd::Deletion)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
let docids_to_add = value
|
||||
.get(DelAdd::Addition)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.ok_or(heed::Error::Encoding));
|
||||
|
||||
if let Some(docids_to_delete) = docids_to_delete {
|
||||
let docids_to_delete = docids_to_delete?;
|
||||
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
|
||||
}
|
||||
|
||||
if let Some(docids_to_add) = docids_to_add {
|
||||
let docids_to_add = docids_to_add?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
|
||||
}
|
||||
}
|
||||
|
||||
for (field_id, new_docids) in new_faceted_docids {
|
||||
let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
|
||||
docids |= new_docids;
|
||||
self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
The algorithms that implement these queries are found in the `src/search/facet` folder.

To make these queries fast to compute, the database adopts a tree structure:
```ignore
```text
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
│Level 2│ │ │ │ │
@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
[`FacetGroupValue`], which have the following format:

```ignore
```text
FacetGroupKey:
- field id : u16
- level : u8
|
||||
@ -78,6 +78,7 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
@ -97,7 +98,6 @@ use crate::update::merge_btreeset_string;
|
||||
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod delete;
|
||||
pub mod incremental;
|
||||
|
||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
@ -108,14 +108,17 @@ pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
new_data: grenad::Reader<File>,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
|
||||
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => index
|
||||
.facet_id_string_docids
|
||||
@ -131,26 +134,26 @@ impl<'i> FacetsUpdate<'i> {
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
new_data,
|
||||
delta_data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
if self.new_data.is_empty() {
|
||||
if self.delta_data.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
@ -159,7 +162,7 @@ impl<'i> FacetsUpdate<'i> {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.new_data,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
@ -275,6 +278,7 @@ pub(crate) mod test_helpers {
|
||||
use crate::heed_codec::ByteSliceRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
@ -451,20 +455,22 @@ pub(crate) mod test_helpers {
|
||||
let key: FacetGroupKey<&[u8]> =
|
||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||
let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
|
||||
let mut inner_writer = KvWriterDelAdd::memory();
|
||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||
writer.insert(&key, &value).unwrap();
|
||||
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
new_data: Some(reader),
|
||||
delta_data: Some(reader),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
||||
update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
|
||||
update.update(wtxn, field_ids).unwrap();
|
||||
}
|
||||
|
||||
pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
|
||||
@ -552,101 +558,6 @@ pub(crate) mod test_helpers {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
|
||||
use crate::db_snap;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::DeletionStrategy;
|
||||
|
||||
#[test]
|
||||
fn replace_all_identical_soft_deletion_then_hard_deletion() {
|
||||
let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_filterable_fields(hashset! { S("size") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
|
||||
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
|
||||
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..999 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
"other": 0,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
|
||||
|
||||
// Then replace the last document while disabling soft_deletion
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
let mut documents = vec![];
|
||||
for i in 999..1000 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i,
|
||||
"size": i % 250,
|
||||
"other": 0,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
|
||||
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
|
||||
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
|
@ -1,4 +1,4 @@
|
||||
use std::io::{Read, Seek};
|
||||
use std::io::{BufWriter, Read, Seek};
|
||||
use std::result::Result as StdResult;
|
||||
use std::{fmt, iter};
|
||||
|
||||
@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
|
||||
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||
|
||||
// The primary key *field id* that has already been set for this index or the one
|
||||
|
@ -1,6 +1,7 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
@ -29,7 +30,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
@ -55,7 +56,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
|
||||
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
// iterate over documents.
|
||||
@ -114,6 +115,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let (add_obkv, add_script_language_word_count) = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::new(del_obkv),
|
||||
@ -121,8 +123,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write them into the sorter.
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::new(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
@ -150,8 +152,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||
.map(|reader| (reader, script_language_docids))
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
@ -195,7 +198,7 @@ fn tokenizer_builder<'a>(
|
||||
}
|
||||
|
||||
if let Some(script_language) = script_language {
|
||||
tokenizer_builder.allow_list(&script_language);
|
||||
tokenizer_builder.allow_list(script_language);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
@ -203,6 +206,7 @@ fn tokenizer_builder<'a>(
|
||||
|
||||
/// Extract words mapped with their positions in a document,
/// ensuring no Language detection mistakes were made.
|
||||
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
|
||||
fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
@ -217,9 +221,9 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
let mut script_language_word_count = HashMap::new();
|
||||
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
buffers,
|
||||
@ -244,8 +248,8 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words,
|
||||
dictionary,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
Some(&script_language),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
@ -254,7 +258,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
|
||||
// rerun the extraction.
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
@ -265,6 +269,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
// returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
|
||||
Ok((&buffers.obkv_buffer, script_language_word_count))
|
||||
}
|
||||
|
||||
@ -330,6 +335,7 @@ fn tokens_from_document<'a>(
|
||||
}
|
||||
}
|
||||
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
@ -20,7 +20,7 @@ use crate::Result;
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, str};
|
||||
|
||||
use heed::BytesEncode;
|
||||
@ -18,7 +19,7 @@ use crate::{FieldId, Result};
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
@ -29,11 +29,11 @@ const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
@ -102,11 +102,11 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
|
||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
|
||||
|
@ -1,14 +1,15 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
@ -22,14 +23,14 @@ const MAX_COUNTED_WORDS: usize = 30;
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@ -37,18 +38,52 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
|
||||
if word_count <= MAX_COUNTED_WORDS {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert deleted word count in sorter if it exists.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert added word count in sorter if it exists.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,11 +1,12 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
@ -18,7 +19,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
(lat_fid, lng_fid): (FieldId, FieldId),
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut writer = create_writer(
|
||||
@ -30,39 +31,71 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
// since we only needs the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let document_id = obkv.get(primary_key_id).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// first we get the two fields
|
||||
let lat = obkv.get(lat_fid);
|
||||
let lng = obkv.get(lng_fid);
|
||||
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
|
||||
(Some(lat), Some(lng)) => {
|
||||
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
|
||||
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
|
||||
|
||||
if let Some((lat, lng)) = lat.zip(lng) {
|
||||
// then we extract the values
|
||||
let lat = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||
// then we extract the values
|
||||
let del_lat_lng = deladd_lat_obkv
|
||||
.get(DelAdd::Deletion)
|
||||
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
|
||||
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||
.transpose()?;
|
||||
let add_lat_lng = deladd_lat_obkv
|
||||
.get(DelAdd::Addition)
|
||||
.zip(deladd_lng_obkv.get(DelAdd::Addition))
|
||||
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
|
||||
.transpose()?;
|
||||
|
||||
let lng = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
} else if lat.is_none() && lng.is_some() {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() })?;
|
||||
} else if lat.is_some() && lng.is_none() {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
||||
}
|
||||
(Some(_), None) => {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, None) => (),
|
||||
}
|
||||
// else => the _geo object was `null`, there is nothing to do
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extract the finite floats lat and lng from two byte slices.
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;

let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;

Ok([lat, lng])
}
|
||||
|
@ -1,13 +1,24 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::TryFrom;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use itertools::EitherOrBoth;
|
||||
use ordered_float::OrderedFloat;
|
||||
use serde_json::{from_slice, Value};
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::UserError;
|
||||
use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::try_split_at;
|
||||
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
@ -16,9 +27,8 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
vectors_fid: FieldId,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let mut writer = create_writer(
|
||||
@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// since we only needs the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let document_id = obkv.get(primary_key_id).unwrap();
|
||||
from_slice(document_id).unwrap()
|
||||
};
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
// first we retrieve the _vectors field
|
||||
if let Some(vectors) = obkv.get(vectors_fid) {
|
||||
// extract the vectors
|
||||
let vectors = match from_slice(vectors) {
|
||||
Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
|
||||
Err(_) => {
|
||||
return Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
|
||||
}
|
||||
.into())
|
||||
}
|
||||
};
|
||||
if let Some(value) = obkv.get(vectors_fid) {
|
||||
let vectors_obkv = KvReaderDelAdd::new(value);
|
||||
|
||||
if let Some(vectors) = vectors {
|
||||
for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
let index = u16::try_from(i).unwrap();
|
||||
let mut key = docid_bytes.to_vec();
|
||||
key.extend_from_slice(&index.to_be_bytes());
|
||||
let bytes = cast_slice(&vector);
|
||||
writer.insert(key, bytes)?;
|
||||
}
|
||||
}
|
||||
// then we extract the values
|
||||
let del_vectors = vectors_obkv
|
||||
.get(DelAdd::Deletion)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
let add_vectors = vectors_obkv
|
||||
.get(DelAdd::Addition)
|
||||
.map(|vectors| extract_vectors(vectors, document_id))
|
||||
.transpose()?
|
||||
.flatten();
|
||||
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
&mut writer,
|
||||
&mut key_buffer,
|
||||
del_vectors.unwrap_or_default(),
|
||||
add_vectors.unwrap_or_default(),
|
||||
)?;
|
||||
}
|
||||
// else => the `_vectors` object was `null`, there is nothing to do
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Computes the diff between the Del and Add vectors and
/// only inserts the parts that differ into the writer.
|
||||
fn push_vectors_diff(
|
||||
writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_vectors: Vec<Vec<f32>>,
|
||||
mut add_vectors: Vec<Vec<f32>>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the vectors
|
||||
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
let merged_vectors_iter =
|
||||
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by appending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(vector) => {
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
EitherOrBoth::Right(vector) => {
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to add all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderingFloat helper.
|
||||
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||
}
|
||||
|
||||
/// Extracts the vectors from a JSON value.
|
||||
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
|
||||
match from_slice(value) {
|
||||
Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
|
||||
Err(_) => Err(UserError::InvalidVectorsType {
|
||||
document_id: document_id(),
|
||||
value: from_slice(value).map_err(InternalError::SerdeJson)?,
|
||||
}
|
||||
.into()),
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::BytesDecode;
|
||||
use obkv::KvReaderU16;
|
||||
@ -28,7 +28,11 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
exact_attributes: &HashSet<FieldId>,
|
||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
@ -53,17 +57,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(&value);
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
|
||||
for (_pos, word) in KvReaderU16::new(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::new(&addition).iter() {
|
||||
for (_pos, word) in KvReaderU16::new(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
@ -118,9 +122,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
// every word contained in an attribute set to exact must be pushed in the exact_words list.
|
||||
if exact_attributes.contains(&fid) {
|
||||
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
|
||||
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
} else {
|
||||
word_docids_sorter.insert(word.as_bytes(), &value)?;
|
||||
word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
}
|
||||
}
|
||||
|
||||
@ -165,7 +169,7 @@ fn words_into_sorter(
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&word_bytes);
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
|
@ -1,5 +1,6 @@
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
@ -22,13 +23,12 @@ use crate::{DocumentId, Result};
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.into_iter()
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
@ -74,7 +74,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||
@ -103,7 +103,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||
@ -169,7 +169,7 @@ fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
@ -200,7 +200,7 @@ fn document_word_positions_into_sorter(
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(*prox as u8);
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
@ -22,7 +22,7 @@ use crate::{bucketed_position, DocumentId, Result};
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
@ -60,7 +60,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(&value);
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||
|
@ -11,6 +11,7 @@ mod extract_word_position_docids;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use log::debug;
|
||||
@ -27,8 +28,8 @@ use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{
|
||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
|
||||
MergeableReader,
|
||||
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
|
||||
MergeFn, MergeableReader,
|
||||
};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::{FieldId, Result};
|
||||
@ -37,8 +38,8 @@ use crate::{FieldId, Result};
|
||||
/// Sends data in grenad files over the provided Sender.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: Option<HashSet<FieldId>>,
|
||||
@ -62,7 +63,6 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
vectors_field_id,
|
||||
primary_key_id,
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
@ -107,7 +107,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-exists-docids");
|
||||
match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||
}
|
||||
@ -123,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-null-docids");
|
||||
match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||
}
|
||||
@ -139,7 +139,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!("merge {} database", "facet-id-is-empty-docids");
|
||||
match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
|
||||
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||
}
|
||||
@ -150,36 +150,40 @@ pub(crate) fn data_from_obkv_documents(
|
||||
});
|
||||
}
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordcountDocids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
|
||||
Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>,
|
||||
>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
@ -190,32 +194,32 @@ pub(crate) fn data_from_obkv_documents(
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_strings_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_numbers_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx,
|
||||
extract_facet_number_docids,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
@ -265,11 +269,10 @@ fn spawn_extraction_task<FE, FS, M>(
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<File>>,
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
vectors_field_id: Option<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -278,12 +281,7 @@ fn send_original_documents_data(
|
||||
let documents_chunk_cloned = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
let result = extract_vector_points(
|
||||
documents_chunk_cloned,
|
||||
indexer,
|
||||
primary_key_id,
|
||||
vectors_field_id,
|
||||
);
|
||||
let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
|
||||
let _ = match result {
|
||||
Ok(vector_points) => {
|
||||
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
|
||||
@ -307,7 +305,7 @@ fn send_original_documents_data(
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<File>>,
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
@ -324,7 +322,10 @@ fn send_and_extract_flattened_documents_data(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
),
|
||||
),
|
||||
),
|
||||
)> {
|
||||
@ -347,7 +348,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
||||
let (docid_word_positions_chunk, script_language_pair) =
|
||||
extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
@ -358,9 +359,6 @@ fn send_and_extract_flattened_documents_data(
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send documents_ids to DB writer
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek};
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
use std::time::Instant;
|
||||
|
||||
use grenad::{CompressionType, Sorter};
|
||||
@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
|
||||
typ: grenad::CompressionType,
|
||||
level: Option<u32>,
|
||||
file: R,
|
||||
) -> grenad::Writer<R> {
|
||||
) -> grenad::Writer<BufWriter<R>> {
|
||||
let mut builder = grenad::Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
builder.compression_level(level);
|
||||
}
|
||||
builder.build(file)
|
||||
builder.build(BufWriter::new(file))
|
||||
}
|
||||
|
||||
pub fn create_sorter(
|
||||
@ -47,13 +47,14 @@ pub fn create_sorter(
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(true);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn sorter_into_reader(
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<File>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
@ -65,16 +66,18 @@ pub fn sorter_into_reader(
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
|
||||
let mut file = writer.into_inner()?;
|
||||
pub fn writer_into_reader(
|
||||
writer: grenad::Writer<BufWriter<File>>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
file.rewind()?;
|
||||
grenad::Reader::new(file).map_err(Into::into)
|
||||
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub unsafe fn as_cloneable_grenad(
|
||||
reader: &grenad::Reader<File>,
|
||||
reader: &grenad::Reader<BufReader<File>>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.get_ref();
|
||||
let file = reader.get_ref().get_ref();
|
||||
let mmap = memmap2::Mmap::map(file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
@ -90,8 +93,8 @@ where
|
||||
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<grenad::Reader<File>> {
|
||||
type Output = grenad::Reader<File>;
|
||||
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
|
||||
type Output = grenad::Reader<BufReader<File>>;
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut merger = MergerBuilder::new(merge_fn);
|
||||
@ -100,8 +103,8 @@ impl MergeableReader for Vec<grenad::Reader<File>> {
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
type Output = (grenad::Reader<File>, grenad::Reader<File>);
|
||||
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
@ -114,8 +117,18 @@ impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||
}
|
||||
}
|
||||

-impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
-   type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);
+impl MergeableReader
+   for Vec<(
+       grenad::Reader<BufReader<File>>,
+       grenad::Reader<BufReader<File>>,
+       grenad::Reader<BufReader<File>>,
+   )>
+{
+   type Output = (
+       grenad::Reader<BufReader<File>>,
+       grenad::Reader<BufReader<File>>,
+       grenad::Reader<BufReader<File>>,
+   );

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut m1 = MergerBuilder::new(merge_fn);
@@ -142,7 +155,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
        Ok(())
    }

-   fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
+   fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
        let merger = self.0.build();
        let mut writer = create_writer(
            params.chunk_compression_type,
@@ -193,7 +206,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
    reader: grenad::Reader<R>,
    indexer: GrenadParameters,
    documents_chunk_size: usize,
-) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
+) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
    let mut continue_reading = true;
    let mut cursor = reader.into_cursor()?;

@@ -210,11 +223,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
    );

    while let Some((document_id, obkv)) = cursor.move_on_next()? {
-       obkv_documents.insert(document_id, obkv)?;
-       current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
+       if !obkv.is_empty() {
+           obkv_documents.insert(document_id, obkv)?;
+           current_chunk_size += document_id.len() as u64 + obkv.len() as u64;

-           if current_chunk_size >= documents_chunk_size as u64 {
-               return writer_into_reader(obkv_documents).map(Some);
+           if current_chunk_size >= documents_chunk_size as u64 {
+               return writer_into_reader(obkv_documents).map(Some);
+           }
        }
    }

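Note: the loop above fills one chunk writer until a size threshold is crossed, and now skips entries whose obkv payload is empty. Below is a rough in-memory sketch of the same chunking policy; Vec<u8> stands in for the grenad obkv writer, and the names are illustrative only.

```rust
// Append entries to the current chunk until a size threshold is crossed,
// skipping empty payloads, exactly as the `if !obkv.is_empty()` guard does.
fn split_into_chunks(
    entries: &[(Vec<u8>, Vec<u8>)],
    chunk_size: u64,
) -> Vec<Vec<(Vec<u8>, Vec<u8>)>> {
    let mut chunks = Vec::new();
    let mut current = Vec::new();
    let mut current_size = 0u64;

    for (key, value) in entries {
        // Skip entries whose payload is empty.
        if value.is_empty() {
            continue;
        }
        current.push((key.clone(), value.clone()));
        current_size += key.len() as u64 + value.len() as u64;

        // Flush the chunk once it grows past the configured size.
        if current_size >= chunk_size {
            chunks.push(std::mem::take(&mut current));
            current_size = 0;
        }
    }
    if !current.is_empty() {
        chunks.push(current);
    }
    chunks
}
```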
@@ -157,7 +157,7 @@ fn inner_merge_del_add_obkvs<'a>(
    let mut acc = newest[1..].to_vec();
    let mut buffer = Vec::new();
    // reverse iter from the most recent to the oldest.
-   for current in obkvs.into_iter().rev() {
+   for current in obkvs.iter().rev() {
        // if in the previous iteration there was a complete deletion,
        // stop the merge process.
        if acc_operation_type == Operation::Deletion as u8 {

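Note: inner_merge_del_add_obkvs walks the versions of a document from the most recent to the oldest and stops as soon as a complete deletion is seen, since nothing older can still contribute. A toy sketch of that short-circuit; the Operation enum and string payloads are illustrative, not milli's types.

```rust
// Walk versions newest-first and stop accumulating at a complete deletion.
#[derive(Clone, Copy, PartialEq)]
enum Operation {
    Addition,
    Deletion,
}

fn merge_versions(versions: &[(Operation, &str)]) -> Vec<String> {
    let mut acc = Vec::new();
    // reverse iter from the most recent to the oldest.
    for (op, payload) in versions.iter().rev() {
        if *op == Operation::Deletion {
            // A complete deletion: older versions are irrelevant.
            break;
        }
        acc.push(payload.to_string());
    }
    acc
}
```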
@@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
-   self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
-   WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
+   IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
+   WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{CboRoaringBitmapCodec, Index, Result};

@@ -89,7 +89,6 @@ pub struct IndexDocumentsConfig {
    pub words_positions_level_group_size: Option<NonZeroU32>,
    pub words_positions_min_level_size: Option<NonZeroU32>,
    pub update_method: IndexDocumentsMethod,
-   pub deletion_strategy: DeletionStrategy,
    pub autogenerate_docids: bool,
}

@@ -181,6 +180,7 @@ where

        // Early return when there is no document to add
        if to_delete.is_empty() {
+           // Maintains Invariant: remove documents actually always returns Ok for the inner result
            return Ok((self, Ok(0)));
        }

@@ -193,6 +193,7 @@ where

        self.deleted_documents += deleted_documents;

+       // Maintains Invariant: remove documents actually always returns Ok for the inner result
        Ok((self, Ok(deleted_documents)))
    }

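Note: the new "Maintains Invariant" comments document the nested result shape used here: the outer Result carries hard errors, while the inner one reports the per-call outcome and is kept Ok on these paths by construction. A toy sketch of that shape; String stands in for milli's error types and every name is illustrative.

```rust
// Outer Result: hard failures (e.g. storage errors).
// Inner Result: the outcome of this particular call, always Ok on these paths.
struct Deleter {
    deleted_documents: u64,
}

impl Deleter {
    fn remove_documents(mut self, to_delete: &[u32]) -> Result<(Self, Result<u64, String>), String> {
        // Early return when there is nothing to delete.
        if to_delete.is_empty() {
            // Maintains invariant: the inner result is always Ok here.
            return Ok((self, Ok(0)));
        }
        let deleted = to_delete.len() as u64;
        self.deleted_documents += deleted;
        Ok((self, Ok(deleted)))
    }
}
```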
@@ -200,7 +201,7 @@ where
    pub fn execute(mut self) -> Result<DocumentAdditionResult> {
        puffin::profile_function!();

-       if self.added_documents == 0 {
+       if self.added_documents == 0 && self.deleted_documents == 0 {
            let number_of_documents = self.index.number_of_documents(self.wtxn)?;
            return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
        }
@@ -244,9 +245,6 @@ where
            primary_key,
            fields_ids_map,
            field_distribution,
            new_external_documents_ids,
            new_documents_ids,
            replaced_documents_ids,
            documents_count,
            original_documents,
            flattened_documents,
@@ -370,29 +368,12 @@ where
                let _ = lmdb_writer_sx.send(Err(e));
            }

-           // needs to be droped to avoid channel waiting lock.
+           // needs to be dropped to avoid channel waiting lock.
            drop(lmdb_writer_sx)
        });

-       // We delete the documents that this document addition replaces. This way we are
-       // able to simply insert all the documents even if they already exist in the database.
-       if !replaced_documents_ids.is_empty() {
-           let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
-           deletion_builder.strategy(self.config.deletion_strategy);
-           debug!("documents to delete {:?}", replaced_documents_ids);
-           deletion_builder.delete_documents(&replaced_documents_ids);
-           let deleted_documents_result = deletion_builder.execute_inner()?;
-           debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
-       }
-
        let index_documents_ids = self.index.documents_ids(self.wtxn)?;
-       let index_is_empty = index_documents_ids.is_empty();
+       let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
        let mut final_documents_ids = RoaringBitmap::new();
-       let mut word_pair_proximity_docids = None;
-       let mut word_position_docids = None;
-       let mut word_fid_docids = None;
-       let mut word_docids = None;
-       let mut exact_word_docids = None;

        let mut databases_seen = 0;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@ -405,38 +386,9 @@ where
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
let typed_chunk = match result? {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
word_docids = Some(cloneable_chunk);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||
exact_word_docids = Some(cloneable_chunk);
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
word_fid_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_pair_proximity_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordPairProximityDocids(chunk)
|
||||
}
|
||||
TypedChunk::WordPositionDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_position_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordPositionDocids(chunk)
|
||||
}
|
||||
otherwise => otherwise,
|
||||
};
|
||||
let typed_chunk = result?;
|
||||
|
||||
// FIXME: return newly added as well as newly deleted documents
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
|
||||
if !docids.is_empty() {
|
||||
@ -466,15 +418,6 @@ where
|
||||
// We write the primary key field id into the main database
|
||||
self.index.put_primary_key(self.wtxn, &primary_key)?;
|
||||
|
||||
// We write the external documents ids into the main database.
|
||||
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
|
||||
external_documents_ids.insert_ids(&new_external_documents_ids)?;
|
||||
let external_documents_ids = external_documents_ids.into_static();
|
||||
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
|
||||
|
||||
let all_documents_ids = index_documents_ids | new_documents_ids;
|
||||
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
||||
|
||||
// TODO: reactivate prefix DB with diff-indexing
|
||||
// self.execute_prefix_databases(
|
||||
// word_docids,
|
||||
@ -484,7 +427,7 @@ where
|
||||
// word_fid_docids,
|
||||
// )?;
|
||||
|
||||
Ok(all_documents_ids.len())
|
||||
self.index.number_of_documents(self.wtxn)
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
@ -718,14 +661,15 @@ fn execute_word_prefix_docids(
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use big_s::S;
|
||||
use fst::IntoStreamer;
|
||||
use heed::RwTxn;
|
||||
use maplit::hashset;
|
||||
|
||||
use super::*;
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::DeleteDocuments;
|
||||
use crate::{db_snap, BEU16};
|
||||
use crate::{db_snap, Filter, Search, BEU16};
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -816,11 +760,10 @@ mod tests {
|
||||
assert_eq!(count, 1);
|
||||
|
||||
// Check that we get only one document from the database.
|
||||
// Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
|
||||
let docs = index.documents(&rtxn, Some(1)).unwrap();
|
||||
let docs = index.documents(&rtxn, Some(0)).unwrap();
|
||||
assert_eq!(docs.len(), 1);
|
||||
let (id, doc) = docs[0];
|
||||
assert_eq!(id, 1);
|
||||
assert_eq!(id, 0);
|
||||
|
||||
// Check that this document is equal to the last one sent.
|
||||
let mut doc_iter = doc.iter();
|
||||
@ -881,7 +824,7 @@ mod tests {
|
||||
assert_eq!(count, 3);
|
||||
|
||||
// the document 0 has been deleted and reinserted with the id 3
|
||||
let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
|
||||
let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
|
||||
let kevin_position =
|
||||
docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
||||
assert_eq!(kevin_position, 2);
|
||||
@ -1027,7 +970,6 @@ mod tests {
|
||||
assert_eq!(count, 6);
|
||||
|
||||
db_snap!(index, word_docids, "updated");
|
||||
db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");
|
||||
|
||||
drop(rtxn);
|
||||
}
|
||||
@ -1130,17 +1072,15 @@ mod tests {
|
||||
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
|
||||
]))
|
||||
.unwrap();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.delete_external_id("30");
|
||||
builder.execute().unwrap();
|
||||
index.delete_document("30");
|
||||
|
||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_none());
|
||||
wtxn.commit().unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
|
||||
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
@ -1149,8 +1089,8 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let wtxn = index.write_txn().unwrap();
|
||||
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
|
||||
assert!(external_documents_ids.get("30").is_some());
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index
|
||||
@ -1444,8 +1384,10 @@ mod tests {
|
||||
index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
|
||||
assert!(external_documents_ids.get("1").is_some());
|
||||
let all_documents_count = index.all_documents(&rtxn).unwrap().count();
|
||||
assert_eq!(all_documents_count, 1);
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -1499,12 +1441,6 @@ mod tests {
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1528,12 +1464,6 @@ mod tests {
|
||||
|
||||
db_snap!(index, facet_id_string_docids, @"");
|
||||
db_snap!(index, field_id_docid_facet_strings, @"");
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1560,12 +1490,6 @@ mod tests {
|
||||
3 2 second second
|
||||
3 3 third third
|
||||
"###);
|
||||
db_snap!(index, string_faceted_documents_ids, @r###"
|
||||
0 []
|
||||
1 []
|
||||
2 []
|
||||
3 [0, 1, 2, 3, ]
|
||||
"###);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
@ -1728,7 +1652,7 @@ mod tests {
|
||||
|
||||
let wtxn = index.read_txn().unwrap();
|
||||
|
||||
let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
|
||||
let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
|
||||
let ids = map.values().collect::<HashSet<_>>();
|
||||
|
||||
assert_eq!(ids.len(), map.len());
|
||||
@ -2540,17 +2464,8 @@ mod tests {
|
||||
db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
|
||||
db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
builder.strategy(DeletionStrategy::AlwaysHard);
|
||||
builder.delete_external_id("0");
|
||||
builder.delete_external_id("3");
|
||||
let result = builder.execute().unwrap();
|
||||
println!("{result:?}");
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
index.delete_documents(vec!["0".into(), "3".into()]);
|
||||
|
||||
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
|
||||
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
|
||||
@ -2605,8 +2520,7 @@ mod tests {
|
||||
),
|
||||
]
|
||||
*/
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
let index = TempIndex::new();
|
||||
|
||||
// START OF BATCH
|
||||
|
||||
@ -2646,8 +2560,7 @@ mod tests {
|
||||
{"id":1,"doggo":"bernese"}
|
||||
"###);
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
docids:
|
||||
1 0
|
||||
"###);
|
||||
|
||||
@ -2692,13 +2605,10 @@ mod tests {
|
||||
"###);
|
||||
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
soft:
|
||||
hard:
|
||||
docids:
|
||||
0 1
|
||||
"###);
|
||||
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
// BATCH 3
|
||||
|
||||
println!("--- ENTERING BATCH 3");
|
||||
@ -2740,4 +2650,537 @@ mod tests {
|
||||
let res = index.search(&rtxn).execute().unwrap();
|
||||
index.documents(&rtxn, res.documents_ids).unwrap();
|
||||
}
|
||||
|
||||
fn delete_documents<'t>(
|
||||
wtxn: &mut RwTxn<'t, '_>,
|
||||
index: &'t TempIndex,
|
||||
external_ids: &[&str],
|
||||
) -> Vec<u32> {
|
||||
let external_document_ids = index.external_documents_ids();
|
||||
let ids_to_delete: Vec<u32> = external_ids
|
||||
.iter()
|
||||
.map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
|
||||
.collect();
|
||||
|
||||
// Delete some documents.
|
||||
index.delete_documents_using_wtxn(
|
||||
wtxn,
|
||||
external_ids.iter().map(ToString::to_string).collect(),
|
||||
);
|
||||
|
||||
ids_to_delete
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_documents_with_numbers_as_primary_key() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
|
||||
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
|
||||
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// delete those documents, ids are synchronous therefore 0, 1, and 2.
|
||||
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
// All these snapshots should be empty since the database was cleared
|
||||
db_snap!(index, documents_ids);
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
db_snap!(index, facet_id_exists_docids);
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_documents_with_strange_primary_key() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
|
||||
.unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "mysuperid": 0, "name": "kevin" },
|
||||
{ "mysuperid": 1, "name": "kevina" },
|
||||
{ "mysuperid": 2, "name": "benoit" }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
// Delete not all of the documents but some of them.
|
||||
index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids);
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
settings.set_filterable_fields(hashset! { S("label"), S("label2") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
|
||||
|
||||
// Placeholder search with filter
|
||||
let filter = Filter::from_str("label = sign").unwrap().unwrap();
|
||||
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||
assert!(results.documents_ids.is_empty());
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, word_docids);
|
||||
db_snap!(index, facet_id_f64_docids);
|
||||
db_snap!(index, word_pair_proximity_docids);
|
||||
db_snap!(index, facet_id_exists_docids);
|
||||
db_snap!(index, facet_id_string_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
|
||||
|
||||
// Placeholder search
|
||||
let results = index.search(&wtxn).execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||
|
||||
// search for abstract
|
||||
let results = index.search(&wtxn).query("abstract").execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_filterable_fields(hashset!(S("_geo")));
|
||||
settings.set_sortable_fields(hashset!(S("_geo")));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
|
||||
{ "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
|
||||
{ "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
|
||||
{ "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
|
||||
{ "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
|
||||
{ "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
|
||||
{ "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
|
||||
{ "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
|
||||
{ "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
|
||||
{ "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
|
||||
{ "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
|
||||
{ "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
|
||||
{ "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
|
||||
{ "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
|
||||
{ "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
|
||||
{ "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
|
||||
{ "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
|
||||
{ "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
|
||||
{ "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
|
||||
{ "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
|
||||
])).unwrap();
|
||||
|
||||
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
|
||||
|
||||
// Placeholder search with geo filter
|
||||
let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
|
||||
let results = index.search(&wtxn).filter(filter).execute().unwrap();
|
||||
assert!(!results.documents_ids.is_empty());
|
||||
for id in results.documents_ids.iter() {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids);
|
||||
db_snap!(index, facet_id_string_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn get_documents_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "docid": "1_4", "label": ["sign"] },
|
||||
{ "docid": "1_5", "label": ["letter"] },
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"] },
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"] },
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"] },
|
||||
{ "docid": "1_39", "label": ["abstract"] },
|
||||
{ "docid": "1_40", "label": ["cartoon"] },
|
||||
{ "docid": "1_41", "label": ["art","drawing"] },
|
||||
{ "docid": "1_42", "label": ["art","pattern"] },
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
|
||||
{ "docid": "1_44", "label": ["drawing"] },
|
||||
{ "docid": "1_45", "label": ["art"] },
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"] },
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"] },
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"] },
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"] },
|
||||
{ "docid": "1_68", "label": ["design"] },
|
||||
{ "docid": "1_69", "label": ["geometry"] },
|
||||
{ "docid": "1_70", "label2": ["geometry", 1.2] },
|
||||
{ "docid": "1_71", "label2": ["design", 2.2] },
|
||||
{ "docid": "1_72", "label2": ["geometry", 1.2] }
|
||||
]),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let deleted_external_ids = ["1_7", "1_52"];
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
|
||||
|
||||
// list all documents
|
||||
let results = index.all_documents(&wtxn).unwrap();
|
||||
for result in results {
|
||||
let (id, _) = result.unwrap();
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(&id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
|
||||
// list internal document ids
|
||||
let results = index.documents_ids(&wtxn).unwrap();
|
||||
for id in results {
|
||||
assert!(
|
||||
!deleted_internal_ids.contains(&id),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// get internal docids from deleted external document ids
|
||||
let results = index.external_documents_ids();
|
||||
for id in deleted_external_ids {
|
||||
assert!(
|
||||
results.get(&rtxn, id).unwrap().is_none(),
|
||||
"The document {} was supposed to be deleted",
|
||||
id
|
||||
);
|
||||
}
|
||||
drop(rtxn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stats_should_not_return_deleted_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "docid": "1_4", "label": ["sign"]},
|
||||
{ "docid": "1_5", "label": ["letter"]},
|
||||
{ "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
|
||||
{ "docid": "1_36", "label": ["drawing","painting","pattern"]},
|
||||
{ "docid": "1_37", "label": ["art","drawing","outdoor"]},
|
||||
{ "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
|
||||
{ "docid": "1_39", "label": ["abstract"]},
|
||||
{ "docid": "1_40", "label": ["cartoon"]},
|
||||
{ "docid": "1_41", "label": ["art","drawing"]},
|
||||
{ "docid": "1_42", "label": ["art","pattern"]},
|
||||
{ "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
|
||||
{ "docid": "1_44", "label": ["drawing"], "number": 44i32},
|
||||
{ "docid": "1_45", "label": ["art"]},
|
||||
{ "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
|
||||
{ "docid": "1_47", "label": ["abstract","pattern"]},
|
||||
{ "docid": "1_52", "label": ["abstract","cartoon"]},
|
||||
{ "docid": "1_57", "label": ["abstract","drawing","pattern"]},
|
||||
{ "docid": "1_58", "label": ["abstract","art","cartoon"]},
|
||||
{ "docid": "1_68", "label": ["design"]},
|
||||
{ "docid": "1_69", "label": ["geometry"]}
|
||||
])).unwrap();
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
|
||||
|
||||
// count internal documents
|
||||
let results = index.number_of_documents(&wtxn).unwrap();
|
||||
assert_eq!(18, results);
|
||||
|
||||
// count field distribution
|
||||
let results = index.field_distribution(&wtxn).unwrap();
|
||||
assert_eq!(Some(&18), results.get("label"));
|
||||
assert_eq!(Some(&1), results.get("title"));
|
||||
assert_eq!(Some(&2), results.get("number"));
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stored_detected_script_and_language_should_not_return_deleted_documents() {
|
||||
use charabia::{Language, Script};
|
||||
let index = TempIndex::new();
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(
|
||||
&mut wtxn,
|
||||
documents!([
|
||||
{ "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
|
||||
{ "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
|
||||
{ "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
|
||||
{ "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
|
||||
{ "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
|
||||
{ "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
let key_cmn = (Script::Cj, Language::Cmn);
|
||||
let cj_cmn_docs =
|
||||
index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||
expected_cj_cmn_docids.push(1);
|
||||
expected_cj_cmn_docids.push(5);
|
||||
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||
|
||||
delete_documents(&mut wtxn, &index, &["1"]);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let cj_cmn_docs =
|
||||
index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
|
||||
let mut expected_cj_cmn_docids = RoaringBitmap::new();
|
||||
expected_cj_cmn_docids.push(5);
|
||||
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn delete_words_exact_attributes() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_searchable_fields(vec![S("text"), S("exact")]);
|
||||
settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "id": 0, "text": "hello" },
|
||||
{ "id": 1, "exact": "hello"}
|
||||
]))
|
||||
.unwrap();
|
||||
db_snap!(index, word_docids, 1, @r###"
|
||||
hello [0, ]
|
||||
"###);
|
||||
db_snap!(index, exact_word_docids, 1, @r###"
|
||||
hello [1, ]
|
||||
"###);
|
||||
db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, word_docids, 2, @r###"
|
||||
hello [0, ]
|
||||
"###);
|
||||
db_snap!(index, exact_word_docids, 2, @"");
|
||||
db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
|
||||
|
||||
insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
|
||||
let txn = index.read_txn().unwrap();
|
||||
let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
|
||||
insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.query("hello");
|
||||
let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
|
||||
}
|
||||
}
|
||||

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---
[]

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---
[2, ]

@@ -0,0 +1,5 @@
---
source: milli/src/update/index_documents/mod.rs
---
benoit [2, ]

@@ -0,0 +1,4 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,5 +1,5 @@
---
-source: milli/src/update/delete_documents.rs
+source: milli/src/update/index_documents/mod.rs
---
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
2 [21, ]

@@ -0,0 +1,5 @@
---
source: milli/src/update/index_documents/mod.rs
---
2 0 2.2 1 [21, ]

@@ -1,5 +1,5 @@
---
-source: milli/src/update/delete_documents.rs
+source: milli/src/update/index_documents/mod.rs
---
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
1 0 aquarium 1 [5, ]
Some files were not shown because too many files have changed in this diff.