mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-12-07 13:15:43 +00:00

Compare commits (35 commits)
prototype-...diff-index
| Author | SHA1 | Date |
|---|---|---|
|  | 1d433abf06 |  |
|  | d8d45845b2 |  |
|  | 2b7c5829d5 |  |
|  | 9b4627cc8b |  |
|  | cd40f6b28c |  |
|  | 4941543dfe |  |
|  | 87f8cb8b62 |  |
|  | 81fe3eaa09 |  |
|  | 066221fd2b |  |
|  | b8fed737ef |  |
|  | c63ff5298b |  |
|  | d50408d670 |  |
|  | e0dc413521 |  |
|  | 0f6a0b1ab8 |  |
|  | 061f490204 |  |
|  | 5c43ff72c1 |  |
|  | c445e9daec |  |
|  | 178a9802fa |  |
|  | 7d546b9c22 |  |
|  | c829feb40b |  |
|  | b88fd7994c |  |
|  | 096d7705c7 |  |
|  | e8f8730467 |  |
|  | 26ef0b3a07 |  |
|  | 20394fda04 |  |
|  | 27161bcd05 |  |
|  | 04fd44b5e2 |  |
|  | 9078e60024 |  |
|  | 8fb96b8274 |  |
|  | 50ba751244 |  |
|  | f36c36e368 |  |
|  | c2dcd66d32 |  |
|  | d4594306d3 |  |
|  | 93d0680903 |  |
|  | 01101d55ac |  |
8  .github/ISSUE_TEMPLATE/sprint_issue.md  vendored
@@ -7,17 +7,19 @@ assignees: ''
 ---
 
-Related product team resources: [PRD]() (_internal only_)
+Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_)
 Related product discussion:
+Related spec: WIP
 
 ## Motivation
 
-<!---Copy/paste the information in PRD or briefly detail the product motivation. Ask product team if any hesitation.-->
+<!---Copy/paste the information in the roadmap resources or briefly detail the product motivation. Ask product team if any hesitation.-->
 
 ## Usage
 
-<!---Link to the public part of the PRD, or to the related product discussion for experimental features-->
+<!---Write a quick description of the usage if the usage has already been defined-->
 
+Refer to the final spec to know the details and the final decisions about the usage.
 
 ## TODO
 
@@ -8,11 +8,11 @@ env:
 jobs:
   run-benchmarks-on-comment:
     if: startsWith(github.event.comment.body, '/benchmark')
     name: Run and upload benchmarks
     runs-on: benchmarks
     timeout-minutes: 4320 # 72h
     steps:
       - uses: actions/checkout@v3
       - uses: actions-rs/toolchain@v1
         with:
           profile: minimal
@@ -27,25 +27,14 @@ jobs:
           reaction-type: "eyes"
           repo-token: ${{ env.GH_TOKEN }}
 
-      - uses: xt0rted/pull-request-comment-branch@v2
-        id: comment-branch
-        with:
-          repo_token: ${{ env.GH_TOKEN }}
-
-      - uses: actions/checkout@v3
-        if: success()
-        with:
-          fetch-depth: 0 # fetch full history to be able to get main commit sha
-          ref: ${{ steps.comment-branch.outputs.head_ref }}
-
       # Set variables
       - name: Set current branch name
         shell: bash
-        run: echo "name=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_OUTPUT
+        run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
         id: current_branch
       - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
         shell: bash
-        run: echo "name=$(git rev-parse --abbrev-ref HEAD | tr '/' '_')" >> $GITHUB_OUTPUT
+        run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
         id: normalized_current_branch
       - name: Set shorter commit SHA
         shell: bash
@@ -87,11 +76,9 @@ jobs:
         env:
           GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
         run: |
-          set -x
-          export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
-          export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
+          export base=$(git log --pretty=%p -n 1)
          echo 'Here are your benchmarks diff 👊' >> body.txt
          echo '```' >> body.txt
-          ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
+          ./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
          echo '```' >> body.txt
-          gh pr comment ${{ steps.current_branch.outputs.name }} --body-file body.txt
+          gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt
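Note: both `run` steps derive a branch name by stripping the `refs/heads/` prefix from `GITHUB_REF` and, for the S3 object name, replacing `/` with `_`. As an illustration only (the workflow does this in bash), the same transformation in Rust:

```rust
/// Mirrors `${GITHUB_REF#refs/heads/}` followed by `tr '/' '_'`.
fn normalized_branch(github_ref: &str) -> String {
    github_ref
        .strip_prefix("refs/heads/")
        .unwrap_or(github_ref) // non-branch refs pass through unchanged
        .replace('/', "_") // avoid `/` in the uploaded S3 file name
}

fn main() {
    assert_eq!(normalized_branch("refs/heads/feat/diff-index"), "feat_diff-index");
}
```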
60  Cargo.lock  generated
@@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 
 [[package]]
 name = "benchmarks"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "anyhow",
  "bytes",
@@ -1206,7 +1206,7 @@ dependencies = [
 
 [[package]]
 name = "dump"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "anyhow",
  "big_s",
@@ -1417,7 +1417,7 @@ dependencies = [
 
 [[package]]
 name = "file-store"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "faux",
  "tempfile",
@@ -1439,7 +1439,7 @@ dependencies = [
 
 [[package]]
 name = "filter-parser"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "insta",
  "nom",
@@ -1459,7 +1459,7 @@ dependencies = [
 
 [[package]]
 name = "flatten-serde-json"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "criterion",
  "serde_json",
@@ -1577,7 +1577,7 @@ dependencies = [
 
 [[package]]
 name = "fuzzers"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "arbitrary",
  "clap",
@@ -1663,13 +1663,12 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "grenad"
-version = "0.4.5"
+version = "0.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
+checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
 dependencies = [
  "bytemuck",
  "byteorder",
- "rayon",
  "tempfile",
 ]
 
@@ -1892,7 +1891,7 @@ dependencies = [
 
 [[package]]
 name = "index-scheduler"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "anyhow",
  "big_s",
@@ -2089,7 +2088,7 @@ dependencies = [
 
 [[package]]
 name = "json-depth-checker"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "criterion",
  "serde_json",
@@ -2501,7 +2500,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "meili-snap"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "insta",
  "md5",
@@ -2510,7 +2509,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "actix-cors",
  "actix-http",
@@ -2565,6 +2564,7 @@ dependencies = [
  "platform-dirs",
  "prometheus",
  "puffin",
+ "puffin_http",
  "rand",
  "rayon",
  "regex",
@@ -2600,7 +2600,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-auth"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "base64 0.21.2",
  "enum-iterator",
@@ -2619,7 +2619,7 @@ dependencies = [
 
 [[package]]
 name = "meilisearch-types"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "actix-web",
  "anyhow",
@@ -2673,7 +2673,7 @@ dependencies = [
 
 [[package]]
 name = "milli"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "big_s",
  "bimap",
@@ -2867,9 +2867,9 @@ dependencies = [
 
 [[package]]
 name = "obkv"
-version = "0.2.1"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
+checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
 
 [[package]]
 name = "once_cell"
@@ -2996,7 +2996,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"
 
 [[package]]
 name = "permissive-json-pointer"
-version = "1.4.1"
+version = "1.4.0"
 dependencies = [
  "big_s",
  "serde_json",
@@ -3194,7 +3194,7 @@ dependencies = [
  "byteorder",
  "hex",
  "lazy_static",
- "rustix 0.36.16",
+ "rustix 0.36.15",
 ]
 
 [[package]]
@@ -3237,6 +3237,18 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "puffin_http"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13bffc600c35913d282ae1e96a6ffcdf36dc7a7cdb9310e0ba15914d258c8193"
+dependencies = [
+ "anyhow",
+ "crossbeam-channel",
+ "log",
+ "puffin",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.32"
@@ -3467,9 +3479,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.36.16"
+version = "0.36.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
+checksum = "c37f1bd5ef1b5422177b7646cba67430579cfe2ace80f284fee876bca52ad941"
 dependencies = [
  "bitflags 1.3.2",
  "errno",
@@ -4432,9 +4444,9 @@ dependencies = [
 
 [[package]]
 name = "webpki"
-version = "0.22.2"
+version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f"
+checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e"
 dependencies = [
  "ring",
  "untrusted",
@@ -18,7 +18,7 @@ members = [
 ]
 
 [workspace.package]
-version = "1.4.1"
+version = "1.4.0"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
@@ -1,14 +1,14 @@
 # Profiling Meilisearch
 
-Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options [in Puffin Viewer](https://github.com/embarkstudios/puffin#ui).
+Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options.
 
 ![]()
 
 ## Profiling the Indexing Process
 
-When you enable [the `exportPuffinReports` experimental feature](https://www.meilisearch.com/docs/learn/experimental/overview) of Meilisearch, Puffin reports with the `.puffin` extension will be automatically exported to disk. When this option is enabled, the engine will automatically create a "frame" whenever it executes the `IndexScheduler::tick` method.
+When you enable the `profile-with-puffin` feature of Meilisearch, a Puffin HTTP server will run on Meilisearch and listen on the default _0.0.0.0:8585_ address. This server will record a "frame" whenever it executes the `IndexScheduler::tick` method.
 
-[Puffin Viewer](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) is used to analyze the reports. Those reports show areas where Meilisearch spent time during indexing.
+Once your Meilisearch is running and awaits new indexation operations, you must [install and run the `puffin_viewer` tool](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) to see the profiling results. I advise you to run the viewer with the `RUST_LOG=puffin_http::client=debug` environment variable to see the client trying to connect to your server.
 
 Another piece of advice on the Puffin viewer UI interface is to consider the _Merge children with same ID_ option. It can hide the exact actual timings at which events were sent. Please turn it off when you see strange gaps on the Flamegraph. It can help.
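Note: a minimal sketch of what Puffin instrumentation looks like on the `profile-with-puffin` side, assuming only the `puffin` crate; `index_batch` is a hypothetical stand-in for the work done by `IndexScheduler::tick`, not a Meilisearch function:

```rust
// Minimal Puffin instrumentation sketch.
fn index_batch(documents: &[&str]) {
    puffin::profile_function!(); // time the whole batch under this function's name

    for doc in documents {
        puffin::profile_scope!("index_document", *doc); // one named sub-scope per doc
    }
}

fn main() {
    // Scopes are disabled by default; the diff enables them with
    // `puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"))`.
    puffin::set_scopes_on(true);

    // A "frame" is the unit shown in puffin_viewer; the engine opens one
    // per `IndexScheduler::tick`.
    index_batch(&["doc-1", "doc-2"]);
    puffin::GlobalProfiler::lock().new_frame();
}
```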
@@ -25,12 +25,6 @@
 
 <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
 
----
-
-### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
-
----
-
 Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
 
 <p align="center" name="demo">
@@ -6,7 +6,9 @@ use std::path::Path;
 
 use criterion::{criterion_group, criterion_main, Criterion};
 use milli::heed::{EnvOpenOptions, RwTxn};
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use milli::update::{
+    DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
+};
 use milli::Index;
 use rand::seq::SliceRandom;
 use rand_chacha::rand_core::SeedableRng;
@@ -264,7 +266,17 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                delete_documents_from_ids(index, document_ids_to_delete)
+                let mut wtxn = index.write_txn().unwrap();
+
+                for ids in document_ids_to_delete {
+                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+                    builder.delete_documents(&ids);
+                    builder.execute().unwrap();
+                }
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
             },
         )
     });
@@ -601,7 +613,17 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
            move |(index, document_ids_to_delete)| {
-                delete_documents_from_ids(index, document_ids_to_delete)
+                let mut wtxn = index.write_txn().unwrap();
+
+                for ids in document_ids_to_delete {
+                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+                    builder.delete_documents(&ids);
+                    builder.execute().unwrap();
+                }
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
             },
         )
    });
@@ -853,41 +875,22 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                delete_documents_from_ids(index, document_ids_to_delete)
+                let mut wtxn = index.write_txn().unwrap();
+
+                for ids in document_ids_to_delete {
+                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+                    builder.delete_documents(&ids);
+                    builder.execute().unwrap();
+                }
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
             },
         )
     });
 }
 
-fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
-    let mut wtxn = index.write_txn().unwrap();
-
-    let indexer_config = IndexerConfig::default();
-    for ids in document_ids_to_delete {
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(&wtxn, ids)
-                .unwrap()
-                .only_external_ids()
-                .collect();
-        let ids = external_to_internal.unwrap();
-        let config = IndexDocumentsConfig::default();
-
-        let mut builder =
-            IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
-                .unwrap();
-        (builder, _) = builder.remove_documents(ids).unwrap();
-        builder.execute().unwrap();
-    }
-
-    wtxn.commit().unwrap();
-
-    index.prepare_for_closing().wait();
-}
-
 fn indexing_movies_in_three_batches(c: &mut Criterion) {
     let mut group = c.benchmark_group("indexing");
     group.sample_size(BENCHMARK_ITERATION);
@@ -1109,7 +1112,17 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                delete_documents_from_ids(index, document_ids_to_delete)
+                let mut wtxn = index.write_txn().unwrap();
+
+                for ids in document_ids_to_delete {
+                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+                    builder.delete_documents(&ids);
+                    builder.execute().unwrap();
+                }
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
             },
         )
     });
@@ -1325,7 +1338,17 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                delete_documents_from_ids(index, document_ids_to_delete)
+                let mut wtxn = index.write_txn().unwrap();
+
+                for ids in document_ids_to_delete {
+                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+                    builder.delete_documents(&ids);
+                    builder.execute().unwrap();
+                }
+
+                wtxn.commit().unwrap();
+
+                index.prepare_for_closing().wait();
             },
         )
     });
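Note: the hunks above inline the 1.4.0-era `DeleteDocuments` builder in place of a shared helper. Pulled out on its own, the pattern the benches go back to is the following, assuming a 1.4.0-era milli (error handling via `unwrap`, as in the benches themselves):

```rust
// One write transaction, one DeleteDocuments builder per batch of internal
// document ids, a single commit at the end.
use milli::update::DeleteDocuments;
use milli::Index;
use roaring::RoaringBitmap;

fn delete_in_batches(index: Index, batches: Vec<RoaringBitmap>) {
    let mut wtxn = index.write_txn().unwrap();

    for ids in batches {
        // Each builder stages one batch of deletions inside the transaction.
        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.delete_documents(&ids);
        builder.execute().unwrap();
    }

    // Nothing is visible to readers until the transaction commits.
    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
```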
@@ -526,12 +526,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), @r###"
+        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "2022-10-09T20:27:22.688964637Z",
-          "updatedAt": "2022-10-09T20:27:23.951017769Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -541,12 +541,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), @r###"
+        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "2022-10-09T20:27:22.197788495Z",
-          "updatedAt": "2022-10-09T20:28:01.93111053Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -571,12 +571,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), @r###"
+        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "2022-10-09T20:27:24.242683494Z",
-          "updatedAt": "2022-10-09T20:27:24.312809641Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -617,12 +617,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), @r###"
+        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "2023-01-30T16:25:56.595257Z",
-          "updatedAt": "2023-01-30T16:25:58.70348Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -632,12 +632,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), @r###"
+        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "2023-01-30T16:25:56.192178Z",
-          "updatedAt": "2023-01-30T16:25:56.455714Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -647,12 +647,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), @r###"
+        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "2023-01-30T16:25:58.876405Z",
-          "updatedAt": "2023-01-30T16:25:59.079906Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
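Note: the `{ ".createdAt" => "[now]" }` argument is insta's inline redaction syntax; the named JSON paths are rewritten to a fixed token before comparison, which is what makes non-deterministic timestamps snapshot-safe. A minimal, self-contained sketch of the mechanism (the values are illustrative, not from the dump tests):

```rust
#[cfg(test)]
mod tests {
    use serde_json::json;

    #[test]
    fn metadata_snapshot() {
        let metadata = json!({
            "uid": "products",
            "createdAt": "2022-10-09T20:27:22.688964637Z",
        });
        // ".createdAt" is replaced by the literal "[now]" before the value
        // is compared against the inline snapshot below.
        insta::assert_json_snapshot!(metadata, { ".createdAt" => "[now]" }, @r###"
        {
          "uid": "products",
          "createdAt": "[now]"
        }
        "###);
    }
}
```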
@@ -0,0 +1,24 @@
+---
+source: dump/src/reader/mod.rs
+expression: spells.settings().unwrap()
+---
+{
+  "displayedAttributes": [
+    "*"
+  ],
+  "searchableAttributes": [
+    "*"
+  ],
+  "filterableAttributes": [],
+  "sortableAttributes": [],
+  "rankingRules": [
+    "typo",
+    "words",
+    "proximity",
+    "attribute",
+    "exactness"
+  ],
+  "stopWords": [],
+  "synonyms": {},
+  "distinctAttribute": null
+}
@@ -0,0 +1,38 @@
+---
+source: dump/src/reader/mod.rs
+expression: products.settings().unwrap()
+---
+{
+  "displayedAttributes": [
+    "*"
+  ],
+  "searchableAttributes": [
+    "*"
+  ],
+  "filterableAttributes": [],
+  "sortableAttributes": [],
+  "rankingRules": [
+    "typo",
+    "words",
+    "proximity",
+    "attribute",
+    "exactness"
+  ],
+  "stopWords": [],
+  "synonyms": {
+    "android": [
+      "phone",
+      "smartphone"
+    ],
+    "iphone": [
+      "phone",
+      "smartphone"
+    ],
+    "phone": [
+      "android",
+      "iphone",
+      "smartphone"
+    ]
+  },
+  "distinctAttribute": null
+}
@@ -0,0 +1,31 @@
+---
+source: dump/src/reader/mod.rs
+expression: movies.settings().unwrap()
+---
+{
+  "displayedAttributes": [
+    "*"
+  ],
+  "searchableAttributes": [
+    "*"
+  ],
+  "filterableAttributes": [
+    "genres",
+    "id"
+  ],
+  "sortableAttributes": [
+    "genres",
+    "id"
+  ],
+  "rankingRules": [
+    "typo",
+    "words",
+    "proximity",
+    "attribute",
+    "exactness",
+    "release_date:asc"
+  ],
+  "stopWords": [],
+  "synonyms": {},
+  "distinctAttribute": null
+}
@@ -46,7 +46,6 @@ pub type Checked = settings::Checked;
 pub type Unchecked = settings::Unchecked;
 
 pub type Task = updates::UpdateEntry;
-pub type Kind = updates::UpdateMeta;
 
 // everything related to the errors
 pub type ResponseError = errors::ResponseError;
@@ -108,11 +107,8 @@ impl V2Reader {
     pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
         Ok(self.index_uuid.iter().map(|index| -> Result<_> {
             V2IndexReader::new(
+                index.uid.clone(),
                 &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
-                index,
-                BufReader::new(
-                    File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
-                ),
             )
         }))
     }
@@ -147,41 +143,16 @@ pub struct V2IndexReader {
 }
 
 impl V2IndexReader {
-    pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
+    pub fn new(name: String, path: &Path) -> Result<Self> {
         let meta = File::open(path.join("meta.json"))?;
         let meta: DumpMeta = serde_json::from_reader(meta)?;
 
-        let mut created_at = None;
-        let mut updated_at = None;
-
-        for line in tasks.lines() {
-            let task: Task = serde_json::from_str(&line?)?;
-            if !(task.uuid == index_uuid.uuid && task.is_finished()) {
-                continue;
-            }
-
-            let new_created_at = match task.update.meta() {
-                Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
-                _ => None,
-            };
-            let new_updated_at = task.update.finished_at();
-
-            if created_at.is_none() || created_at > new_created_at {
-                created_at = new_created_at;
-            }
-
-            if updated_at.is_none() || updated_at < new_updated_at {
-                updated_at = new_updated_at;
-            }
-        }
-
-        let current_time = OffsetDateTime::now_utc();
-
         let metadata = IndexMetadata {
-            uid: index_uuid.uid.clone(),
+            uid: name,
             primary_key: meta.primary_key,
-            created_at: created_at.unwrap_or(current_time),
-            updated_at: updated_at.unwrap_or(current_time),
+            // FIXME: Iterate over the whole task queue to find the creation and last update date.
+            created_at: OffsetDateTime::now_utc(),
+            updated_at: OffsetDateTime::now_utc(),
         };
 
         let ret = V2IndexReader {
@@ -277,12 +248,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), @r###"
+        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "2022-10-09T20:27:22.688964637Z",
-          "updatedAt": "2022-10-09T20:27:23.951017769Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -292,12 +263,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), @r###"
+        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "2022-10-09T20:27:22.197788495Z",
-          "updatedAt": "2022-10-09T20:28:01.93111053Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -322,12 +293,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), @r###"
+        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "2022-10-09T20:27:24.242683494Z",
-          "updatedAt": "2022-10-09T20:27:24.312809641Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -369,12 +340,12 @@ pub(crate) mod test {
         assert!(indexes.is_empty());
 
         // products
-        insta::assert_json_snapshot!(products.metadata(), @r###"
+        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "products",
           "primaryKey": "sku",
-          "createdAt": "2023-01-30T16:25:56.595257Z",
-          "updatedAt": "2023-01-30T16:25:58.70348Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -384,12 +355,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");
 
         // movies
-        insta::assert_json_snapshot!(movies.metadata(), @r###"
+        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "movies",
           "primaryKey": "id",
-          "createdAt": "2023-01-30T16:25:56.192178Z",
-          "updatedAt": "2023-01-30T16:25:56.455714Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
         }
         "###);
@@ -399,12 +370,12 @@ pub(crate) mod test {
         meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");
 
         // spells
-        insta::assert_json_snapshot!(spells.metadata(), @r###"
+        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
         {
           "uid": "dnd_spells",
           "primaryKey": "index",
-          "createdAt": "2023-01-30T16:25:58.876405Z",
-          "updatedAt": "2023-01-30T16:25:59.079906Z"
+          "createdAt": "[now]",
+          "updatedAt": "[now]"
        }
         "###);
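Note: the base branch derives `createdAt`/`updatedAt` by folding over the dump's task log (earliest finished creation-type update, latest finished update of any kind); the head branch replaces that with `now_utc()` plus a FIXME, which is why the snapshots above switch to `[now]` redactions. A standalone sketch of that fold, with types simplified (`TaskInfo` is a hypothetical stand-in, and the `Option` comparisons are tightened slightly for clarity):

```rust
use time::OffsetDateTime;

// Simplified stand-in for a dump task entry.
struct TaskInfo {
    finished_at: Option<OffsetDateTime>,
    is_creation_kind: bool, // DocumentsAddition or Settings in the real code
}

fn index_dates(tasks: &[TaskInfo]) -> (OffsetDateTime, OffsetDateTime) {
    let mut created_at: Option<OffsetDateTime> = None;
    let mut updated_at: Option<OffsetDateTime> = None;

    for task in tasks {
        let creation = if task.is_creation_kind { task.finished_at } else { None };

        // Keep the earliest finishing creation-type update...
        if let Some(t) = creation {
            if created_at.map_or(true, |c| c > t) {
                created_at = Some(t);
            }
        }
        // ...and the latest finished update of any kind.
        if let Some(t) = task.finished_at {
            if updated_at.map_or(true, |u| u < t) {
                updated_at = Some(t);
            }
        }
    }

    // Fall back to "now" when no finished task matched, as the base branch does.
    let now = OffsetDateTime::now_utc();
    (created_at.unwrap_or(now), updated_at.unwrap_or(now))
}
```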
@@ -227,14 +227,4 @@ impl UpdateStatus {
             _ => None,
         }
     }
-
-    pub fn finished_at(&self) -> Option<OffsetDateTime> {
-        match self {
-            UpdateStatus::Processing(_) => None,
-            UpdateStatus::Enqueued(_) => None,
-            UpdateStatus::Processed(u) => Some(u.processed_at),
-            UpdateStatus::Aborted(_) => None,
-            UpdateStatus::Failed(u) => Some(u.failed_at),
-        }
-    }
 }
@@ -19,18 +19,18 @@ one indexing operation.
 
 use std::collections::{BTreeSet, HashSet};
 use std::ffi::OsStr;
-use std::fmt;
 use std::fs::{self, File};
 use std::io::BufWriter;
 
 use dump::IndexMetadata;
-use log::{debug, error, info, trace};
+use log::{debug, error, info};
 use meilisearch_types::error::Code;
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
 use meilisearch_types::milli::update::{
-    IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
+    DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
+    Settings as MilliSettings,
 };
 use meilisearch_types::milli::{self, Filter, BEU32};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
@@ -43,7 +43,7 @@ use uuid::Uuid;
 
 use crate::autobatcher::{self, BatchKind};
 use crate::utils::{self, swap_index_uid_in_task};
-use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
+use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
 
 /// Represents a combination of tasks that can all be processed at the same time.
 ///
@@ -199,29 +199,6 @@ impl Batch {
         }
     }
 }
 
-impl fmt::Display for Batch {
-    /// A text used when we debug the profiling reports.
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        let index_uid = self.index_uid();
-        let tasks = self.ids();
-        match self {
-            Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
-            Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
-            Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
-            Batch::Dump(_) => f.write_str("Dump")?,
-            Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
-            Batch::IndexCreation { .. } => f.write_str("IndexCreation")?,
-            Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?,
-            Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?,
-            Batch::IndexSwap { .. } => f.write_str("IndexSwap")?,
-        };
-        match index_uid {
-            Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")),
-            None => f.write_fmt(format_args!(" from tasks: {tasks:?}")),
-        }
-    }
-}
-
 impl IndexOperation {
     pub fn index_uid(&self) -> &str {
         match self {
@@ -236,30 +213,6 @@ impl IndexOperation {
         }
     }
 }
 
-impl fmt::Display for IndexOperation {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            IndexOperation::DocumentOperation { .. } => {
-                f.write_str("IndexOperation::DocumentOperation")
-            }
-            IndexOperation::DocumentDeletion { .. } => {
-                f.write_str("IndexOperation::DocumentDeletion")
-            }
-            IndexOperation::IndexDocumentDeletionByFilter { .. } => {
-                f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
-            }
-            IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"),
-            IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"),
-            IndexOperation::DocumentClearAndSetting { .. } => {
-                f.write_str("IndexOperation::DocumentClearAndSetting")
-            }
-            IndexOperation::SettingsAndDocumentOperation { .. } => {
-                f.write_str("IndexOperation::SettingsAndDocumentOperation")
-            }
-        }
-    }
-}
-
 impl IndexScheduler {
     /// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`].
     ///
@@ -628,7 +581,7 @@ impl IndexScheduler {
             self.breakpoint(crate::Breakpoint::InsideProcessBatch);
         }
 
-        puffin::profile_function!(batch.to_string());
+        puffin::profile_function!(format!("{:?}", batch));
 
         match batch {
             Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
@@ -895,7 +848,7 @@ impl IndexScheduler {
             })?;
 
             // 4. Dump experimental feature settings
-            let features = self.features().runtime_features();
+            let features = self.features()?.runtime_features();
             dump.create_experimental_features(features)?;
 
             let dump_uid = started_at.format(format_description!(
@@ -1190,7 +1143,7 @@ impl IndexScheduler {
                     index,
                     indexer_config,
                     config,
-                    |indexing_step| trace!("update: {:?}", indexing_step),
+                    |indexing_step| debug!("update: {:?}", indexing_step),
                     || must_stop_processing.get(),
                 )?;
 
@@ -1237,8 +1190,7 @@ impl IndexScheduler {
                     let (new_builder, user_result) =
                         builder.remove_documents(document_ids)?;
                     builder = new_builder;
-                    // Uses Invariant: remove documents actually always returns Ok for the inner result
-                    let count = user_result.unwrap();
+
                     let provided_ids =
                         if let Some(Details::DocumentDeletion { provided_ids, .. }) =
                             task.details
@@ -1249,11 +1201,23 @@ impl IndexScheduler {
                             unreachable!();
                         };
 
-                    task.status = Status::Succeeded;
-                    task.details = Some(Details::DocumentDeletion {
-                        provided_ids,
-                        deleted_documents: Some(count),
-                    });
+                    match user_result {
+                        Ok(count) => {
+                            task.status = Status::Succeeded;
+                            task.details = Some(Details::DocumentDeletion {
+                                provided_ids,
+                                deleted_documents: Some(count),
+                            });
+                        }
+                        Err(e) => {
+                            task.status = Status::Failed;
+                            task.details = Some(Details::DocumentDeletion {
+                                provided_ids,
+                                deleted_documents: Some(0),
+                            });
+                            task.error = Some(milli::Error::from(e).into());
+                        }
+                    }
                 }
             }
         }
@@ -1268,7 +1232,7 @@ impl IndexScheduler {
                     milli::update::Settings::new(index_wtxn, index, indexer_config);
                 builder.reset_primary_key();
                 builder.execute(
-                    |indexing_step| trace!("update: {:?}", indexing_step),
+                    |indexing_step| debug!("update: {:?}", indexing_step),
                     || must_stop_processing.clone().get(),
                 )?;
             }
@@ -1276,42 +1240,21 @@ impl IndexScheduler {
                 Ok(tasks)
             }
             IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
-                let indexer_config = self.index_mapper.indexer_config();
-                let config = IndexDocumentsConfig {
-                    update_method: IndexDocumentsMethod::ReplaceDocuments,
-                    ..Default::default()
-                };
-                let must_stop_processing = self.must_stop_processing.clone();
+                let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
+                documents.iter().flatten().for_each(|id| {
+                    builder.delete_external_id(id);
+                });
 
-                let mut builder = milli::update::IndexDocuments::new(
-                    index_wtxn,
-                    index,
-                    indexer_config,
-                    config,
-                    |indexing_step| trace!("update: {:?}", indexing_step),
-                    || must_stop_processing.get(),
-                )?;
-
-                let document_ids = documents.iter().flatten().cloned().collect();
-
-                let (new_builder, user_result) = builder.remove_documents(document_ids)?;
-                builder = new_builder;
-                // Uses Invariant: remove documents actually always returns Ok for the inner result
-                let count = user_result.unwrap();
+                let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
 
                 for (task, documents) in tasks.iter_mut().zip(documents) {
                     task.status = Status::Succeeded;
                     task.details = Some(Details::DocumentDeletion {
                         provided_ids: documents.len(),
-                        deleted_documents: Some(count.min(documents.len() as u64)),
+                        deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
                     });
                 }
 
-                if !tasks.iter().all(|res| res.error.is_some()) {
-                    let addition = builder.execute()?;
-                    info!("document deletion done: {:?}", addition);
-                }
-
                 Ok(tasks)
             }
             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
@@ -1323,13 +1266,7 @@ impl IndexScheduler {
                 } else {
                     unreachable!()
                 };
-                let deleted_documents = delete_document_by_filter(
-                    index_wtxn,
-                    filter,
-                    self.index_mapper.indexer_config(),
-                    self.must_stop_processing.clone(),
-                    index,
-                );
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
                 let original_filter = if let Some(Details::DocumentDeletionByFilter {
                     original_filter,
                     deleted_documents: _,
@@ -1563,8 +1500,6 @@ impl IndexScheduler {
 fn delete_document_by_filter<'a>(
     wtxn: &mut RwTxn<'a, '_>,
     filter: &serde_json::Value,
-    indexer_config: &IndexerConfig,
-    must_stop_processing: MustStopProcessing,
     index: &'a Index,
 ) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
@@ -1575,41 +1510,9 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(wtxn, candidates)?
-                .only_external_ids()
-                .collect();
-        let document_ids = match external_to_internal {
-            Ok(external_ids) => external_ids,
-            Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
-        };
-
-        let config = IndexDocumentsConfig {
-            update_method: IndexDocumentsMethod::ReplaceDocuments,
-            ..Default::default()
-        };
-
-        let mut builder = milli::update::IndexDocuments::new(
-            wtxn,
-            index,
-            indexer_config,
-            config,
-            |indexing_step| debug!("update: {:?}", indexing_step),
-            || must_stop_processing.get(),
-        )?;
-
-        let (new_builder, user_result) = builder.remove_documents(document_ids)?;
-        builder = new_builder;
-        // Uses Invariant: remove documents actually always returns Ok for the inner result
-        let count = user_result.unwrap();
-
-        let _ = builder.execute()?;
-
-        count
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
+        delete_operation.delete_documents(&candidates);
+        delete_operation.execute().map(|result| result.deleted_documents)?
     } else {
         0
     })
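Note: the removed `fmt::Display` impls exist to give each batch a readable profiling label (variant name plus index uid and task ids). In isolation, that pattern looks like the following sketch, with the enum trimmed to two illustrative variants:

```rust
use std::fmt;

// Trimmed-down version of the Display pattern removed in this diff.
enum Batch {
    Dump,
    IndexOperation { index_uid: String, tasks: Vec<u32> },
}

impl fmt::Display for Batch {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Batch::Dump => f.write_str("Dump from tasks: []"),
            Batch::IndexOperation { index_uid, tasks } => {
                write!(f, "IndexOperation on {index_uid:?} from tasks: {tasks:?}")
            }
        }
    }
}

fn main() {
    let batch = Batch::IndexOperation { index_uid: "movies".into(), tasks: vec![1, 2] };
    // This string is what `puffin::profile_function!(batch.to_string())` records.
    println!("{batch}");
}
```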
@@ -1,8 +1,6 @@
-use std::sync::{Arc, RwLock};
-
 use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
 use meilisearch_types::heed::types::{SerdeJson, Str};
-use meilisearch_types::heed::{Database, Env, RwTxn};
+use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
 
 use crate::error::FeatureNotEnabledError;
 use crate::Result;
@@ -11,19 +9,20 @@ const EXPERIMENTAL_FEATURES: &str = "experimental-features";
 
 #[derive(Clone)]
 pub(crate) struct FeatureData {
-    persisted: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
-    runtime: Arc<RwLock<RuntimeTogglableFeatures>>,
+    runtime: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
+    instance: InstanceTogglableFeatures,
 }
 
 #[derive(Debug, Clone, Copy)]
 pub struct RoFeatures {
     runtime: RuntimeTogglableFeatures,
+    instance: InstanceTogglableFeatures,
 }
 
 impl RoFeatures {
-    fn new(data: &FeatureData) -> Self {
-        let runtime = data.runtime_features();
-        Self { runtime }
+    fn new(txn: RoTxn<'_>, data: &FeatureData) -> Result<Self> {
+        let runtime = data.runtime_features(txn)?;
+        Ok(Self { runtime, instance: data.instance })
     }
 
     pub fn runtime_features(&self) -> RuntimeTogglableFeatures {
@@ -44,13 +43,13 @@ impl RoFeatures {
     }
 
     pub fn check_metrics(&self) -> Result<()> {
-        if self.runtime.metrics {
+        if self.instance.metrics {
             Ok(())
         } else {
             Err(FeatureNotEnabledError {
                 disabled_action: "Getting metrics",
                 feature: "metrics",
-                issue_link: "https://github.com/meilisearch/product/discussions/625",
+                issue_link: "https://github.com/meilisearch/meilisearch/discussions/3518",
             }
             .into())
         }
@@ -68,36 +67,15 @@ impl RoFeatures {
             .into())
         }
     }
-
-    pub fn check_puffin(&self) -> Result<()> {
-        if self.runtime.export_puffin_reports {
-            Ok(())
-        } else {
-            Err(FeatureNotEnabledError {
-                disabled_action: "Outputting Puffin reports to disk",
-                feature: "export puffin reports",
-                issue_link: "https://github.com/meilisearch/product/discussions/693",
-            }
-            .into())
-        }
-    }
 }
 
 impl FeatureData {
     pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result<Self> {
         let mut wtxn = env.write_txn()?;
-        let runtime_features_db = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
+        let runtime_features = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
         wtxn.commit()?;
 
-        let txn = env.read_txn()?;
-        let persisted_features: RuntimeTogglableFeatures =
-            runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default();
-        let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures {
-            metrics: instance_features.metrics || persisted_features.metrics,
-            ..persisted_features
-        }));
-
-        Ok(Self { persisted: runtime_features_db, runtime })
+        Ok(Self { runtime: runtime_features, instance: instance_features })
     }
 
     pub fn put_runtime_features(
@@ -105,25 +83,16 @@ impl FeatureData {
         mut wtxn: RwTxn,
         features: RuntimeTogglableFeatures,
     ) -> Result<()> {
-        self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
+        self.runtime.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
         wtxn.commit()?;
-
-        // safe to unwrap, the lock will only fail if:
-        // 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
-        // 2. there's a panic while the thread is held -> it is only used for an assignment here.
-        let mut toggled_features = self.runtime.write().unwrap();
-        *toggled_features = features;
         Ok(())
     }
 
-    fn runtime_features(&self) -> RuntimeTogglableFeatures {
-        // sound to unwrap, the lock will only fail if:
-        // 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
-        // 2. there's a panic while the thread is held -> it is only used for copying the data here
-        *self.runtime.read().unwrap()
+    fn runtime_features(&self, txn: RoTxn) -> Result<RuntimeTogglableFeatures> {
+        Ok(self.runtime.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default())
    }
 
-    pub fn features(&self) -> RoFeatures {
-        RoFeatures::new(self)
+    pub fn features(&self, txn: RoTxn) -> Result<RoFeatures> {
+        RoFeatures::new(txn, self)
     }
 }
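Note: on the head side, the feature flags live in LMDB under a single well-known key instead of an in-memory `Arc<RwLock<_>>`, so every read goes through a read transaction. A minimal sketch of that storage model, assuming heed's serde-json codec and simplified types (`Features` and `FeatureStore` are illustrative names, not the real ones):

```rust
use heed::types::{SerdeJson, Str};
use heed::{Database, RoTxn, RwTxn};
use serde::{Deserialize, Serialize};

const KEY: &str = "experimental-features";

#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
struct Features {
    score_details: bool,
    vector_store: bool,
}

struct FeatureStore {
    // One key in one database holds the whole JSON-encoded struct.
    db: Database<Str, SerdeJson<Features>>,
}

impl FeatureStore {
    fn read(&self, txn: &RoTxn) -> heed::Result<Features> {
        // A missing key means "everything off", hence unwrap_or_default.
        Ok(self.db.get(txn, KEY)?.unwrap_or_default())
    }

    fn write(&self, txn: &mut RwTxn, features: &Features) -> heed::Result<()> {
        self.db.put(txn, KEY, features)
    }
}
```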
@@ -30,7 +30,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
         index_mapper,
         features: _,
         max_number_of_tasks: _,
-        puffin_frame: _,
         wake_up: _,
         dumps_path: _,
         snapshots_path: _,
@@ -33,7 +33,6 @@ pub type Result<T> = std::result::Result<T, Error>;
 pub type TaskId = u32;
 
 use std::collections::{BTreeMap, HashMap};
-use std::fs::File;
 use std::ops::{Bound, RangeBounds};
 use std::path::{Path, PathBuf};
 use std::sync::atomic::AtomicBool;
@@ -53,7 +52,6 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
-use puffin::FrameView;
 use roaring::RoaringBitmap;
 use synchronoise::SignalEvent;
 use time::format_description::well_known::Rfc3339;
@@ -316,9 +314,6 @@ pub struct IndexScheduler {
     /// the finished tasks automatically.
     pub(crate) max_number_of_tasks: usize,
 
-    /// A frame to output the indexation profiling files to disk.
-    pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
-
     /// The path used to create the dumps.
     pub(crate) dumps_path: PathBuf,
 
@@ -369,7 +364,6 @@ impl IndexScheduler {
             wake_up: self.wake_up.clone(),
             autobatching_enabled: self.autobatching_enabled,
             max_number_of_tasks: self.max_number_of_tasks,
-            puffin_frame: self.puffin_frame.clone(),
             snapshots_path: self.snapshots_path.clone(),
             dumps_path: self.dumps_path.clone(),
             auth_path: self.auth_path.clone(),
@@ -463,7 +457,6 @@ impl IndexScheduler {
             env,
             // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
             wake_up: Arc::new(SignalEvent::auto(true)),
-            puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
             autobatching_enabled: options.autobatching_enabled,
             max_number_of_tasks: options.max_number_of_tasks,
             dumps_path: options.dumps_path,
@@ -579,46 +572,17 @@ impl IndexScheduler {
                 run.wake_up.wait();
 
                 loop {
-                    let puffin_enabled = run.features().check_puffin().is_ok();
-                    puffin::set_scopes_on(puffin_enabled);
-                    puffin::GlobalProfiler::lock().new_frame();
-
                     match run.tick() {
                         Ok(TickOutcome::TickAgain(_)) => (),
                         Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(),
                         Err(e) => {
-                            log::error!("{e}");
+                            log::error!("{}", e);
                             // Wait one second when an irrecoverable error occurs.
                             if !e.is_recoverable() {
                                 std::thread::sleep(Duration::from_secs(1));
                             }
                         }
                     }
-
-                    // Let's write the previous frame to disk but only if
-                    // the user wanted to profile with puffin.
-                    if puffin_enabled {
-                        let mut frame_view = run.puffin_frame.lock();
-                        if !frame_view.is_empty() {
-                            let now = OffsetDateTime::now_utc();
-                            let mut file = match File::create(format!("{}.puffin", now)) {
-                                Ok(file) => file,
-                                Err(e) => {
-                                    log::error!("{e}");
-                                    continue;
-                                }
-                            };
-                            if let Err(e) = frame_view.save_to_writer(&mut file) {
-                                log::error!("{e}");
-                            }
-                            if let Err(e) = file.sync_all() {
-                                log::error!("{e}");
-                            }
-                            // We erase this frame view as it is no more useful. We want to
-                            // measure the new frames now that we exported the previous ones.
-                            *frame_view = FrameView::default();
-                        }
-                    }
                 }
             })
             .unwrap();
@@ -1098,6 +1062,8 @@ impl IndexScheduler {
             self.breakpoint(Breakpoint::Start);
         }
 
+        puffin::GlobalProfiler::lock().new_frame();
+
         self.cleanup_task_queue()?;
 
         let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
@@ -1293,8 +1259,9 @@ impl IndexScheduler {
         Ok(IndexStats { is_indexing, inner_stats: index_stats })
     }
 
-    pub fn features(&self) -> RoFeatures {
-        self.features.features()
+    pub fn features(&self) -> Result<RoFeatures> {
+        let rtxn = self.read_txn()?;
+        self.features.features(rtxn)
     }
 
     pub fn put_runtime_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {
@@ -324,6 +324,7 @@ impl ErrorCode for milli::Error {
             UserError::SerdeJson(_)
             | UserError::InvalidLmdbOpenOptions
             | UserError::DocumentLimitReached
+            | UserError::AccessingSoftDeletedDocument { .. }
             | UserError::UnknownInternalDocumentId { .. } => Code::Internal,
             UserError::InvalidStoreFile => Code::InvalidStoreFile,
             UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
@@ -5,8 +5,6 @@ use serde::{Deserialize, Serialize};
 pub struct RuntimeTogglableFeatures {
     pub score_details: bool,
     pub vector_store: bool,
-    pub metrics: bool,
-    pub export_puffin_reports: bool,
 }
 
 #[derive(Default, Debug, Clone, Copy)]
@@ -69,7 +69,8 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
 pin-project-lite = "0.2.9"
 platform-dirs = "0.3.0"
 prometheus = { version = "0.13.3", features = ["process"] }
-puffin = { version = "0.16.0", features = ["serialization"] }
+puffin = "0.16.0"
+puffin_http = { version = "0.13.0", optional = true }
 rand = "0.8.5"
 rayon = "1.7.0"
 regex = "1.7.3"
@@ -134,6 +135,7 @@ zip = { version = "0.6.4", optional = true }
 [features]
 default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
 analytics = ["segment"]
+profile-with-puffin = ["dep:puffin_http"]
 mini-dashboard = [
   "actix-web-static-files",
   "static-files",
@@ -114,7 +114,10 @@ pub fn create_app(
         .configure(routes::configure)
         .configure(|s| dashboard(s, enable_dashboard));
 
-    let app = app.wrap(middleware::RouteMetrics);
+    let app = app.wrap(actix_web::middleware::Condition::new(
+        opt.experimental_enable_metrics,
+        middleware::RouteMetrics,
+    ));
     app.wrap(
         Cors::default()
             .send_wildcard()
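Note: `actix_web::middleware::Condition` applies the wrapped middleware only when its flag is true, decided once when the `App` is built rather than per request. A minimal standalone sketch (the handler, port, and environment variable are illustrative):

```rust
use actix_web::middleware::{Condition, Logger};
use actix_web::{web, App, HttpServer};

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    // Illustrative flag; in the diff this is opt.experimental_enable_metrics.
    let enable_logging = std::env::var("ENABLE_LOGGING").is_ok();

    HttpServer::new(move || {
        App::new()
            // Logger runs only if the first argument is true, chosen at
            // App construction time, not on every request.
            .wrap(Condition::new(enable_logging, Logger::default()))
            .route("/health", web::get().to(|| async { "ok" }))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
```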
@@ -362,7 +365,7 @@ fn import_dump(
                 update_method: IndexDocumentsMethod::ReplaceDocuments,
                 ..Default::default()
             },
-            |indexing_step| log::trace!("update: {:?}", indexing_step),
+            |indexing_step| log::debug!("update: {:?}", indexing_step),
             || false,
         )?;
 
@@ -30,6 +30,10 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
 async fn main() -> anyhow::Result<()> {
     let (opt, config_read_from) = Opt::try_build()?;
 
+    #[cfg(feature = "profile-with-puffin")]
+    let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
+    puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));
+
     anyhow::ensure!(
         !(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
         "The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"
@@ -3,10 +3,8 @@
 use std::future::{ready, Ready};
 
 use actix_web::dev::{self, Service, ServiceRequest, ServiceResponse, Transform};
-use actix_web::web::Data;
 use actix_web::Error;
 use futures_util::future::LocalBoxFuture;
-use index_scheduler::IndexScheduler;
 use prometheus::HistogramTimer;
 
 pub struct RouteMetrics;
@@ -49,27 +47,19 @@ where
 
     fn call(&self, req: ServiceRequest) -> Self::Future {
         let mut histogram_timer: Option<HistogramTimer> = None;
-
-        // calling unwrap here is safe because index scheduler is added to app data while creating actix app.
-        // also, the tests will fail if this is not present.
-        let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
-        let features = index_scheduler.features();
-
-        if features.check_metrics().is_ok() {
-            let request_path = req.path();
-            let is_registered_resource = req.resource_map().has_resource(request_path);
-            if is_registered_resource {
-                let request_method = req.method().to_string();
-                histogram_timer = Some(
-                    crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
-                        .with_label_values(&[&request_method, request_path])
-                        .start_timer(),
-                );
-                crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
-                    .with_label_values(&[&request_method, request_path])
-                    .inc();
-            }
-        };
+        let request_path = req.path();
+        let is_registered_resource = req.resource_map().has_resource(request_path);
+        if is_registered_resource {
+            let request_method = req.method().to_string();
+            histogram_timer = Some(
+                crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
+                    .with_label_values(&[&request_method, request_path])
+                    .start_timer(),
+            );
+            crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
+                .with_label_values(&[&request_method, request_path])
+                .inc();
+        }
 
         let fut = self.service.call(req);
 
|
||||
@@ -29,12 +29,12 @@ async fn get_features(
     >,
     req: HttpRequest,
     analytics: Data<dyn Analytics>,
-) -> HttpResponse {
-    let features = index_scheduler.features();
+) -> Result<HttpResponse, ResponseError> {
+    let features = index_scheduler.features()?;
 
     analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
     debug!("returns: {:?}", features.runtime_features());
-    HttpResponse::Ok().json(features.runtime_features())
+    Ok(HttpResponse::Ok().json(features.runtime_features()))
 }
 
 #[derive(Debug, Deserr)]
@@ -44,10 +44,6 @@ pub struct RuntimeTogglableFeatures {
     pub score_details: Option<bool>,
     #[deserr(default)]
     pub vector_store: Option<bool>,
-    #[deserr(default)]
-    pub metrics: Option<bool>,
-    #[deserr(default)]
-    pub export_puffin_reports: Option<bool>,
 }
 
 async fn patch_features(
@@ -59,36 +55,26 @@ async fn patch_features(
     req: HttpRequest,
     analytics: Data<dyn Analytics>,
 ) -> Result<HttpResponse, ResponseError> {
-    let features = index_scheduler.features();
+    let features = index_scheduler.features()?;
 
     let old_features = features.runtime_features();
 
     let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
         score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
         vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
-        metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
-        export_puffin_reports: new_features
-            .0
-            .export_puffin_reports
-            .unwrap_or(old_features.export_puffin_reports),
     };
 
     // explicitly destructure for analytics rather than using the `Serialize` implementation, because
     // the it renames to camelCase, which we don't want for analytics.
     // **Do not** ignore fields with `..` or `_` here, because we want to add them in the future.
-    let meilisearch_types::features::RuntimeTogglableFeatures {
-        score_details,
-        vector_store,
-        metrics,
-        export_puffin_reports,
-    } = new_features;
+    let meilisearch_types::features::RuntimeTogglableFeatures { score_details, vector_store } =
+        new_features;
 
     analytics.publish(
         "Experimental features Updated".to_string(),
         json!({
            "score_details": score_details,
            "vector_store": vector_store,
-           "metrics": metrics,
-           "export_puffin_reports": export_puffin_reports,
        }),
        Some(&req),
    );
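For reference, the overlay rule used by `patch_features` above is just `Option::unwrap_or` per field: a toggle omitted from the PATCH body keeps its stored value. A self-contained sketch with simplified stand-in types (not the meilisearch_types definitions):

```rust
// Toy model of the PATCH /experimental-features overlay.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Features {
    score_details: bool,
    vector_store: bool,
}

fn apply_patch(old: Features, score_details: Option<bool>, vector_store: Option<bool>) -> Features {
    Features {
        score_details: score_details.unwrap_or(old.score_details),
        vector_store: vector_store.unwrap_or(old.vector_store),
    }
}

fn main() {
    let old = Features { score_details: false, vector_store: false };
    // A body of `{"vectorStore": true}` leaves scoreDetails untouched:
    let new = apply_patch(old, None, Some(true));
    assert_eq!(new, Features { score_details: false, vector_store: true });
    println!("{new:?}");
}
```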
@@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
 
     let internal_id = index
-        .external_documents_ids()
-        .get(&txn, doc_id)?
+        .external_documents_ids(&txn)?
+        .get(doc_id.as_bytes())
         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
 
     let document = index
@@ -68,7 +68,7 @@ pub async fn search(
     }
 
     let index = index_scheduler.index(&index_uid)?;
-    let features = index_scheduler.features();
+    let features = index_scheduler.features()?;
     let search_result = tokio::task::spawn_blocking(move || {
         perform_facet_search(&index, search_query, facet_query, facet_name, features)
     })
@@ -157,7 +157,7 @@ pub async fn search_with_url_query(
     let mut aggregate = SearchAggregator::from_query(&query, &req);
 
     let index = index_scheduler.index(&index_uid)?;
-    let features = index_scheduler.features();
+    let features = index_scheduler.features()?;
     let search_result =
         tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
     if let Ok(ref search_result) = search_result {
@@ -192,7 +192,7 @@ pub async fn search_with_post(
 
     let index = index_scheduler.index(&index_uid)?;
 
-    let features = index_scheduler.features();
+    let features = index_scheduler.features()?;
     let search_result =
         tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
     if let Ok(ref search_result) = search_result {
@@ -19,7 +19,7 @@ pub async fn get_metrics(
     index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
     auth_controller: Data<AuthController>,
 ) -> Result<HttpResponse, ResponseError> {
-    index_scheduler.features().check_metrics()?;
+    index_scheduler.features()?.check_metrics()?;
     let auth_filters = index_scheduler.filters();
     if !auth_filters.all_indexes_authorized() {
         let mut error = ResponseError::from(AuthenticationError::InvalidToken);
@@ -41,7 +41,7 @@ pub async fn multi_search_with_post(
     let queries = params.into_inner().queries;
 
     let mut multi_aggregate = MultiSearchAggregator::from_queries(&queries, &req);
-    let features = index_scheduler.features();
+    let features = index_scheduler.features()?;
 
     // Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
     // so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code
@@ -2,12 +2,10 @@ use std::collections::{HashMap, HashSet};
 
 use ::time::format_description::well_known::Rfc3339;
 use maplit::{hashmap, hashset};
-use meilisearch::Opt;
 use once_cell::sync::Lazy;
-use tempfile::TempDir;
 use time::{Duration, OffsetDateTime};
 
-use crate::common::{default_settings, Server, Value};
+use crate::common::{Server, Value};
 use crate::json;
 
 pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
@@ -197,9 +195,7 @@ async fn access_authorized_master_key() {
 
 #[actix_rt::test]
 async fn access_authorized_restricted_index() {
-    let dir = TempDir::new().unwrap();
-    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
-    let mut server = Server::new_auth_with_options(enable_metrics, dir).await;
+    let mut server = Server::new_auth().await;
     for ((method, route), actions) in AUTHORIZATIONS.iter() {
         for action in actions {
             // create a new API key letting only the needed action.
@@ -202,10 +202,6 @@ impl Server {
     pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
         self.service.patch("/experimental-features", value).await
     }
-
-    pub async fn get_metrics(&self) -> (Value, StatusCode) {
-        self.service.get("/metrics").await
-    }
 }
 
 pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
@@ -225,7 +221,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
             skip_index_budget: true,
             ..Parser::parse_from(None as Option<&str>)
         },
-        experimental_enable_metrics: false,
+        experimental_enable_metrics: true,
         ..Parser::parse_from(None as Option<&str>)
     }
 }
@@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
         "canceledBy": null,
         "details": {
             "providedIds": 0,
-            "deletedDocuments": 2,
+            "deletedDocuments": 4,
             "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
         },
         "error": null,
@@ -1,7 +1,4 @@
-use meilisearch::Opt;
-use tempfile::TempDir;
-
-use crate::common::{default_settings, Server};
+use crate::common::Server;
 use crate::json;
 
 /// Feature name to test against.
@@ -19,9 +16,7 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": false,
-      "metrics": false,
-      "exportPuffinReports": false
+      "vectorStore": false
     }
     "###);
@@ -31,9 +26,7 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true,
-      "metrics": false,
-      "exportPuffinReports": false
+      "vectorStore": true
     }
     "###);
@@ -43,9 +36,7 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true,
-      "metrics": false,
-      "exportPuffinReports": false
+      "vectorStore": true
     }
     "###);
@@ -56,9 +47,7 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true,
-      "metrics": false,
-      "exportPuffinReports": false
+      "vectorStore": true
     }
     "###);
@@ -69,73 +58,11 @@ async fn experimental_features() {
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
       "scoreDetails": false,
-      "vectorStore": true,
-      "metrics": false,
-      "exportPuffinReports": false
+      "vectorStore": true
     }
     "###);
 }
 
-#[actix_rt::test]
-async fn experimental_feature_metrics() {
-    // instance flag for metrics enables metrics at startup
-    let dir = TempDir::new().unwrap();
-    let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
-    let server = Server::new_with_options(enable_metrics).await.unwrap();
-
-    let (response, code) = server.get_features().await;
-
-    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
-    {
-      "scoreDetails": false,
-      "vectorStore": false,
-      "metrics": true,
-      "exportPuffinReports": false
-    }
-    "###);
-
-    let (response, code) = server.get_metrics().await;
-    meili_snap::snapshot!(code, @"200 OK");
-
-    // metrics are not returned in json format
-    // so the test server will return null
-    meili_snap::snapshot!(response, @"null");
-
-    // disabling metrics results in invalid request
-    let (response, code) = server.set_features(json!({"metrics": false})).await;
-    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(response["metrics"], @"false");
-
-    let (response, code) = server.get_metrics().await;
-    meili_snap::snapshot!(code, @"400 Bad Request");
-    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
-    {
-      "message": "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625",
-      "code": "feature_not_enabled",
-      "type": "invalid_request",
-      "link": "https://docs.meilisearch.com/errors#feature_not_enabled"
-    }
-    "###);
-
-    // enabling metrics via HTTP results in valid request
-    let (response, code) = server.set_features(json!({"metrics": true})).await;
-    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(response["metrics"], @"true");
-
-    let (response, code) = server.get_metrics().await;
-    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(response, @"null");
-
-    // startup without flag respects persisted metrics value
-    let disable_metrics =
-        Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) };
-    let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap();
-    let (response, code) = server_no_flag.get_metrics().await;
-    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(response, @"null");
-}
-
 #[actix_rt::test]
 async fn errors() {
     let server = Server::new().await;
@@ -146,7 +73,7 @@ async fn errors() {
     meili_snap::snapshot!(code, @"400 Bad Request");
     meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
     {
-      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`",
+      "message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`",
      "code": "bad_request",
      "type": "invalid_request",
      "link": "https://docs.meilisearch.com/errors#bad_request"
@@ -1,63 +0,0 @@
-use meili_snap::snapshot;
-use once_cell::sync::Lazy;
-
-use crate::common::{Server, Value};
-use crate::json;
-
-pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
-    json!([
-        {"productId": 1, "shopId": 1},
-        {"productId": 2, "shopId": 1},
-        {"productId": 3, "shopId": 2},
-        {"productId": 4, "shopId": 2},
-        {"productId": 5, "shopId": 3},
-        {"productId": 6, "shopId": 3},
-        {"productId": 7, "shopId": 4},
-        {"productId": 8, "shopId": 4},
-        {"productId": 9, "shopId": 5},
-        {"productId": 10, "shopId": 5}
-    ])
-});
-
-pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
-pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";
-
-/// testing: https://github.com/meilisearch/meilisearch/issues/4078
-#[actix_rt::test]
-async fn distinct_search_with_offset_no_ranking() {
-    let server = Server::new().await;
-    let index = server.index("test");
-
-    let documents = DOCUMENTS.clone();
-    index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
-    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
-    index.wait_task(1).await;
-
-    fn get_hits(Value(response): Value) -> Vec<i64> {
-        let hits_array = response["hits"].as_array().unwrap();
-        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
-    }
-
-    let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
-    let hits = get_hits(response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @"[1, 2]");
-
-    let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
-    let hits = get_hits(response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @"[3, 4]");
-
-    let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
-    let hits = get_hits(response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"1");
-    snapshot!(format!("{:?}", hits), @"[5]");
-
-    let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
-    let hits = get_hits(response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"0");
-}
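The deleted test above pinned down distinct-attribute pagination for issue 4078: hits are deduplicated by `shopId` first, and `offset`/`limit` apply to the deduplicated list. A toy model of that contract, illustration only, not engine code:

```rust
use std::collections::HashSet;

// Keep the first product per shop, then paginate the deduplicated hits.
fn distinct_page(hits: &[(u32, u32)], offset: usize, limit: usize) -> Vec<u32> {
    let mut seen = HashSet::new();
    hits.iter()
        .filter(|&&(_, shop)| seen.insert(shop)) // first hit per distinct shop
        .skip(offset)
        .take(limit)
        .map(|&(_, shop)| shop)
        .collect()
}

fn main() {
    // (productId, shopId) pairs: two products per shop, as in the test data.
    let hits: Vec<(u32, u32)> = (1..=10).map(|p| (p, (p + 1) / 2)).collect();
    assert_eq!(distinct_page(&hits, 0, 2), vec![1, 2]); // first page
    assert_eq!(distinct_page(&hits, 2, 2), vec![3, 4]); // second page
    assert_eq!(distinct_page(&hits, 4, 10), vec![5]);   // tail
    assert_eq!(distinct_page(&hits, 5, 10), Vec::<u32>::new());
}
```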
@@ -1,7 +1,6 @@
 // This modules contains all the test concerning search. Each particular feature of the search
 // should be tested in its own module to isolate tests and keep the tests readable.
 
-mod distinct;
 mod errors;
 mod facet_search;
 mod formatted;
@@ -817,7 +816,7 @@ async fn experimental_feature_score_details() {
         },
         "proximity": {
             "order": 2,
-            "score": 0.75
+            "score": 0.875
         },
         "attribute": {
             "order": 3,
@@ -26,8 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.5.1"
-grenad = { version = "0.4.5", default-features = false, features = [
-    "rayon", "tempfile"
+grenad = { version = "0.4.4", default-features = false, features = [
+    "tempfile",
 ] }
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
     "lmdb", "read-txn-no-tls"
@@ -1,5 +1,4 @@
 use std::fs::File;
-use std::io::BufReader;
 use std::{io, str};
 
 use obkv::KvReader;
@@ -20,14 +19,14 @@ use crate::FieldId;
 pub struct EnrichedDocumentsBatchReader<R> {
     documents: DocumentsBatchReader<R>,
     primary_key: String,
-    external_ids: grenad::ReaderCursor<BufReader<File>>,
+    external_ids: grenad::ReaderCursor<File>,
 }
 
 impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
     pub fn new(
         documents: DocumentsBatchReader<R>,
         primary_key: String,
-        external_ids: grenad::Reader<BufReader<File>>,
+        external_ids: grenad::Reader<File>,
     ) -> Result<Self, Error> {
         if documents.documents_count() as u64 == external_ids.len() {
             Ok(EnrichedDocumentsBatchReader {
@@ -76,7 +75,7 @@ pub struct EnrichedDocument<'a> {
 pub struct EnrichedDocumentsBatchCursor<R> {
     documents: DocumentsBatchCursor<R>,
     primary_key: String,
-    external_ids: grenad::ReaderCursor<BufReader<File>>,
+    external_ids: grenad::ReaderCursor<File>,
 }
 
 impl<R> EnrichedDocumentsBatchCursor<R> {
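The type change above swaps `grenad::ReaderCursor<BufReader<File>>` back to a cursor over a bare `File`. The dropped wrapper is the standard library `BufReader`, which batches small reads into fewer syscalls; a minimal reminder of what it does, assuming only that some readable path exists:

```rust
use std::fs::File;
use std::io::{BufReader, Read};

fn main() -> std::io::Result<()> {
    let file = File::open("Cargo.toml")?; // assumption: run from a crate root
    let mut reader = BufReader::new(file); // buffers reads under the hood
    let mut contents = String::new();
    reader.read_to_string(&mut contents)?;
    println!("read {} bytes through the buffer", contents.len());
    Ok(())
}
```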
@@ -89,6 +89,8 @@ pub enum FieldIdMapMissingEntry {
 
 #[derive(Error, Debug)]
 pub enum UserError {
+    #[error("A soft deleted internal document id have been used: `{document_id}`.")]
+    AccessingSoftDeletedDocument { document_id: DocumentId },
     #[error("A document cannot contain more than 65,535 fields.")]
     AttributeLimitReached,
     #[error(transparent)]
@@ -1,146 +1,159 @@
+use std::borrow::Cow;
 use std::collections::HashMap;
+use std::convert::TryInto;
+use std::{fmt, str};
 
-use heed::types::{OwnedType, Str};
-use heed::{Database, RoIter, RoTxn, RwTxn};
+use fst::map::IndexedValue;
+use fst::{IntoStreamer, Streamer};
 use roaring::RoaringBitmap;
 
-use crate::{DocumentId, BEU32};
+const DELETED_ID: u64 = u64::MAX;
 
-pub enum DocumentOperationKind {
-    Create,
-    Delete,
+pub struct ExternalDocumentsIds<'a> {
+    pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
+    pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
+    soft_deleted_docids: RoaringBitmap,
 }
 
-pub struct DocumentOperation {
-    pub external_id: String,
-    pub internal_id: DocumentId,
-    pub kind: DocumentOperationKind,
-}
+impl<'a> ExternalDocumentsIds<'a> {
+    pub fn new(
+        hard: fst::Map<Cow<'a, [u8]>>,
+        soft: fst::Map<Cow<'a, [u8]>>,
+        soft_deleted_docids: RoaringBitmap,
+    ) -> ExternalDocumentsIds<'a> {
+        ExternalDocumentsIds { hard, soft, soft_deleted_docids }
+    }
 
-pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
-
-impl ExternalDocumentsIds {
-    pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
-        ExternalDocumentsIds(db)
+    pub fn into_static(self) -> ExternalDocumentsIds<'static> {
+        ExternalDocumentsIds {
+            hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
+            soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
+            soft_deleted_docids: self.soft_deleted_docids,
+        }
     }
 
     /// Returns `true` if hard and soft external documents lists are empty.
-    pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
-        self.0.is_empty(rtxn).map_err(Into::into)
+    pub fn is_empty(&self) -> bool {
+        self.hard.is_empty() && self.soft.is_empty()
     }
 
-    pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
-        Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
-    }
-
-    /// An helper function to debug this type, returns an `HashMap` of both,
-    /// soft and hard fst maps, combined.
-    pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
-        let mut map = HashMap::default();
-        for result in self.0.iter(rtxn)? {
-            let (external, internal) = result?;
-            map.insert(external.to_owned(), internal.get());
+    pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
+        let external_id = external_id.as_ref();
+        match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
+            Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
+                Some(id.try_into().unwrap())
+            }
+            _otherwise => None,
         }
-        Ok(map)
     }
 
-    /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
-    /// these internal ids and their external id.
-    ///
-    /// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
-    /// where the returned values can be:
-    /// - `Ok((external_id, internal_id))`: if a mapping was found
-    /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
-    ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
-    ///   the entire fst.
-    pub fn find_external_id_of<'t>(
-        &self,
-        rtxn: &'t RoTxn,
-        internal_ids: RoaringBitmap,
-    ) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
-        self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
-    }
+    /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
+    /// don't contain any soft deleted document id.
+    pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
+        let mut new_hard_builder = fst::MapBuilder::memory();
 
-    /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
-    ///
-    /// If the list contains multiple operations on the same external id, then the result is unspecified.
-    ///
-    /// # Panics
-    ///
-    /// - If attempting to delete a document that doesn't exist
-    /// - If attempting to create a document that already exists
-    pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
-        for DocumentOperation { external_id, internal_id, kind } in operations {
-            match kind {
-                DocumentOperationKind::Create => {
-                    self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
-                }
-                DocumentOperationKind::Delete => {
-                    if !self.0.delete(wtxn, &external_id)? {
-                        panic!("Attempting to delete a non-existing document")
-                    }
-                }
+        let union_op = self.hard.op().add(&self.soft).r#union();
+        let mut iter = union_op.into_stream();
+        while let Some((external_id, docids)) = iter.next() {
+            // prefer selecting the ids from soft, always
+            let id = indexed_last_value(docids).unwrap();
+            if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
+                new_hard_builder.insert(external_id, id)?;
             }
         }
+        drop(iter);
+
+        // Delete soft map completely
+        self.soft = fst::Map::default().map_data(Cow::Owned)?;
+        // We save the new map as the new hard map.
+        self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
 
         Ok(())
     }
 
-    /// Returns an iterator over all the external ids.
-    pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
-        self.0.iter(rtxn)
-    }
-}
+    pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
+        let union_op = self.soft.op().add(other).r#union();
 
-/// An iterator over mappings between requested internal ids and external ids.
-///
-/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
-pub struct ExternalToInternalOwnedIterator<'t> {
-    iter: RoIter<'t, Str, OwnedType<BEU32>>,
-    internal_ids: RoaringBitmap,
-}
-
-impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
-    /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
-    type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        // if all requested ids were found, we won't find any other, so short-circuit
-        if self.internal_ids.is_empty() {
-            return None;
+        let mut new_soft_builder = fst::MapBuilder::memory();
+        let mut iter = union_op.into_stream();
+        while let Some((external_id, marked_docids)) = iter.next() {
+            let id = indexed_last_value(marked_docids).unwrap();
+            new_soft_builder.insert(external_id, id)?;
         }
-        loop {
-            let (external, internal) = match self.iter.next() {
-                Some(Ok((external, internal))) => (external, internal),
-                // TODO manage this better, remove panic
-                Some(Err(e)) => panic!("{}", e),
-                _ => {
-                    // we exhausted the stream but we still have some internal ids to find
-                    let remaining_ids = std::mem::take(&mut self.internal_ids);
-                    return Some(Err(remaining_ids));
-                    // note: next calls to `next` will return `None` since we replaced the internal_ids
-                    // with the default empty bitmap
-                }
-            };
-            let internal = internal.get();
-            let was_contained = self.internal_ids.remove(internal);
-            if was_contained {
-                return Some(Ok((external, internal)));
-            }
+
+        drop(iter);
+
+        // We save the new map as the new soft map.
+        self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
+        self.merge_soft_into_hard()
+    }
+
+    /// An helper function to debug this type, returns an `HashMap` of both,
+    /// soft and hard fst maps, combined.
+    pub fn to_hash_map(&self) -> HashMap<String, u32> {
+        let mut map = HashMap::new();
+
+        let union_op = self.hard.op().add(&self.soft).r#union();
+        let mut iter = union_op.into_stream();
+        while let Some((external_id, marked_docids)) = iter.next() {
+            let id = indexed_last_value(marked_docids).unwrap();
+            if id != DELETED_ID {
+                let external_id = str::from_utf8(external_id).unwrap();
+                map.insert(external_id.to_owned(), id.try_into().unwrap());
+            }
         }
+
+        map
+    }
+
+    /// Return an fst of the combined hard and soft deleted ID.
+    pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
+        if self.soft.is_empty() {
+            return Ok(Cow::Borrowed(&self.hard));
+        }
+        let union_op = self.hard.op().add(&self.soft).r#union();
+
+        let mut iter = union_op.into_stream();
+        let mut new_hard_builder = fst::MapBuilder::memory();
+        while let Some((external_id, marked_docids)) = iter.next() {
+            let value = indexed_last_value(marked_docids).unwrap();
+            if value != DELETED_ID {
+                new_hard_builder.insert(external_id, value)?;
+            }
+        }
+
+        drop(iter);
+
+        Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
+    }
+
+    fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
+        if self.soft.len() >= self.hard.len() / 2 {
+            self.hard = self.to_fst()?.into_owned();
+            self.soft = fst::Map::default().map_data(Cow::Owned)?;
+        }
+
+        Ok(())
     }
 }
 
-impl<'t> ExternalToInternalOwnedIterator<'t> {
-    /// Returns the bitmap of internal ids whose external id are yet to be found
-    pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
-        &self.internal_ids
-    }
-
-    /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
-    ///
-    /// Use this when you don't need the mapping between the external and the internal ids.
-    pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
-        self.map(|res| res.map(|(external, _internal)| external.to_owned()))
+impl fmt::Debug for ExternalDocumentsIds<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
+    }
+}
+
+impl Default for ExternalDocumentsIds<'static> {
+    fn default() -> Self {
+        ExternalDocumentsIds {
+            hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
+            soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
+            soft_deleted_docids: RoaringBitmap::new(),
+        }
+    }
+}
+
+/// Returns the value of the `IndexedValue` with the highest _index_.
+fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
+    indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
 }
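The restored type resolves every lookup through a union of the two fsts, letting the soft layer win and treating `u64::MAX` as a tombstone. A standalone sketch of that rule with the `fst` crate, using toy keys; `DELETED` plays the role of `DELETED_ID`:

```rust
use fst::{IntoStreamer, Map, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    const DELETED: u64 = u64::MAX;
    // fst maps must be built from keys in lexicographic order.
    let hard = Map::from_iter(vec![("a", 0u64), ("b", 1), ("c", 2)])?;
    // The soft layer overrides "b" with a new id and tombstones "c".
    let soft = Map::from_iter(vec![("b", 10u64), ("c", DELETED)])?;

    let mut union = hard.op().add(&soft).union().into_stream();
    while let Some((key, values)) = union.next() {
        // The value with the highest op index comes from `soft`,
        // mirroring what `indexed_last_value` does above.
        let id = values.iter().max_by_key(|iv| iv.index).map(|iv| iv.value).unwrap();
        if id != DELETED {
            println!("{} -> {}", String::from_utf8_lossy(key), id);
        }
    }
    // Prints: a -> 0, b -> 10  ("c" is tombstoned away)
    Ok(())
}
```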
@@ -102,11 +102,11 @@ impl CboRoaringBitmapCodec {
     }
 
     /// Merges a DelAdd delta into a CboRoaringBitmap.
-    pub fn merge_deladd_into<'a>(
+    pub fn merge_deladd_into(
         deladd: KvReaderDelAdd<'_>,
         previous: &[u8],
-        buffer: &'a mut Vec<u8>,
-    ) -> io::Result<Option<&'a [u8]>> {
+        buffer: &mut Vec<u8>,
+    ) -> io::Result<()> {
         // Deserialize the bitmap that is already there
         let mut previous = Self::deserialize_from(previous)?;
 
@@ -120,12 +120,7 @@ impl CboRoaringBitmapCodec {
             previous |= Self::deserialize_from(value)?;
         }
 
-        if previous.is_empty() {
-            return Ok(None);
-        }
-
-        Self::serialize_into(&previous, buffer);
-        Ok(Some(&buffer[..]))
+        previous.serialize_into(buffer)
     }
 }
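The reworked merge applies a DelAdd delta to a previously stored bitmap: ids on the deletion side are removed, ids on the addition side are unioned in, and the result is reserialized into the caller's buffer. The same semantics with plain `roaring` bitmaps, a sketch that skips the Cbo encoding:

```rust
use roaring::RoaringBitmap;

// Apply a DelAdd delta: subtract deletions, then union additions.
fn merge_deladd(previous: &RoaringBitmap, del: &RoaringBitmap, add: &RoaringBitmap) -> RoaringBitmap {
    let mut out = previous - del; // drop the deleted document ids
    out |= add;                   // then union the added ones
    out
}

fn main() {
    let previous: RoaringBitmap = (0u32..10).collect();
    let del: RoaringBitmap = (0u32..3).collect();
    let add: RoaringBitmap = (20u32..23).collect();
    let merged = merge_deladd(&previous, &del, &add);
    assert_eq!(merged.len(), 10); // ids 3..10 plus 20..23
}
```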
@@ -40,6 +40,7 @@ pub mod main_key {
     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
+    pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
     pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
@@ -51,9 +52,11 @@ pub mod main_key {
     /// It is concatenated with a big-endian encoded number (non-human readable).
     /// e.g. vector-hnsw0x0032.
     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
+    pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
+    pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
     pub const STOP_WORDS_KEY: &str = "stop-words";
     pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
     pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
@@ -80,7 +83,6 @@ pub mod db_name {
     pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
     pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
-    pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
@@ -112,9 +114,6 @@ pub struct Index {
     /// Contains many different types (e.g. the fields ids map).
     pub(crate) main: PolyDatabase,
 
-    /// Maps the external documents ids with the internal document id.
-    pub external_documents_ids: Database<Str, OwnedType<BEU32>>,
-
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, CboRoaringBitmapCodec>,
 
@@ -186,15 +185,13 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(26);
+        options.max_dbs(25);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
         let mut wtxn = env.write_txn()?;
         let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
         let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
-        let external_documents_ids =
-            env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?;
         let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
         let exact_word_prefix_docids =
@@ -240,7 +237,6 @@ impl Index {
         Ok(Index {
             env,
             main,
-            external_documents_ids,
             word_docids,
             exact_word_docids,
             word_prefix_docids,
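`max_dbs` drops from 26 to 25 because the dedicated external-documents-ids database disappears: every named LMDB database must be declared up front when the environment is opened. A minimal illustration of that bookkeeping with heed, written against the forked 0.12-style API used above (the `create_database` signature taking a write transaction is taken from this codebase; `tempfile` only provides a scratch directory):

```rust
use heed::types::Str;
use heed::EnvOpenOptions;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let dir = tempfile::tempdir()?;
    let env = EnvOpenOptions::new()
        .map_size(10 * 1024 * 1024) // 10 MiB is plenty for a demo
        .max_dbs(2) // one slot per named database we plan to create
        .open(dir.path())?;

    let mut wtxn = env.write_txn()?;
    let _first: heed::Database<Str, Str> = env.create_database(&mut wtxn, Some("first"))?;
    let _second: heed::Database<Str, Str> = env.create_database(&mut wtxn, Some("second"))?;
    // A third create_database call here would exceed max_dbs and fail.
    wtxn.commit()?;
    Ok(())
}
```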
@@ -372,6 +368,29 @@ impl Index {
         Ok(count.unwrap_or_default())
     }
 
+    /* deleted documents ids */
+
+    /// Writes the soft deleted documents ids.
+    pub(crate) fn put_soft_deleted_documents_ids(
+        &self,
+        wtxn: &mut RwTxn,
+        docids: &RoaringBitmap,
+    ) -> heed::Result<()> {
+        self.main.put::<_, Str, RoaringBitmapCodec>(
+            wtxn,
+            main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
+            docids,
+        )
+    }
+
+    /// Returns the soft deleted documents ids.
+    pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
+        Ok(self
+            .main
+            .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
+            .unwrap_or_default())
+    }
+
     /* primary key */
 
     /// Writes the documents primary key, this is the field name that is used to store the id.
@@ -392,10 +411,45 @@ impl Index {
 
     /* external documents ids */
 
+    /// Writes the external documents ids and internal ids (i.e. `u32`).
+    pub(crate) fn put_external_documents_ids(
+        &self,
+        wtxn: &mut RwTxn,
+        external_documents_ids: &ExternalDocumentsIds<'_>,
+    ) -> heed::Result<()> {
+        let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
+        let hard = hard.as_fst().as_bytes();
+        let soft = soft.as_fst().as_bytes();
+        self.main.put::<_, Str, ByteSlice>(
+            wtxn,
+            main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
+            hard,
+        )?;
+        self.main.put::<_, Str, ByteSlice>(
+            wtxn,
+            main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
+            soft,
+        )?;
+        Ok(())
+    }
+
     /// Returns the external documents ids map which associate the external ids
     /// with the internal ids (i.e. `u32`).
-    pub fn external_documents_ids(&self) -> ExternalDocumentsIds {
-        ExternalDocumentsIds::new(self.external_documents_ids)
+    pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
+        let hard =
+            self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
+        let soft =
+            self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
+        let hard = match hard {
+            Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
+            None => fst::Map::default().map_data(Cow::Owned)?,
+        };
+        let soft = match soft {
+            Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
+            None => fst::Map::default().map_data(Cow::Owned)?,
+        };
+        let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
+        Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
     }
 
     /* fields ids map */
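The accessor above rebuilds the two fsts from raw bytes stored under the main-database keys. Round-tripping an `fst::Map` through bytes looks like this in isolation, with toy data:

```rust
use fst::Map;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let map = Map::from_iter(vec![("doc-1", 42u64)])?;
    let bytes: Vec<u8> = map.as_fst().as_bytes().to_vec(); // what the writer stores
    let reloaded = Map::new(bytes)?;                       // what the reader rebuilds
    assert_eq!(reloaded.get("doc-1"), Some(42));
    Ok(())
}
```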
@@ -1150,7 +1204,12 @@ impl Index {
         rtxn: &'t RoTxn,
         ids: impl IntoIterator<Item = DocumentId> + 'a,
     ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+
         Ok(ids.into_iter().map(move |id| {
+            if soft_deleted_documents.contains(id) {
+                return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?;
+            }
             let kv = self
                 .documents
                 .get(rtxn, &BEU32::new(id))?
@@ -1376,10 +1435,14 @@ impl Index {
         rtxn: &RoTxn,
         key: &(Script, Language),
     ) -> heed::Result<Option<RoaringBitmap>> {
-        self.script_language_docids.get(rtxn, key)
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+        let doc_ids = self.script_language_docids.get(rtxn, key)?;
+        Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
     }
 
     pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
+        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
+
         let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
         let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new();
         let mut total = 0;
@@ -1387,7 +1450,7 @@ impl Index {
             let ((script, language), docids) = sl?;
 
             // keep only Languages that contains at least 1 document.
-            let remaining_documents_count = docids.len();
+            let remaining_documents_count = (docids - &soft_deleted_documents).len();
             total += remaining_documents_count;
             if remaining_documents_count > 0 {
                 script_language_doc_count.push((script, language, remaining_documents_count));
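Both getters above hide soft-deleted documents the same way: subtract the soft-deleted bitmap from whatever docid set was fetched. The operation in isolation, with illustrative values:

```rust
use roaring::RoaringBitmap;

fn main() {
    let docids: RoaringBitmap = (0u32..8).collect();
    let soft_deleted: RoaringBitmap = [1u32, 5].into_iter().collect();
    // Same shape as `ids - soft_deleted_documents` in the hunk above.
    let visible = &docids - &soft_deleted;
    assert_eq!(visible.len(), 6);
    assert!(!visible.contains(1) && !visible.contains(5));
}
```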
@@ -1423,7 +1486,8 @@ pub(crate) mod tests {
     use crate::error::{Error, InternalError};
     use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
     use crate::update::{
-        self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
+        self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
+        IndexDocumentsMethod, IndexerConfig, Settings,
     };
     use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
@@ -1513,36 +1577,16 @@ pub(crate) mod tests {
         Ok(())
     }
 
-    pub fn delete_documents_using_wtxn<'t>(
-        &'t self,
-        wtxn: &mut RwTxn<'t, '_>,
-        external_document_ids: Vec<String>,
-    ) {
-        let builder = IndexDocuments::new(
-            wtxn,
-            self,
-            &self.indexer_config,
-            self.index_documents_config.clone(),
-            |_| (),
-            || false,
-        )
-        .unwrap();
-        let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-    }
-
-    pub fn delete_documents(&self, external_document_ids: Vec<String>) {
+    pub fn delete_document(&self, external_document_id: &str) {
         let mut wtxn = self.write_txn().unwrap();
 
-        self.delete_documents_using_wtxn(&mut wtxn, external_document_ids);
+        let mut delete = DeleteDocuments::new(&mut wtxn, self).unwrap();
+        delete.strategy(self.index_documents_config.deletion_strategy);
+
+        delete.delete_external_id(external_document_id);
+        delete.execute().unwrap();
         wtxn.commit().unwrap();
     }
-
-    pub fn delete_document(&self, external_document_id: &str) {
-        self.delete_documents(vec![external_document_id.to_string()])
-    }
 }
 
 #[test]
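The rewritten helper above funnels deletion through `DeleteDocuments` with a configurable strategy. A toy model of the soft/hard contract these tests toggle between, illustration only, since the real logic lives in milli's update pipeline:

```rust
use roaring::RoaringBitmap;

enum DeletionStrategy {
    AlwaysSoft, // record ids in the soft-deleted bitmap, keep data in place
    AlwaysHard, // purge every per-id entry immediately
}

struct Store {
    documents_ids: RoaringBitmap,
    soft_deleted: RoaringBitmap,
}

impl Store {
    fn delete(&mut self, id: u32, strategy: &DeletionStrategy) {
        self.documents_ids.remove(id);
        match strategy {
            DeletionStrategy::AlwaysSoft => {
                self.soft_deleted.insert(id); // cleaned up lazily, later
            }
            DeletionStrategy::AlwaysHard => {
                // in the real engine, all databases are purged of `id` here
            }
        }
    }
}

fn main() {
    let mut store =
        Store { documents_ids: (0u32..2).collect(), soft_deleted: RoaringBitmap::new() };
    store.delete(1, &DeletionStrategy::AlwaysSoft);
    assert_eq!(store.documents_ids.len(), 1);
    assert!(store.soft_deleted.contains(1));
}
```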
@@ -1856,7 +1900,9 @@ pub(crate) mod tests {
     use big_s::S;
     use maplit::hashset;
 
-    let index = TempIndex::new();
+    let mut index = TempIndex::new();
+    index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
+    let index = index;
 
     index
         .update_settings(|settings| {
@@ -1875,12 +1921,14 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
     db_snap!(index, external_documents_ids, 1, @r###"
-    docids:
+    soft:
+    hard:
     0 0
     1 1
     2 2
     3 3
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
     db_snap!(index, facet_id_f64_docids, 1, @r###"
     1 0 0 1 [0, ]
     1 0 1 1 [1, ]
@@ -1896,37 +1944,44 @@ pub(crate) mod tests {
     }
     index.add_documents(documents!(docs)).unwrap();
 
-    db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
+    db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]");
     db_snap!(index, external_documents_ids, 2, @r###"
-    docids:
-    0 0
-    1 1
-    2 2
+    soft:
+    hard:
+    0 4
+    1 5
+    2 6
     3 3
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]");
     db_snap!(index, facet_id_f64_docids, 2, @r###"
-    1 0 1 1 [0, ]
-    1 0 2 1 [1, ]
-    1 0 3 1 [2, 3, ]
+    1 0 0 1 [0, ]
+    1 0 1 1 [1, 4, ]
+    1 0 2 1 [2, 5, ]
+    1 0 3 1 [3, 6, ]
     "###);
 
     index
         .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }]))
         .unwrap();
 
-    db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
+    db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]");
     db_snap!(index, external_documents_ids, 3, @r###"
-    docids:
-    0 0
-    1 1
-    2 2
+    soft:
+    3 7
+    hard:
+    0 4
+    1 5
+    2 6
     3 3
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]");
     db_snap!(index, facet_id_f64_docids, 3, @r###"
-    1 0 1 1 [0, ]
-    1 0 2 1 [1, ]
-    1 0 3 1 [2, ]
-    1 0 4 1 [3, ]
+    1 0 0 1 [0, ]
+    1 0 1 1 [1, 4, ]
+    1 0 2 1 [2, 5, ]
+    1 0 3 1 [3, 6, ]
+    1 0 4 1 [7, ]
     "###);
 
     index
@@ -1935,30 +1990,300 @@ pub(crate) mod tests {
         })
         .unwrap();
 
-    db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
+    db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]");
     db_snap!(index, external_documents_ids, 3, @r###"
-    docids:
-    0 0
-    1 1
-    2 2
-    3 3
+    soft:
+    hard:
+    0 4
+    1 5
+    2 6
+    3 7
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
     db_snap!(index, facet_id_f64_docids, 3, @r###"
-    0 0 0 1 [0, ]
-    0 0 1 1 [1, ]
-    0 0 2 1 [2, ]
-    0 0 3 1 [3, ]
-    1 0 1 1 [0, ]
-    1 0 2 1 [1, ]
-    1 0 3 1 [2, ]
-    1 0 4 1 [3, ]
+    0 0 0 1 [4, ]
+    0 0 1 1 [5, ]
+    0 0 2 1 [6, ]
+    0 0 3 1 [7, ]
+    1 0 1 1 [4, ]
+    1 0 2 1 [5, ]
+    1 0 3 1 [6, ]
+    1 0 4 1 [7, ]
     "###);
 }
 
+#[test]
+fn replace_documents_in_batches_external_ids_and_soft_deletion_check() {
+    use big_s::S;
+    use maplit::hashset;
+
+    let mut index = TempIndex::new();
+
+    index
+        .update_settings(|settings| {
+            settings.set_primary_key("id".to_owned());
+            settings.set_filterable_fields(hashset! { S("doggo") });
+        })
+        .unwrap();
+
+    let add_documents = |index: &TempIndex, docs: Vec<Vec<serde_json::Value>>| {
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(
+            &mut wtxn,
+            index,
+            &index.indexer_config,
+            index.index_documents_config.clone(),
+            |_| (),
+            || false,
+        )
+        .unwrap();
+        for docs in docs {
+            (builder, _) = builder.add_documents(documents!(docs)).unwrap();
+        }
+        builder.execute().unwrap();
+        wtxn.commit().unwrap();
+    };
+    // First Batch
+    {
+        let mut docs1 = vec![];
+        for i in 0..4 {
+            docs1.push(serde_json::json!(
+                { "id": i, "doggo": i }
+            ));
+        }
+        add_documents(&index, vec![docs1]);
+
+        db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
+        db_snap!(index, external_documents_ids, 1, @r###"
+        soft:
+        hard:
+        0 0
+        1 1
+        2 2
+        3 3
+        "###);
+        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
+        db_snap!(index, facet_id_f64_docids, 1, @r###"
+        1 0 0 1 [0, ]
+        1 0 1 1 [1, ]
+        1 0 2 1 [2, ]
+        1 0 3 1 [3, ]
+        "###);
+    }
+    // Second Batch: replace the documents with soft-deletion
+    {
+        index.index_documents_config.deletion_strategy =
+            crate::update::DeletionStrategy::AlwaysSoft;
+        let mut docs1 = vec![];
+        for i in 0..3 {
+            docs1.push(serde_json::json!(
+                { "id": i, "doggo": i+1 }
+            ));
+        }
+        let mut docs2 = vec![];
+        for i in 0..3 {
+            docs2.push(serde_json::json!(
+                { "id": i, "doggo": i }
+            ));
+        }
+        add_documents(&index, vec![docs1, docs2]);
+
+        db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]");
+        db_snap!(index, external_documents_ids, 1, @r###"
+        soft:
+        hard:
+        0 4
+        1 5
+        2 6
+        3 3
+        "###);
+        db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]");
+        db_snap!(index, facet_id_f64_docids, 1, @r###"
+        1 0 0 1 [0, 4, ]
+        1 0 1 1 [1, 5, ]
+        1 0 2 1 [2, 6, ]
+        1 0 3 1 [3, ]
+        "###);
+    }
+    let rtxn = index.read_txn().unwrap();
+    let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(3),
+        "doggo": Number(3),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0];
+
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(0),
+        "doggo": Number(0),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(1),
+        "doggo": Number(1),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(2),
+        "doggo": Number(2),
+    }
+    "###);
+    drop(rtxn);
+    // Third Batch: replace the documents with soft-deletion again
+    {
+        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
+        let mut docs1 = vec![];
+        for i in 0..3 {
+            docs1.push(serde_json::json!(
+                { "id": i, "doggo": i+1 }
+            ));
+        }
+        let mut docs2 = vec![];
+        for i in 0..4 {
+            docs2.push(serde_json::json!(
+                { "id": i, "doggo": i }
+            ));
+        }
+        add_documents(&index, vec![docs1, docs2]);
+
+        db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]");
+        db_snap!(index, external_documents_ids, 1, @r###"
+        soft:
+        hard:
+        0 7
+        1 8
+        2 9
+        3 3
+        "###);
+        db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]");
+        db_snap!(index, facet_id_f64_docids, 1, @r###"
+        1 0 0 1 [0, 4, 7, ]
+        1 0 1 1 [1, 5, 8, ]
+        1 0 2 1 [2, 6, 9, ]
+        1 0 3 1 [3, ]
+        "###);
+    }
+    let rtxn = index.read_txn().unwrap();
+    let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(3),
+        "doggo": Number(3),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(0),
+        "doggo": Number(0),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(1),
+        "doggo": Number(1),
+    }
+    "###);
+    let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0];
+    let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+    insta::assert_debug_snapshot!(json, @r###"
+    {
+        "id": Number(2),
+        "doggo": Number(2),
+    }
+    "###);
+    drop(rtxn);
+
+    // Fourth Batch: replace the documents without soft-deletion
+    {
+        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
+        let mut docs1 = vec![];
+        for i in 0..3 {
+            docs1.push(serde_json::json!(
+                { "id": i, "doggo": i+2 }
+            ));
+        }
+        let mut docs2 = vec![];
+        for i in 0..1 {
+            docs2.push(serde_json::json!(
+                { "id": i, "doggo": i }
+            ));
+        }
+        add_documents(&index, vec![docs1, docs2]);
+
+        db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]");
+        db_snap!(index, external_documents_ids, 1, @r###"
+        soft:
+        hard:
+        0 10
+        1 11
+        2 12
+        3 3
+        "###);
+        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
+        db_snap!(index, facet_id_f64_docids, 1, @r###"
+        1 0 0 1 [10, ]
+        1 0 3 1 [3, 11, ]
+        1 0 4 1 [12, ]
+        "###);
+
+        let rtxn = index.read_txn().unwrap();
+        let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
+        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+        insta::assert_debug_snapshot!(json, @r###"
+        {
+            "id": Number(3),
+            "doggo": Number(3),
+        }
+        "###);
+        let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0];
+        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+        insta::assert_debug_snapshot!(json, @r###"
+        {
+            "id": Number(0),
+            "doggo": Number(0),
+        }
+        "###);
+        let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0];
+        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+        insta::assert_debug_snapshot!(json, @r###"
+        {
+            "id": Number(1),
+            "doggo": Number(3),
+        }
+        "###);
+        let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0];
+        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
+        insta::assert_debug_snapshot!(json, @r###"
+        {
+            "id": Number(2),
+            "doggo": Number(4),
+        }
+        "###);
+        drop(rtxn);
+    }
+}
+
 #[test]
 fn bug_3021_first() {
     // https://github.com/meilisearch/meilisearch/issues/3021
     let mut index = TempIndex::new();
+    index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
     index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
 
     index
@@ -1976,18 +2301,23 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, ]");
     db_snap!(index, external_documents_ids, 1, @r###"
-    docids:
+    soft:
+    hard:
     34 1
     38 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
 
     index.delete_document("34");
 
     db_snap!(index, documents_ids, @"[0, ]");
     db_snap!(index, external_documents_ids, 2, @r###"
-    docids:
+    soft:
+    hard:
+    34 1
     38 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
 
     index
         .update_settings(|s| {
@@ -1999,9 +2329,11 @@ pub(crate) mod tests {
     // do not contain any entry for previously soft-deleted document ids
     db_snap!(index, documents_ids, @"[0, ]");
     db_snap!(index, external_documents_ids, 3, @r###"
-    docids:
+    soft:
+    hard:
     38 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
 
     // So that this document addition works correctly now.
     // It would be wrongly interpreted as a replacement before
@@ -2009,19 +2341,24 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, ]");
     db_snap!(index, external_documents_ids, 4, @r###"
-    docids:
+    soft:
+    hard:
     34 1
     38 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
 
     // We do the test again, but deleting the document with id 0 instead of id 1 now
    index.delete_document("38");
 
     db_snap!(index, documents_ids, @"[1, ]");
     db_snap!(index, external_documents_ids, 5, @r###"
-    docids:
+    soft:
+    hard:
     34 1
+    38 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]");
 
     index
         .update_settings(|s| {
@@ -2031,9 +2368,11 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[1, ]");
     db_snap!(index, external_documents_ids, 6, @r###"
-    docids:
+    soft:
+    hard:
     34 1
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
 
     // And adding lots of documents afterwards instead of just one.
     // These extra subtests don't add much, but it's better than nothing.
@@ -2041,7 +2380,8 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
     db_snap!(index, external_documents_ids, 7, @r###"
-    docids:
+    soft:
+    hard:
     34 1
     38 0
     39 2
@@ -2049,38 +2389,14 @@ pub(crate) mod tests {
     41 3
     42 5
     "###);
-}
-
-#[test]
-fn simple_delete() {
-    let mut index = TempIndex::new();
-    index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-    index
-        .add_documents(documents!([
-            { "id": 30 },
-            { "id": 34 }
-        ]))
-        .unwrap();
-
-    db_snap!(index, documents_ids, @"[0, 1, ]");
-    db_snap!(index, external_documents_ids, 1, @r###"
-    docids:
-    30 0
-    34 1"###);
-
-    index.delete_document("34");
-
-    db_snap!(index, documents_ids, @"[0, ]");
-    db_snap!(index, external_documents_ids, 2, @r###"
-    docids:
-    30 0
-    "###);
+    db_snap!(index, soft_deleted_documents_ids, 7, @"[]");
 }
 
 #[test]
 fn bug_3021_second() {
     // https://github.com/meilisearch/meilisearch/issues/3021
     let mut index = TempIndex::new();
+    index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
     index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 
     index
@@ -2098,18 +2414,23 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, ]");
     db_snap!(index, external_documents_ids, 1, @r###"
-    docids:
+    soft:
+    hard:
     30 0
     34 1
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
 
     index.delete_document("34");
 
     db_snap!(index, documents_ids, @"[0, ]");
     db_snap!(index, external_documents_ids, 2, @r###"
-    docids:
+    soft:
+    hard:
     30 0
+    34 1
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
 
     index
         .update_settings(|s| {
@@ -2121,9 +2442,11 @@ pub(crate) mod tests {
     // do not contain any entry for previously soft-deleted document ids
     db_snap!(index, documents_ids, @"[0, ]");
     db_snap!(index, external_documents_ids, 3, @r###"
-    docids:
+    soft:
+    hard:
     30 0
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
 
     // So that when we add a new document
     index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap();
@@ -2132,10 +2455,12 @@ pub(crate) mod tests {
     // The external documents ids don't have several external ids pointing to the same
     // internal document id
     db_snap!(index, external_documents_ids, 4, @r###"
-    docids:
+    soft:
+    hard:
     30 0
     35 1
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
 
     // And when we add 34 again, we don't replace document 35
     index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap();
@@ -2143,11 +2468,13 @@ pub(crate) mod tests {
     // And document 35 still exists, is not deleted
     db_snap!(index, documents_ids, @"[0, 1, 2, ]");
     db_snap!(index, external_documents_ids, 5, @r###"
-    docids:
+    soft:
+    hard:
     30 0
     34 2
     35 1
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 5, @"[]");
 
     let rtxn = index.read_txn().unwrap();
     let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0];
@@ -2179,7 +2506,8 @@ pub(crate) mod tests {
 
     db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
     db_snap!(index, external_documents_ids, 6, @r###"
-    docids:
+    soft:
+    hard:
     30 0
     34 2
     35 1
@@ -2187,12 +2515,14 @@ pub(crate) mod tests {
     38 4
     39 5
     "###);
+    db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
 }
 
 #[test]
 fn bug_3021_third() {
     // https://github.com/meilisearch/meilisearch/issues/3021
     let mut index = TempIndex::new();
+    index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
     index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 
     index
@@ -2211,29 +2541,38 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
|
||||
db_snap!(index, external_documents_ids, 1, @r###"
|
||||
docids:
|
||||
soft:
|
||||
hard:
|
||||
3 0
|
||||
4 1
|
||||
5 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
|
||||
|
||||
index.delete_document("3");
|
||||
|
||||
db_snap!(index, documents_ids, @"[1, 2, ]");
|
||||
db_snap!(index, external_documents_ids, 2, @r###"
|
||||
docids:
|
||||
soft:
|
||||
hard:
|
||||
3 0
|
||||
4 1
|
||||
5 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]");
|
||||
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
|
||||
|
||||
index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[1, 2, ]");
|
||||
db_snap!(index, documents_ids, @"[2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, 2, @r###"
|
||||
docids:
|
||||
4 1
|
||||
soft:
|
||||
hard:
|
||||
4 3
|
||||
5 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
@@ -2241,13 +2580,15 @@ pub(crate) mod tests {
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
|
||||
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, 2, @r###"
|
||||
docids:
|
||||
soft:
|
||||
hard:
|
||||
3 0
|
||||
4 1
|
||||
4 3
|
||||
5 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -2255,6 +2596,7 @@ pub(crate) mod tests {
|
||||
// https://github.com/meilisearch/meilisearch/issues/3021
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
@@ -2271,10 +2613,12 @@ pub(crate) mod tests {
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
docids:
|
||||
soft:
|
||||
hard:
|
||||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
@@ -2283,23 +2627,31 @@ pub(crate) mod tests {
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
|
||||
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
docids:
|
||||
1 2
|
||||
soft:
|
||||
hard:
|
||||
1 3
|
||||
11 0
|
||||
4 1
|
||||
4 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
|
||||
|
||||
index.delete_documents(Default::default());
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
|
||||
delete.strategy(DeletionStrategy::AlwaysHard);
|
||||
delete.execute().unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
|
||||
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
docids:
|
||||
1 2
|
||||
soft:
|
||||
hard:
|
||||
1 3
|
||||
11 0
|
||||
4 1
|
||||
4 2
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[]");
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
@@ -2308,13 +2660,15 @@ pub(crate) mod tests {
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
|
||||
db_snap!(index, documents_ids, @"[0, 1, 4, ]");
|
||||
db_snap!(index, external_documents_ids, @r###"
|
||||
docids:
|
||||
1 2
|
||||
soft:
|
||||
hard:
|
||||
1 4
|
||||
11 0
|
||||
4 1
|
||||
"###);
|
||||
db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let search = Search::new(&rtxn, &index);
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::cmp;
|
||||
|
||||
use crate::{relative_from_absolute_position, Position};
|
||||
|
||||
pub const MAX_DISTANCE: u32 = 4;
|
||||
pub const MAX_DISTANCE: u32 = 8;
|
||||
|
||||
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||
if lhs <= rhs {
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
||||
/// The documents returned by the iterator are grouped by the facet values that
|
||||
/// determined their rank. For example, given the documents:
|
||||
///
|
||||
/// ```text
|
||||
/// ```ignore
|
||||
/// 0: { "colour": ["blue", "green"] }
|
||||
/// 1: { "colour": ["blue", "red"] }
|
||||
/// 2: { "colour": ["orange", "red"] }
|
||||
@@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
|
||||
/// ```
|
||||
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
|
||||
/// over the following elements:
|
||||
/// ```text
|
||||
/// ```ignore
|
||||
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
|
||||
/// [3] // same for "green"
|
||||
/// [2] // same for "orange"
|
||||
|
||||
@@ -223,9 +223,12 @@ impl<'a> Filter<'a> {
|
||||
impl<'a> Filter<'a> {
|
||||
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
||||
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
||||
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
|
||||
let filterable_fields = index.filterable_fields(rtxn)?;
|
||||
|
||||
// and finally we delete all the soft_deleted_documents, again, only once at the very end
|
||||
self.inner_evaluate(rtxn, index, &filterable_fields)
|
||||
.map(|result| result - soft_deleted_documents)
|
||||
}
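
The hunk above fetches the soft-deleted set once and subtracts it once from the final filter result, instead of filtering it out in every recursive call. A standalone, hypothetical illustration of that final subtraction (set difference on roaring bitmaps):

use roaring::RoaringBitmap;

// `filter_result` is what inner_evaluate() would return; `soft_deleted` is the
// bitmap read once at the top of evaluate(). `-` on RoaringBitmap is set
// difference, so soft-deleted documents never surface in filter results.
fn finalize(filter_result: RoaringBitmap, soft_deleted: &RoaringBitmap) -> RoaringBitmap {
    filter_result - soft_deleted
}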

    fn evaluate_operator(

@@ -46,27 +46,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    if let Some(distinct_fid) = distinct_fid {
        let mut excluded = RoaringBitmap::new();
        let mut results = vec![];
        let mut skip = 0;
        for docid in universe.iter() {
            if results.len() >= length {
            if results.len() >= from + length {
                break;
            }
            if excluded.contains(docid) {
                continue;
            }

            distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
            skip += 1;
            if skip <= from {
                continue;
            }

            results.push(docid);
        }

        let mut all_candidates = universe - excluded;
        all_candidates.extend(results.iter().copied());

        return Ok(BucketSortOutput {
            scores: vec![Default::default(); results.len()],
            docids: results,
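
A simplified, self-contained sketch of offset pagination under a distinct attribute, consistent with the hunk above (the function name is illustrative, and the real `distinct_single_docid` call is stubbed out with a comment): the loop deduplicates while walking the universe, collects up to `from + length` hits, and the first `from` of them are dropped because they belong to earlier pages.

use roaring::RoaringBitmap;

fn page_of_distinct(universe: &RoaringBitmap, from: usize, length: usize) -> Vec<u32> {
    let mut excluded = RoaringBitmap::new();
    let mut results = Vec::new();
    for docid in universe.iter() {
        if results.len() >= from + length {
            break;
        }
        if excluded.contains(docid) {
            continue;
        }
        // The real code calls distinct_single_docid() here, which inserts every
        // other document sharing this document's distinct value into `excluded`.
        excluded.insert(docid);
        results.push(docid);
    }
    // Keep only the requested window; earlier hits belong to previous pages.
    results.into_iter().skip(from).take(length).collect()
}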

@@ -1,7 +1,6 @@
#![allow(clippy::too_many_arguments)]

use super::ProximityCondition;
use crate::proximity::MAX_DISTANCE;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
@@ -36,7 +35,7 @@ pub fn build_edges(
    }

    let mut conditions = vec![];
    for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
    for cost in right_ngram_max..(7 + right_ngram_max) {
        conditions.push((
            cost as u32,
            conditions_interner.insert(ProximityCondition::Uninit {
@@ -48,7 +47,7 @@ pub fn build_edges(
    }

    conditions.push((
        ((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
        (7 + right_ngram_max) as u32,
        conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
    ));
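
The avalanche of snapshot updates below follows directly from MAX_DISTANCE moving from 4 to 8: the maximum cost per adjacent term pair goes from 3 to 7. Assuming rank = max_rank - total_cost with a per-pair cost in 0..=MAX_DISTANCE - 1 (an inference from the snapshot data, not a statement of the exact scoring code), the maximum proximity rank for an n-term query is:

// (n_terms - 1) pairs, each contributing up to MAX_DISTANCE - 1 cost, plus 1.
fn max_proximity_rank(max_distance: u32, n_terms: u32) -> u32 {
    (n_terms - 1) * (max_distance - 1) + 1
}

// max_proximity_rank(4, 9) == 25 and max_proximity_rank(8, 9) == 57, matching
// the 25 -> 57 updates below; likewise 7 -> 15 (3 terms), 10 -> 22 (4 terms),
// 19 -> 43 (7 terms), and 22 -> 50 (8 terms).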

@@ -273,7 +273,7 @@ fn test_proximity_simple() {
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 5, 2, 3, 0, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
@@ -282,11 +282,11 @@ fn test_proximity_simple() {
        "\"the quickbrown fox jumps over the lazy dog\"",
        "\"the really quick brown fox jumps over the lazy dog\"",
        "\"the really quick brown fox jumps over the very lazy dog\"",
        "\"brown quick fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy. dog\"",
        "\"dog the quick brown fox jumps over the lazy\"",
        "\"brown quick fox jumps over the lazy dog\"",
        "\"the. quick brown fox jumps over the lazy. dog\"",
        "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
        "\"the. quick brown fox jumps over the lazy. dog\"",
    ]
    "###);
}
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    s.query("best s");
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);

@@ -382,9 +382,9 @@ fn test_proximity_prefix_db() {
        "\"summer best\"",
        "\"this is the best meal of summer\"",
        "\"summer x best\"",
        "\"this is the best meal of the summer\"",
        "\"this is the best meal I have ever had in such a beautiful summer day\"",
        "\"this is the best cooked meal of the summer\"",
        "\"this is the best meal of the summer\"",
        "\"summer x y best\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
    ]
@@ -396,7 +396,7 @@ fn test_proximity_prefix_db() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    s.query("best su");
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);

@@ -406,10 +406,10 @@ fn test_proximity_prefix_db() {
        "\"summer best\"",
        "\"this is the best meal of summer\"",
        "\"summer x best\"",
        "\"this is the best meal I have ever had in such a beautiful summer day\"",
        "\"this is the best cooked meal of the summer\"",
        "\"this is the best meal of the summer\"",
        "\"summer x y best\"",
        "\"this is the best cooked meal of the summer\"",
        "\"this is the best meal I have ever had in such a beautiful summer day\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
    ]
    "###);
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    s.query("best win");
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);

    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"this is the best winter meal\"",
        "\"winter best\"",
        "\"this is the best meal of winter\"",
        "\"winter x best\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
        "\"this is the best cooked meal of the winter\"",
        "\"this is the best meal of the winter\"",
        "\"this is the best meal of winter\"",
        "\"this is the best winter meal\"",
        "\"winter x y best\"",
        "\"winter x best\"",
        "\"winter best\"",
    ]
    "###);

@@ -447,7 +447,7 @@ fn test_proximity_prefix_db() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    s.query("best wint");
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);

@@ -457,10 +457,10 @@ fn test_proximity_prefix_db() {
        "\"winter best\"",
        "\"this is the best meal of winter\"",
        "\"winter x best\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
        "\"this is the best cooked meal of the winter\"",
        "\"this is the best meal of the winter\"",
        "\"winter x y best\"",
        "\"this is the best cooked meal of the winter\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
    ]
    "###);

@@ -471,7 +471,7 @@ fn test_proximity_prefix_db() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    s.query("best wi");
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);

@@ -481,9 +481,9 @@ fn test_proximity_prefix_db() {
        "\"winter best\"",
        "\"this is the best meal of winter\"",
        "\"winter x best\"",
        "\"this is the best meal of the winter\"",
        "\"this is the best meal I have ever had in such a beautiful winter day\"",
        "\"this is the best cooked meal of the winter\"",
        "\"this is the best meal of the winter\"",
        "\"winter x y best\"",
    ]
    "###);

@@ -68,8 +68,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
Typo(
@@ -82,8 +82,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
Typo(

@@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],

@@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -101,8 +101,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -127,8 +127,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -153,8 +153,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -179,8 +179,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 10,
rank: 21,
max_rank: 22,
},
),
],
@@ -205,8 +205,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 10,
rank: 17,
max_rank: 22,
},
),
],
@@ -231,8 +231,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 10,
rank: 17,
max_rank: 22,
},
),
],

@@ -3,35 +3,59 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,31 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -6,32 +6,40 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
@@ -39,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,15 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -6,32 +6,40 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
@@ -39,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,7 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -63,15 +71,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -3,35 +3,59 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,7 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,31 +71,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -6,32 +6,8 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 1,
max_rank: 8,
},
),
],
@@ -39,7 +15,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -63,7 +39,31 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 8,
},
),
],

@@ -6,24 +6,24 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -31,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 1,
max_rank: 4,
rank: 5,
max_rank: 8,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 7,
rank: 8,
max_rank: 15,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 50,
max_rank: 50,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 50,
max_rank: 50,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
rank: 49,
max_rank: 50,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
rank: 49,
max_rank: 50,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 22,
rank: 48,
max_rank: 50,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 22,
rank: 41,
max_rank: 50,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 22,
rank: 40,
max_rank: 50,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 16,
rank: 36,
max_rank: 36,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 16,
rank: 31,
max_rank: 36,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -166,8 +166,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 19,
rank: 42,
max_rank: 43,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 19,
rank: 42,
max_rank: 43,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 19,
rank: 41,
max_rank: 43,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 14,
max_rank: 19,
rank: 34,
max_rank: 43,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 19,
rank: 33,
max_rank: 43,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 16,
rank: 36,
max_rank: 36,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 13,
rank: 29,
max_rank: 29,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 13,
rank: 24,
max_rank: 29,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 51,
max_rank: 57,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 48,
max_rank: 57,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],
@@ -178,6 +178,62 @@ expression: "format!(\"{document_scores:#?}\")"
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 50,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 38,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 29,
max_rank: 29,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 22,
@@ -188,42 +244,14 @@ expression: "format!(\"{document_scores:#?}\")"
[
Words(
Words {
matching_words: 7,
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 13,
max_rank: 13,
rank: 22,
max_rank: 22,
},
),
],
@@ -236,36 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 25,
rank: 45,
max_rank: 57,
},
),
],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],
@@ -178,6 +178,62 @@ expression: "format!(\"{document_scores:#?}\")"
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 47,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 40,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 35,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 26,
max_rank: 29,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 19,
@@ -188,42 +244,14 @@ expression: "format!(\"{document_scores:#?}\")"
[
Words(
Words {
matching_words: 7,
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 13,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 13,
rank: 19,
max_rank: 22,
},
),
],
@@ -236,36 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
rank: 19,
max_rank: 22,
},
),
],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 7,
rank: 13,
max_rank: 15,
},
),
],

@@ -6,88 +6,88 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 51,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 48,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -95,7 +95,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],

@@ -259,8 +259,8 @@ fn test_ignore_stop_words() {
),
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
Fid(
@@ -411,8 +411,8 @@ fn test_stop_words_in_phrase() {
),
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
Fid(

@@ -277,7 +277,7 @@ fn test_words_proximity_tms_last_simple() {
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

    // 7 is better than 6 because of the proximity between "the" and its surrounding terms
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
@@ -289,10 +289,10 @@ fn test_words_proximity_tms_last_simple() {
        "\"the mighty and quick brown fox jumps over the lazy dog\"",
        "\"the brown quick fox jumps over the lazy dog\"",
        "\"the brown quick fox jumps over the really lazy dog\"",
        "\"this quick brown and scary fox jumps over the lazy dog\"",
        "\"the brown quick fox immediately jumps over the really lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"the brown quick fox immediately jumps over the really lazy blue dog\"",
        "\"this quick brown and scary fox jumps over the lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the\"",
@@ -312,7 +312,7 @@ fn test_words_proximity_tms_last_simple() {
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

    // 10 is better than 9 because of the proximity between "quick" and "brown"
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 15, 16, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
@@ -326,8 +326,8 @@ fn test_words_proximity_tms_last_simple() {
        "\"the great quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the really lazy dog\"",
        "\"the mighty and quick brown fox jumps over the lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"this quick brown and scary fox jumps over the lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the\"",
@@ -427,7 +427,7 @@ fn test_words_tms_all() {
    s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
    let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();

    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22]");
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
    insta::assert_snapshot!(format!("{document_scores:#?}"));
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
@@ -439,10 +439,10 @@ fn test_words_tms_all() {
        "\"the mighty and quick brown fox jumps over the lazy dog\"",
        "\"the brown quick fox jumps over the lazy dog\"",
        "\"the brown quick fox jumps over the really lazy dog\"",
        "\"this quick brown and scary fox jumps over the lazy dog\"",
        "\"the brown quick fox immediately jumps over the really lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"the brown quick fox immediately jumps over the really lazy blue dog\"",
        "\"this quick brown and scary fox jumps over the lazy dog\"",
        "\"this quick brown and very scary fox jumps over the lazy dog\"",
        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
    ]
    "###);

@@ -4,8 +4,9 @@ use std::path::Path;

use roaring::RoaringBitmap;

use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{make_db_snap_from_iter, obkv_to_json, Index};
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};

#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
@@ -97,6 +98,7 @@ Create a snapshot test of the given database.
- `facet_id_string_docids`
- `documents_ids`
- `stop_words`
- `soft_deleted_documents_ids`
- `field_distribution`
- `fields_ids_map`
- `geo_faceted_documents_ids`
@@ -306,6 +308,12 @@ pub fn snap_stop_words(index: &Index) -> String {
    let snap = format!("{stop_words:?}");
    snap
}
pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();

    display_bitmap(&soft_deleted_documents_ids)
}
pub fn snap_field_distributions(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let mut snap = String::new();
@@ -332,15 +340,20 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
}
pub fn snap_external_documents_ids(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
    // ensure fixed order (not guaranteed by hashmap)
    let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
    external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
    let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();

    let mut snap = String::new();

    writeln!(&mut snap, "docids:").unwrap();
    for (key, id) in external_ids {
    writeln!(&mut snap, "soft:").unwrap();
    let stream_soft = soft.stream();
    let soft_external_ids = stream_soft.into_str_vec().unwrap();
    for (key, id) in soft_external_ids {
        writeln!(&mut snap, "{key:<24} {id}").unwrap();
    }
    writeln!(&mut snap, "hard:").unwrap();
    let stream_hard = hard.stream();
    let hard_external_ids = stream_hard.into_str_vec().unwrap();
    for (key, id) in hard_external_ids {
        writeln!(&mut snap, "{key:<24} {id}").unwrap();
    }

@@ -479,6 +492,9 @@ macro_rules! full_snap_of_db {
    ($index:ident, stop_words) => {{
        $crate::snapshot_tests::snap_stop_words(&$index)
    }};
    ($index:ident, soft_deleted_documents_ids) => {{
        $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
    }};
    ($index:ident, field_distribution) => {{
        $crate::snapshot_tests::snap_field_distributions(&$index)
    }};

@@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds {
}

impl AvailableDocumentsIds {
    pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
        match docids.max() {
    pub fn from_documents_ids(
        docids: &RoaringBitmap,
        soft_deleted_docids: &RoaringBitmap,
    ) -> AvailableDocumentsIds {
        let used_docids = docids | soft_deleted_docids;

        match used_docids.max() {
            Some(last_id) => {
                let mut available = RoaringBitmap::from_iter(0..last_id);
                available -= docids;
                available -= used_docids;

                let iter = match last_id.checked_add(1) {
                    Some(id) => id..=u32::max_value(),
@@ -45,7 +50,7 @@ mod tests {
    #[test]
    fn empty() {
        let base = RoaringBitmap::new();
        let left = AvailableDocumentsIds::from_documents_ids(&base);
        let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
        let right = 0..=u32::max_value();
        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
    }
@@ -58,8 +63,28 @@ mod tests {
        base.insert(100);
        base.insert(405);

        let left = AvailableDocumentsIds::from_documents_ids(&base);
        let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
        let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
    }

    #[test]
    fn soft_deleted() {
        let mut base = RoaringBitmap::new();
        base.insert(0);
        base.insert(10);
        base.insert(100);
        base.insert(405);

        let mut soft_deleted = RoaringBitmap::new();
        soft_deleted.insert(1);
        soft_deleted.insert(11);
        soft_deleted.insert(101);
        soft_deleted.insert(406);

        let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
        let right =
            (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
    }
}
|
||||
|
||||
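The change above makes the pool of free internal ids skip both live and soft-deleted documents, so a soft-deleted id cannot be handed out again before it is hard-deleted. A minimal sketch of the idea below the highest used id, using only `roaring` (the real iterator additionally chains every id above `max()`):

```rust
use roaring::RoaringBitmap;

// Yield the internal ids below the highest used id that are neither live
// nor soft-deleted; ids above `max()` are implicitly free as well.
fn free_ids_below_max(docids: &RoaringBitmap, soft_deleted: &RoaringBitmap) -> RoaringBitmap {
    let used = docids | soft_deleted;
    match used.max() {
        // every id in 0..last_id that is not already used is available
        Some(last_id) => RoaringBitmap::from_iter(0..last_id) - used,
        // nothing is used yet: there is no id below a (non-existent) max
        None => RoaringBitmap::new(),
    }
}
```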
@@ -1,7 +1,7 @@
use roaring::RoaringBitmap;
use time::OffsetDateTime;

use crate::{FieldDistribution, Index, Result};
use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};

pub struct ClearDocuments<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -20,7 +20,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        let Index {
            env: _env,
            main: _main,
            external_documents_ids,
            word_docids,
            exact_word_docids,
            word_prefix_docids,
@@ -55,14 +54,15 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        // We clean some of the main engine datastructures.
        self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
        self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
        self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
        self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
        self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
        self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
        self.index.delete_geo_rtree(self.wtxn)?;
        self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
        self.index.delete_vector_hnsw(self.wtxn)?;

        // Clear the other databases.
        external_documents_ids.clear(self.wtxn)?;
        word_docids.clear(self.wtxn)?;
        exact_word_docids.clear(self.wtxn)?;
        word_prefix_docids.clear(self.wtxn)?;
@@ -122,7 +122,7 @@ mod tests {

        assert!(index.words_fst(&rtxn).unwrap().is_empty());
        assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
        assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
        assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
        assert!(index.documents_ids(&rtxn).unwrap().is_empty());
        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
        assert!(index.geo_rtree(&rtxn).unwrap().is_none());
1235
milli/src/update/delete_documents.rs
Normal file
File diff suppressed because it is too large
@@ -1,9 +1,9 @@
use std::fs::File;
use std::io::BufReader;

use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use obkv::KvReader;
use roaring::RoaringBitmap;

use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@@ -12,9 +12,9 @@ use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::del_add::DelAdd;
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};

/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
/// by rebuilding the database "from scratch".
@@ -28,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
    facet_type: FacetType,
    field_ids: Vec<FieldId>,
    // None if level 0 does not need to be updated
    delta_data: Option<grenad::Reader<BufReader<File>>>,
    delta_data: Option<grenad::Reader<File>>,
}

impl<'i> FacetsUpdateBulk<'i> {
@@ -36,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
        index: &'i Index,
        field_ids: Vec<FieldId>,
        facet_type: FacetType,
        delta_data: grenad::Reader<BufReader<File>>,
        delta_data: grenad::Reader<File>,
        group_size: u8,
        min_level_size: u8,
    ) -> FacetsUpdateBulk<'i> {
@@ -134,7 +134,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
            if !valid_lmdb_key(key) {
                continue;
            }
            let value = KvReaderDelAdd::new(value);
            let value: KvReader<DelAdd> = KvReader::new(value);

            // DB is empty, it is safe to ignore Del operations
            let Some(value) = value.get(DelAdd::Addition) else {
@@ -158,7 +158,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                continue;
            }

            let value = KvReaderDelAdd::new(value);
            let value: KvReader<DelAdd> = KvReader::new(value);

            // the value is a CboRoaringBitmap, but I still need to prepend the
            // group size for level 0 (= 1) to it
@@ -167,7 +167,6 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
            // then we extend the buffer with the docids bitmap
            match database.get(wtxn, key)? {
                Some(prev_value) => {
                    // prev_value is the group size for level 0, followed by the previous bitmap.
                    let old_bitmap = &prev_value[1..];
                    CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
                }
@@ -181,13 +180,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                    buffer.extend_from_slice(value);
                }
            };
            let new_bitmap = &buffer[1..];
            // if the new bitmap is empty, let's remove it
            if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
                database.delete(wtxn, key)?;
            } else {
                database.put(wtxn, key, &buffer)?;
            }
            database.put(wtxn, key, &buffer)?;
        }
    }
    Ok(())
@@ -196,7 +189,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
        &self,
        field_id: FieldId,
        txn: &RoTxn,
    ) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
    ) -> Result<Vec<grenad::Reader<File>>> {
        let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;

        Ok(subwriters)
@@ -262,7 +255,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
        field_id: u16,
        level: u8,
        handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
    ) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
    ) -> Result<Vec<grenad::Reader<File>>> {
        if level == 0 {
            self.read_level_0(rtxn, field_id, handle_group)?;
            // Level 0 is already in the database
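Both sides of this diff read a grenad file whose values are obkv maps keyed by `DelAdd`, then fold the delta into the bitmap already stored for the key. As a rough mental model of what `merge_deladd_into` does — a hedged sketch using a plain struct in place of milli's real `KvReaderDelAdd` — the fold looks like this:

```rust
use roaring::RoaringBitmap;

// Hypothetical stand-in for an obkv value keyed by DelAdd: an optional
// deletion bitmap and an optional addition bitmap for one facet key.
struct DelAddValue {
    del: Option<RoaringBitmap>,
    add: Option<RoaringBitmap>,
}

// Fold a del/add delta into the bitmap currently stored for a key, the way
// the bulk indexer above merges a delta into the previous level-0 bitmap.
fn apply_delta(previous: &RoaringBitmap, delta: &DelAddValue) -> RoaringBitmap {
    let mut bitmap = previous.clone();
    if let Some(del) = &delta.del {
        bitmap -= del; // deletions are applied first...
    }
    if let Some(add) = &delta.add {
        bitmap |= add; // ...then additions win over them
    }
    bitmap
}
```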
349
milli/src/update/facet/delete.rs
Normal file
@@ -0,0 +1,349 @@
use std::collections::{HashMap, HashSet};

use heed::RwTxn;
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;

use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
use crate::{FieldId, Index, Result};

/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of removed elements and the existing size of the database, we use either
/// a bulk delete method or an incremental delete method.
pub struct FacetsDelete<'i, 'b> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
    facet_type: FacetType,
    affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
    docids_to_delete: &'b RoaringBitmap,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
}
impl<'i, 'b> FacetsDelete<'i, 'b> {
    pub fn new(
        index: &'i Index,
        facet_type: FacetType,
        affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
        docids_to_delete: &'b RoaringBitmap,
    ) -> Self {
        let database = match facet_type {
            FacetType::String => index
                .facet_id_string_docids
                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
            FacetType::Number => {
                index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
            }
        };
        Self {
            index,
            database,
            facet_type,
            affected_facet_values,
            docids_to_delete,
            group_size: FACET_GROUP_SIZE,
            max_group_size: FACET_MAX_GROUP_SIZE,
            min_level_size: FACET_MIN_LEVEL_SIZE,
        }
    }

    pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
        self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;

        for (field_id, affected_facet_values) in self.affected_facet_values {
            // This is an incorrect condition, since we assume that the length of the database is equal
            // to the number of facet values for the given field_id. It means that in some cases, we might
            // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
            // really be a performance problem is when we fully delete a large ratio of all facet values for
            // each field id. This would almost never happen. Still, to be overly cautious, I have added a
            // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
            // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
            if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
                // Bulk delete
                let mut modified = false;

                for facet_value in affected_facet_values {
                    let key =
                        FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
                    let mut old = self.database.get(wtxn, &key)?.unwrap();
                    let previous_len = old.bitmap.len();
                    old.bitmap -= self.docids_to_delete;
                    if old.bitmap.is_empty() {
                        modified = true;
                        self.database.delete(wtxn, &key)?;
                    } else if old.bitmap.len() != previous_len {
                        modified = true;
                        self.database.put(wtxn, &key, &old)?;
                    }
                }
                if modified {
                    let builder = FacetsUpdateBulk::new_not_updating_level_0(
                        self.index,
                        vec![field_id],
                        self.facet_type,
                    );
                    builder.execute(wtxn)?;
                }
            } else {
                // Incremental
                let inc = FacetsUpdateIncrementalInner {
                    db: self.database,
                    group_size: self.group_size,
                    min_level_size: self.min_level_size,
                    max_group_size: self.max_group_size,
                };
                for facet_value in affected_facet_values {
                    inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
                }
            }
        }
        Ok(())
    }
}
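The `>= database.len() / 150` cut-off above can be read as: fall back to the bulk rebuild as soon as the incremental path would have to touch at least 1/150th of the entries. A minimal sketch of that decision, using the concrete numbers quoted in the tests below (where `database_len / 150 == 13`):

```rust
/// Decide, per field id, whether a deletion batch should go through the
/// bulk rebuild or the incremental path, mirroring the `>= len / 150`
/// cut-off used by `FacetsDelete::execute` above.
fn use_bulk_delete(affected_facet_values: u64, database_len: u64) -> bool {
    affected_facet_values >= database_len / 150
}

fn main() {
    // With the numbers from the tests below (database_len / 150 == 13):
    assert!(use_bulk_delete(100, 1965)); // "id": 100 affected values -> bulk
    assert!(!use_bulk_delete(10, 1965)); // "label": 10 affected values -> incremental
}
```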
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use big_s::S;
    use maplit::hashset;
    use rand::seq::SliceRandom;
    use rand::SeedableRng;
    use roaring::RoaringBitmap;

    use crate::db_snap;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::update::facet::test_helpers::ordered_string;
    use crate::update::{DeleteDocuments, DeletionStrategy};

    #[test]
    fn delete_mixed_incremental_and_bulk() {
        // The point of this test is to create an index populated with documents
        // containing different filterable attributes. Then, we delete a bunch of documents
        // such that a mix of the incremental and bulk indexer is used (depending on the field id)
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": i / 10,
                        "colour": i / 100,
                        "timestamp": i / 2,
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");

        let mut wtxn = index.env.write_txn().unwrap();

        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
        // by deleting the first 100 documents, we expect that:
        // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
        // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
        // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
        // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
        // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
        builder.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
    }

    // Same test as above but working with string values for the facets
    #[test]
    fn delete_mixed_incremental_and_bulk_string() {
        // The point of this test is to create an index populated with documents
        // containing different filterable attributes. Then, we delete a bunch of documents
        // such that a mix of the incremental and bulk indexer is used (depending on the field id)
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": ordered_string(i / 10),
                        "colour": ordered_string(i / 100),
                        "timestamp": ordered_string(i / 2),
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");

        let mut wtxn = index.env.write_txn().unwrap();

        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
        // by deleting the first 100 documents, we expect that:
        // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
        // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
        // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
        // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
        // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
        builder.execute().unwrap();
        wtxn.commit().unwrap();

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
    }

    #[test]
    fn delete_almost_all_incrementally_string() {
        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index
            .update_settings(|settings| {
                settings.set_filterable_fields(
                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
                );
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "label": ordered_string(i / 10),
                        "colour": ordered_string(i / 100),
                        "timestamp": ordered_string(i / 2),
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");

        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);

        let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
        docids_to_delete.shuffle(&mut rng);
        for docid in docids_to_delete.into_iter().take(990) {
            let mut wtxn = index.env.write_txn().unwrap();
            let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
            builder.strategy(DeletionStrategy::AlwaysHard);
            builder.delete_documents(&RoaringBitmap::from_iter([docid]));
            builder.execute().unwrap();
            wtxn.commit().unwrap();
        }

        db_snap!(index, soft_deleted_documents_ids, @"[]");
        db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
    }
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
    use std::iter::once;

    use rand::Rng;
    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::update::facet::test_helpers::FacetIndex;

    // This is a simple test to get an intuition on the relative speed
    // of the incremental vs. bulk indexer.
    //
    // The benchmark shows the worst-case scenario for the incremental indexer, since
    // each facet value contains only one document ID.
    //
    // In that scenario, it appears that the incremental indexer is about 70 times slower than the
    // bulk indexer.
    // #[test]
    fn benchmark_facet_indexing_delete() {
        let mut r = rand::thread_rng();

        for i in 1..=20 {
            let size = 50_000 * i;
            let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);

            let mut txn = index.env.write_txn().unwrap();
            let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
            for i in 0..size {
                // field id = 0, left_bound = i, docids = [i]
                elements.push(((0, i as f64), once(i).collect()));
            }
            let timer = std::time::Instant::now();
            index.bulk_insert(&mut txn, &[0], elements.iter());
            let time_spent = timer.elapsed().as_millis();
            println!("bulk {size} : {time_spent}ms");

            txn.commit().unwrap();

            for nbr_doc in [1, 100, 1000, 10_000] {
                let mut txn = index.env.write_txn().unwrap();
                let timer = std::time::Instant::now();
                //
                // delete one document
                //
                for _ in 0..nbr_doc {
                    let deleted_u32 = r.gen::<u32>() % size;
                    let deleted_f64 = deleted_u32 as f64;
                    index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
                }
                let time_spent = timer.elapsed().as_millis();
                println!(" delete {nbr_doc} : {time_spent}ms");
                txn.abort().unwrap();
            }
        }
    }
}
@@ -1,5 +1,4 @@
use std::fs::File;
use std::io::BufReader;

use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, Error, RoTxn, RwTxn};
@@ -31,14 +30,14 @@ enum DeletionResult {
/// `facet_id_(string/f64)_docids` databases.
pub struct FacetsUpdateIncremental {
    inner: FacetsUpdateIncrementalInner,
    delta_data: grenad::Reader<BufReader<File>>,
    delta_data: grenad::Reader<File>,
}

impl FacetsUpdateIncremental {
    pub fn new(
        index: &Index,
        facet_type: FacetType,
        delta_data: grenad::Reader<BufReader<File>>,
        delta_data: grenad::Reader<File>,
        group_size: u8,
        min_level_size: u8,
        max_group_size: u8,
@@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
The algorithms that implement these queries are found in the `src/search/facet` folder.

To make these queries fast to compute, the database adopts a tree structure:
```text
```ignore
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
│Level 2│ │ │ │ │
@@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
[`FacetGroupValue`], which have the following format:

```text
```ignore
FacetGroupKey:
- field id : u16
- level : u8
@@ -78,7 +78,6 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;

use charabia::normalizer::{Normalize, NormalizerOption};
@@ -98,6 +97,7 @@ use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};

pub mod bulk;
pub mod delete;
pub mod incremental;

/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
@@ -108,17 +108,14 @@ pub struct FacetsUpdate<'i> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
    facet_type: FacetType,
    delta_data: grenad::Reader<BufReader<File>>,
    delta_data: grenad::Reader<File>,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
    pub fn new(
        index: &'i Index,
        facet_type: FacetType,
        delta_data: grenad::Reader<BufReader<File>>,
    ) -> Self {
    // TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
    pub fn new(index: &'i Index, facet_type: FacetType, delta_data: grenad::Reader<File>) -> Self {
        let database = match facet_type {
            FacetType::String => index
                .facet_id_string_docids
@@ -278,7 +275,6 @@ pub(crate) mod test_helpers {
    use crate::heed_codec::ByteSliceRefCodec;
    use crate::search::facet::get_highest_level;
    use crate::snapshot_tests::display_bitmap;
    use crate::update::del_add::{DelAdd, KvWriterDelAdd};
    use crate::update::FacetsUpdateIncrementalInner;
    use crate::CboRoaringBitmapCodec;

@@ -455,10 +451,8 @@ pub(crate) mod test_helpers {
            let key: FacetGroupKey<&[u8]> =
                FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
            let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
            let mut inner_writer = KvWriterDelAdd::memory();
            let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
            inner_writer.insert(DelAdd::Addition, value).unwrap();
            writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
            writer.insert(&key, &value).unwrap();
        }
        writer.finish().unwrap();
        let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
@@ -470,7 +464,7 @@ pub(crate) mod test_helpers {
            min_level_size: self.min_level_size.get(),
        };

        update.update(wtxn, field_ids).unwrap();
        update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
    }

    pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {
@@ -558,6 +552,98 @@ pub(crate) mod test_helpers {
    }
}
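The `FacetGroupKey`/`FacetGroupValue` layout quoted in the doc comment above maps naturally onto two small structs. A simplified sketch with owned fields — an assumption for illustration; the real codecs in `crate::heed_codec::facet` work over borrowed bytes:

```rust
use roaring::RoaringBitmap;

// Sketch of the node key described above: (field id, tree level, left bound).
// Level 0 stores one entry per facet value; higher levels group their children.
struct FacetGroupKey {
    field_id: u16,
    level: u8,
    left_bound: Vec<u8>, // smallest facet value covered by this node
}

// Sketch of the node value (field names assumed): how many children the node
// groups, and the union of the docids of all facet values under it.
struct FacetGroupValue {
    size: u8,
    bitmap: RoaringBitmap,
}
```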
#[cfg(test)]
mod tests {
    use big_s::S;
    use maplit::hashset;

    use crate::db_snap;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::update::DeletionStrategy;

    #[test]
    fn replace_all_identical_soft_deletion_then_hard_deletion() {
        let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);

        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_filterable_fields(hashset! { S("size") });
            })
            .unwrap();

        let mut documents = vec![];
        for i in 0..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "size": i % 250,
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
        db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");

        let mut documents = vec![];
        for i in 0..999 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "size": i % 250,
                        "other": 0,
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
        db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");

        // Then replace the last document while disabling soft_deletion
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
        let mut documents = vec![];
        for i in 999..1000 {
            documents.push(
                serde_json::json! {
                    {
                        "id": i,
                        "size": i % 250,
                        "other": 0,
                    }
                }
                .as_object()
                .unwrap()
                .clone(),
            );
        }

        let documents = documents_batch_reader_from_objects(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
        db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
    }
}

#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
@@ -1,4 +1,4 @@
use std::io::{BufWriter, Read, Seek};
use std::io::{Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};

@@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(

    let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();

    let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
    let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
    let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];

    // The primary key *field id* that has already been set for this index or the one
@@ -1,7 +1,6 @@
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;
use std::{io, mem, str};

use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
@@ -30,7 +29,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    allowed_separators: Option<&[&str]>,
    dictionary: Option<&[&str]>,
    max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
    puffin::profile_function!();

    let max_positions_per_attributes = max_positions_per_attributes
@@ -56,7 +55,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    let mut value_buffer = Vec::new();

    // initialize tokenizer.
    let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
    let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
    let tokenizer = builder.build();

    // iterate over documents.
@@ -115,7 +114,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        let (add_obkv, add_script_language_word_count) = add?;

        // merge deletions and additions.
        // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
        value_buffer.clear();
        del_add_from_two_obkvs(
            KvReader::<FieldId>::new(del_obkv),
@@ -123,8 +121,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            &mut value_buffer,
        )?;

        // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
        let obkv = KvReader::<FieldId>::new(&value_buffer);
        // write them into the sorter.
        let obkv = KvReader::<FieldId>::new(value);
        for (field_id, value) in obkv.iter() {
            key_buffer.truncate(mem::size_of::<u32>());
            key_buffer.extend_from_slice(&field_id.to_be_bytes());
@@ -152,9 +150,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        }
    }

    // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
    sorter_into_reader(docid_word_positions_sorter, indexer)
        .map(|reader| (reader, script_language_docids))
        .map(|reader| (documents_ids, reader, script_language_docids))
}

/// Check if any searchable fields of a document changed.
@@ -198,7 +195,7 @@ fn tokenizer_builder<'a>(
    }

    if let Some(script_language) = script_language {
        tokenizer_builder.allow_list(script_language);
        tokenizer_builder.allow_list(&script_language);
    }

    tokenizer_builder
@@ -206,7 +203,6 @@ fn tokenizer_builder<'a>(

/// Extract words mapped with their positions in a document,
/// ensuring no Language detection mistakes were made.
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
fn lang_safe_tokens_from_document<'a>(
    obkv: &KvReader<FieldId>,
    searchable_fields: &Option<HashSet<FieldId>>,
@@ -221,9 +217,9 @@ fn lang_safe_tokens_from_document<'a>(
    let mut script_language_word_count = HashMap::new();

    tokens_from_document(
        obkv,
        &obkv,
        searchable_fields,
        tokenizer,
        &tokenizer,
        max_positions_per_attributes,
        del_add,
        buffers,
@@ -248,8 +244,8 @@ fn lang_safe_tokens_from_document<'a>(
            // build a new temporary tokenizer including the allow list.
            let mut builder = tokenizer_builder(
                stop_words,
                allowed_separators,
                dictionary,
                allowed_separators,
                Some(&script_language),
            );
            let tokenizer = builder.build();
@@ -258,7 +254,7 @@ fn lang_safe_tokens_from_document<'a>(

            // rerun the extraction.
            tokens_from_document(
                obkv,
                &obkv,
                searchable_fields,
                &tokenizer,
                max_positions_per_attributes,
@@ -269,7 +265,6 @@ fn lang_safe_tokens_from_document<'a>(
        }
    }

    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
    Ok((&buffers.obkv_buffer, script_language_word_count))
}

@@ -335,7 +330,6 @@ fn tokens_from_document<'a>(
        }
    }

    // returns a KV<FieldId, KV<u16, String>>
    Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}
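The merge step removed above folds the deleted and the added version of a document into a single map whose leaves carry a `DelAdd` tag. A minimal sketch of that `del_add_from_two_obkvs` transformation, assuming ordinary `BTreeMap`s in place of the obkv readers and writers:

```rust
use std::collections::BTreeMap;

type FieldId = u16;

// Merge the old and new version of a document into one map holding, per
// field, a (deletion, addition) pair; either side may be absent when the
// field only exists in one version.
fn del_add_from_two_maps(
    del: &BTreeMap<FieldId, String>,
    add: &BTreeMap<FieldId, String>,
) -> BTreeMap<FieldId, (Option<String>, Option<String>)> {
    let mut merged = BTreeMap::new();
    for (fid, value) in del {
        merged.insert(*fid, (Some(value.clone()), None));
    }
    for (fid, value) in add {
        merged.entry(*fid).or_insert((None, None)).1 = Some(value.clone());
    }
    merged
}
```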
@@ -1,5 +1,5 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::io;

use heed::{BytesDecode, BytesEncode};

@@ -20,7 +20,7 @@ use crate::Result;
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
    fid_docid_facet_number: grenad::Reader<R>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
@@ -1,5 +1,4 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};

use heed::BytesEncode;
@@ -19,7 +18,7 @@ use crate::{FieldId, Result};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
    docid_fid_facet_string: grenad::Reader<R>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
@@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use std::mem::size_of;
use std::result::Result as StdResult;

@@ -29,11 +29,11 @@ const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();

/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
    pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
    pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
    pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
    pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
    pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
    pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
    pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
    pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
}

/// Extracts the facet values of each faceted field of each document.
@@ -102,11 +102,11 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(

    let del_add_obkv = obkv::KvReader::new(field_bytes);
    let del_value = match del_add_obkv.get(DelAdd::Deletion) {
        Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
        Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
        None => None,
    };
    let add_value = match del_add_obkv.get(DelAdd::Addition) {
        Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
        Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
        None => None,
    };
@@ -1,15 +1,14 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::io;

use obkv::KvReaderU16;

use super::helpers::{
    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
    GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;

const MAX_COUNTED_WORDS: usize = 30;
@@ -23,14 +22,14 @@ const MAX_COUNTED_WORDS: usize = 30;
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();

    let mut fid_word_count_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@@ -38,52 +37,18 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
    );

    let mut key_buffer = Vec::new();
    let mut value_buffer = Vec::new();
    let mut cursor = docid_word_positions.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {
        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        let document_id = u32::from_be_bytes(document_id_bytes);

        let del_add_reader = KvReaderDelAdd::new(value);
        let deletion = del_add_reader
            // get deleted words
            .get(DelAdd::Deletion)
            // count deleted words
            .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
            // keep the count if under or equal to MAX_COUNTED_WORDS
            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
        let addition = del_add_reader
            // get added words
            .get(DelAdd::Addition)
            // count added words
            .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
            // keep the count if under or equal to MAX_COUNTED_WORDS
            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);

        if deletion != addition {
            // Insert the deleted word count in the sorter if it exists.
            if let Some(word_count) = deletion {
                value_buffer.clear();
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
                key_buffer.clear();
                key_buffer.extend_from_slice(fid_bytes);
                key_buffer.push(word_count as u8);
                fid_word_count_docids_sorter
                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
            }
            // Insert the added word count in the sorter if it exists.
            if let Some(word_count) = addition {
                value_buffer.clear();
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
                key_buffer.clear();
                key_buffer.extend_from_slice(fid_bytes);
                key_buffer.push(word_count as u8);
                fid_word_count_docids_sorter
                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
            }
        let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
        if word_count <= MAX_COUNTED_WORDS {
            key_buffer.clear();
            key_buffer.extend_from_slice(fid_bytes);
            key_buffer.push(word_count as u8);
            fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
        }
    }
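The del/add variant of this extractor only touches the sorter when the deleted and added word counts actually differ. A condensed sketch of that decision — with a hypothetical `capped_count` standing in for the capped `KvReaderU16` iteration:

```rust
const MAX_COUNTED_WORDS: usize = 30;

// Count the words in one side of the del/add pair, capped: counts above
// MAX_COUNTED_WORDS are not tracked at all, exactly like the `.filter(...)`
// chain in the extractor above.
fn capped_count(words: &[&str]) -> Option<usize> {
    Some(words.len()).filter(|&n| n <= MAX_COUNTED_WORDS)
}

// The sorter only needs an update when the tracked count changed between
// the deleted and the added version of the field.
fn count_changed(del: Option<&[&str]>, add: Option<&[&str]>) -> bool {
    del.and_then(capped_count) != add.and_then(capped_count)
}
```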
@@ -1,12 +1,11 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::io;

use concat_arrays::concat_arrays;
use serde_json::Value;

use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value;
use crate::{FieldId, InternalError, Result};

@@ -19,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    primary_key_id: FieldId,
    (lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let mut writer = create_writer(
@@ -31,71 +30,39 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
        let obkv = obkv::KvReader::new(value);
        // since we only need the primary key when we throw an error
        // we create this getter to lazily get it when needed
        // since we only need the primary key when we throw an error we create this getter to
        // lazily get it when needed
        let document_id = || -> Value {
            let document_id = obkv.get(primary_key_id).unwrap();
            serde_json::from_slice(document_id).unwrap()
        };

        // first we get the two fields
        match (obkv.get(lat_fid), obkv.get(lng_fid)) {
            (Some(lat), Some(lng)) => {
                let deladd_lat_obkv = KvReaderDelAdd::new(lat);
                let deladd_lng_obkv = KvReaderDelAdd::new(lng);
        let lat = obkv.get(lat_fid);
        let lng = obkv.get(lng_fid);

                // then we extract the values
                let del_lat_lng = deladd_lat_obkv
                    .get(DelAdd::Deletion)
                    .zip(deladd_lng_obkv.get(DelAdd::Deletion))
                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
                    .transpose()?;
                let add_lat_lng = deladd_lat_obkv
                    .get(DelAdd::Addition)
                    .zip(deladd_lng_obkv.get(DelAdd::Addition))
                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
                    .transpose()?;
        if let Some((lat, lng)) = lat.zip(lng) {
            // then we extract the values
            let lat = extract_finite_float_from_value(
                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;

                if del_lat_lng != add_lat_lng {
                    let mut obkv = KvWriterDelAdd::memory();
                    if let Some([lat, lng]) = del_lat_lng {
                        #[allow(clippy::drop_non_drop)]
                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
                        obkv.insert(DelAdd::Deletion, bytes)?;
                    }
                    if let Some([lat, lng]) = add_lat_lng {
                        #[allow(clippy::drop_non_drop)]
                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
                        obkv.insert(DelAdd::Addition, bytes)?;
                    }
                    let bytes = obkv.into_inner()?;
                    writer.insert(docid_bytes, bytes)?;
                }
            }
            (None, Some(_)) => {
                return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
            }
            (Some(_), None) => {
                return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
            }
            (None, None) => (),
            let lng = extract_finite_float_from_value(
                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;

            #[allow(clippy::drop_non_drop)]
            let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
            writer.insert(docid_bytes, bytes)?;
        } else if lat.is_none() && lng.is_some() {
            return Err(GeoError::MissingLatitude { document_id: document_id() })?;
        } else if lat.is_some() && lng.is_none() {
            return Err(GeoError::MissingLongitude { document_id: document_id() })?;
        }
        // else => the _geo object was `null`, there is nothing to do
    }

    writer_into_reader(writer)
}

/// Extract the finite floats lat and lng from two bytes slices.
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
    let lat = extract_finite_float_from_value(
        serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
    )
    .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;

    let lng = extract_finite_float_from_value(
        serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
    )
    .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;

    Ok([lat, lng])
}
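Both versions of this extractor end up serializing a geo point as 16 bytes: the latitude's and the longitude's native-endian `f64` bytes concatenated, latitude first. A hedged sketch of that packing without the `concat_arrays!` macro:

```rust
// Pack (lat, lng) the way the extractor above does: two native-endian f64s,
// 16 bytes total, latitude first.
fn pack_lat_lng(lat: f64, lng: f64) -> [u8; 16] {
    let mut bytes = [0u8; 16];
    bytes[..8].copy_from_slice(&lat.to_ne_bytes());
    bytes[8..].copy_from_slice(&lng.to_ne_bytes());
    bytes
}
```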
@@ -1,24 +1,13 @@
use std::cmp::Ordering;
use std::convert::TryFrom;
use std::fs::File;
use std::io::{self, BufReader, BufWriter};
use std::mem::size_of;
use std::str::from_utf8;
use std::io;

use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use serde_json::{from_slice, Value};

use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::UserError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};

/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};

/// Extracts the embedding vector contained in each document under the `_vectors` field.
///
@@ -27,8 +16,9 @@ const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
pub fn extract_vector_points<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    primary_key_id: FieldId,
    vectors_fid: FieldId,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let mut writer = create_writer(
@@ -37,112 +27,43 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
        tempfile::tempfile()?,
    );

    let mut key_buffer = Vec::new();
    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((key, value)) = cursor.move_on_next()? {
        // this must always be serialized as (docid, external_docid);
        let (docid_bytes, external_id_bytes) =
            try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
        debug_assert!(from_utf8(external_id_bytes).is_ok());

    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
        let obkv = obkv::KvReader::new(value);
        key_buffer.clear();
        key_buffer.extend_from_slice(docid_bytes);

        // since we only need the primary key when we throw an error we create this getter to
        // lazily get it when needed
        let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
        let document_id = || -> Value {
            let document_id = obkv.get(primary_key_id).unwrap();
            from_slice(document_id).unwrap()
        };

        // first we retrieve the _vectors field
        if let Some(value) = obkv.get(vectors_fid) {
            let vectors_obkv = KvReaderDelAdd::new(value);
        if let Some(vectors) = obkv.get(vectors_fid) {
            // extract the vectors
            let vectors = match from_slice(vectors) {
                Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
                Err(_) => {
                    return Err(UserError::InvalidVectorsType {
                        document_id: document_id(),
                        value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
                    }
                    .into())
                }
            };

            // then we extract the values
            let del_vectors = vectors_obkv
                .get(DelAdd::Deletion)
                .map(|vectors| extract_vectors(vectors, document_id))
                .transpose()?
                .flatten();
            let add_vectors = vectors_obkv
                .get(DelAdd::Addition)
                .map(|vectors| extract_vectors(vectors, document_id))
                .transpose()?
                .flatten();

            // and we finally push the unique vectors into the writer
            push_vectors_diff(
                &mut writer,
                &mut key_buffer,
                del_vectors.unwrap_or_default(),
                add_vectors.unwrap_or_default(),
            )?;
            if let Some(vectors) = vectors {
                for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
                    let index = u16::try_from(i).unwrap();
                    let mut key = docid_bytes.to_vec();
                    key.extend_from_slice(&index.to_be_bytes());
                    let bytes = cast_slice(&vector);
                    writer.insert(key, bytes)?;
                }
            }
        }
        // else => the `_vectors` object was `null`, there is nothing to do
    }

    writer_into_reader(writer)
}
/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn push_vectors_diff(
    writer: &mut Writer<BufWriter<File>>,
    key_buffer: &mut Vec<u8>,
    mut del_vectors: Vec<Vec<f32>>,
    mut add_vectors: Vec<Vec<f32>>,
) -> Result<()> {
    // We sort and dedup the vectors
    del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
    add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
    del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
    add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());

    let merged_vectors_iter =
        itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));

    // insert vectors into the writer
    for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
        // Generate the key by extending the unique index to it.
        key_buffer.truncate(TRUNCATE_SIZE);
        let index = u16::try_from(i).unwrap();
        key_buffer.extend_from_slice(&index.to_be_bytes());

        match eob {
            EitherOrBoth::Both(_, _) => (), // no need to touch anything
            EitherOrBoth::Left(vector) => {
                // We insert only the Del part of the Obkv to inform
                // that we only want to remove all those vectors.
                let mut obkv = KvWriterDelAdd::memory();
                obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
                let bytes = obkv.into_inner()?;
                writer.insert(&key_buffer, bytes)?;
            }
            EitherOrBoth::Right(vector) => {
                // We insert only the Add part of the Obkv to inform
                // that we only want to add all those vectors.
                let mut obkv = KvWriterDelAdd::memory();
                obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
                let bytes = obkv.into_inner()?;
                writer.insert(&key_buffer, bytes)?;
            }
        }
    }

    Ok(())
}

/// Compares two vectors by using the OrderingFloat helper.
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}

/// Extracts the vectors from a JSON value.
fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
    match from_slice(value) {
        Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
        Err(_) => Err(UserError::InvalidVectorsType {
            document_id: document_id(),
            value: from_slice(value).map_err(InternalError::SerdeJson)?,
        }
        .into()),
    }
}
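`push_vectors_diff` relies on both input lists being sorted and deduplicated so that `itertools::merge_join_by` can classify each vector as kept, removed, or added in a single pass. A small self-contained sketch of that classification over integers:

```rust
use itertools::{merge_join_by, EitherOrBoth};

// Classify sorted, deduplicated del/add lists in one pass: values present on
// both sides are unchanged, left-only values are deletions, right-only
// values are additions -- the same shape as the vector diff above.
fn diff(del: &[i32], add: &[i32]) -> (Vec<i32>, Vec<i32>) {
    let (mut removed, mut added) = (Vec::new(), Vec::new());
    for eob in merge_join_by(del, add, |d, a| d.cmp(a)) {
        match eob {
            EitherOrBoth::Both(_, _) => (),            // unchanged: nothing to write
            EitherOrBoth::Left(d) => removed.push(*d), // only in the old version
            EitherOrBoth::Right(a) => added.push(*a),  // only in the new version
        }
    }
    (removed, added)
}
```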
@@ -1,6 +1,6 @@
use std::collections::{BTreeSet, HashSet};
use std::fs::File;
use std::io::{self, BufReader};
use std::io;

use heed::BytesDecode;
use obkv::KvReaderU16;
@@ -28,11 +28,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
    exact_attributes: &HashSet<FieldId>,
) -> Result<(
    grenad::Reader<BufReader<File>>,
    grenad::Reader<BufReader<File>>,
    grenad::Reader<BufReader<File>>,
)> {
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
@@ -57,17 +53,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
    let document_id = u32::from_be_bytes(document_id_bytes);
    let fid = u16::from_be_bytes(fid_bytes);

    let del_add_reader = KvReaderDelAdd::new(value);
    let del_add_reader = KvReaderDelAdd::new(&value);
    // extract all unique words to remove.
    if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
        for (_pos, word) in KvReaderU16::new(deletion).iter() {
        for (_pos, word) in KvReaderU16::new(&deletion).iter() {
            del_words.insert(word.to_vec());
        }
    }

    // extract all unique additional words.
    if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
        for (_pos, word) in KvReaderU16::new(addition).iter() {
        for (_pos, word) in KvReaderU16::new(&addition).iter() {
            add_words.insert(word.to_vec());
        }
    }
@@ -122,9 +118,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

    // every word contained in an attribute set to exact must be pushed in the exact_words list.
    if exact_attributes.contains(&fid) {
        exact_word_docids_sorter.insert(word.as_bytes(), value)?;
        exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
    } else {
        word_docids_sorter.insert(word.as_bytes(), value)?;
        word_docids_sorter.insert(word.as_bytes(), &value)?;
    }
}

@@ -169,7 +165,7 @@ fn words_into_sorter(
    };

    key_buffer.clear();
    key_buffer.extend_from_slice(word_bytes);
    key_buffer.extend_from_slice(&word_bytes);
    key_buffer.push(0);
    key_buffer.extend_from_slice(&fid.to_be_bytes());
    word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
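The extractor above routes each word either to the exact sorter or to the regular one depending on the field it appeared in. A compact sketch of that routing, with hypothetical `Vec` buckets standing in for the grenad sorters:

```rust
use std::collections::HashSet;

type FieldId = u16;

// Route one (word, fid) observation to the exact or the regular bucket,
// mirroring the `exact_attributes.contains(&fid)` branch above.
fn route<'a>(
    word: &'a str,
    fid: FieldId,
    exact_attributes: &HashSet<FieldId>,
    exact_bucket: &mut Vec<&'a str>,
    regular_bucket: &mut Vec<&'a str>,
) {
    if exact_attributes.contains(&fid) {
        exact_bucket.push(word);
    } else {
        regular_bucket.push(word);
    }
}
```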
@@ -1,6 +1,5 @@
use std::collections::{BTreeMap, VecDeque};
use std::fs::File;
use std::io::BufReader;
use std::{cmp, io};

use obkv::KvReaderU16;

@@ -23,12 +22,13 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();

    let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
        .into_iter()
        .map(|_| {
            create_sorter(
                grenad::SortAlgorithm::Unstable,

@@ -74,7 +74,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
    let (del, add): (Result<_>, Result<_>) = rayon::join(
        || {
            // deletions
            if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
            if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
                for (position, word) in KvReaderU16::new(deletion).iter() {
                    // drain the proximity window until the head word is considered close to the word we are inserting.
                    while del_word_positions.get(0).map_or(false, |(_w, p)| {

@@ -103,7 +103,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
        },
        || {
            // additions
            if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
            if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
                for (position, word) in KvReaderU16::new(addition).iter() {
                    // drain the proximity window until the head word is considered close to the word we are inserting.
                    while add_word_positions.get(0).map_or(false, |(_w, p)| {

@@ -169,7 +169,7 @@ fn document_word_positions_into_sorter(
    document_id: DocumentId,
    del_word_pair_proximity: &BTreeMap<(String, String), u8>,
    add_word_pair_proximity: &BTreeMap<(String, String), u8>,
    word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
    word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
) -> Result<()> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

@@ -200,7 +200,7 @@ fn document_word_positions_into_sorter(
    };

    key_buffer.clear();
    key_buffer.push(*prox);
    key_buffer.push(*prox as u8);
    key_buffer.extend_from_slice(w1.as_bytes());
    key_buffer.push(0);
    key_buffer.extend_from_slice(w2.as_bytes());
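The key construction at the end of that file is worth spelling out. A minimal sketch of the layout it builds (assumed from the code above: a proximity byte, then the two words split by a 0x00 separator, so entries sort by distance first, then by word pair):

```rust
// Sketch of the word-pair-proximity key: `prox | word1 | 0x00 | word2`.
fn proximity_key(prox: u8, w1: &str, w2: &str) -> Vec<u8> {
    let mut key_buffer = Vec::with_capacity(1 + w1.len() + 1 + w2.len());
    key_buffer.push(prox);                       // bucketed distance, 1..MAX_DISTANCE
    key_buffer.extend_from_slice(w1.as_bytes()); // first word
    key_buffer.push(0);                          // separator; assumes words contain no 0x00
    key_buffer.extend_from_slice(w2.as_bytes()); // second word
    key_buffer
}

fn main() {
    let key = proximity_key(2, "quick", "fox");
    assert_eq!(key[0], 2);
    assert_eq!(key[1 + "quick".len()], 0); // the separator sits between the words
}
```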
@@ -1,6 +1,6 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;

use obkv::KvReaderU16;

@@ -22,7 +22,7 @@ use crate::{bucketed_position, DocumentId, Result};
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();

@@ -60,7 +60,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(

    current_document_id = Some(document_id);

    let del_add_reader = KvReaderDelAdd::new(value);
    let del_add_reader = KvReaderDelAdd::new(&value);
    // extract all unique words to remove.
    if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
        for (position, word_bytes) in KvReaderU16::new(deletion).iter() {

@@ -11,7 +11,6 @@ mod extract_word_position_docids;

use std::collections::HashSet;
use std::fs::File;
use std::io::BufReader;

use crossbeam_channel::Sender;
use log::debug;

@@ -28,8 +27,8 @@ use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{
    as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
    MergeFn, MergeableReader,
    as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
    MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::{FieldId, Result};

@@ -38,8 +37,8 @@ use crate::{FieldId, Result};
/// Send data in grenad file over provided Sender.
#[allow(clippy::too_many_arguments)]
pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
    flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
    original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
    flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    searchable_fields: Option<HashSet<FieldId>>,

@@ -63,6 +62,7 @@ pub(crate) fn data_from_obkv_documents(
            indexer,
            lmdb_writer_sx.clone(),
            vectors_field_id,
            primary_key_id,
        )
    })
    .collect::<Result<()>>()?;

@@ -107,7 +107,7 @@ pub(crate) fn data_from_obkv_documents(
    let lmdb_writer_sx = lmdb_writer_sx.clone();
    rayon::spawn(move || {
        debug!("merge {} database", "facet-id-exists-docids");
        match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
        match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
            Ok(reader) => {
                let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
            }

@@ -123,7 +123,7 @@ pub(crate) fn data_from_obkv_documents(
    let lmdb_writer_sx = lmdb_writer_sx.clone();
    rayon::spawn(move || {
        debug!("merge {} database", "facet-id-is-null-docids");
        match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
        match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
            Ok(reader) => {
                let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
            }

@@ -139,7 +139,7 @@ pub(crate) fn data_from_obkv_documents(
    let lmdb_writer_sx = lmdb_writer_sx.clone();
    rayon::spawn(move || {
        debug!("merge {} database", "facet-id-is-empty-docids");
        match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
        match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
            Ok(reader) => {
                let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
            }

@@ -150,22 +150,22 @@ pub(crate) fn data_from_obkv_documents(
    });
}

    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
        extract_word_pair_proximity_docids,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordPairProximityDocids,
        "word-pair-proximity-docids",
    );

    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
        extract_fid_word_count_docids,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        TypedChunk::FieldIdWordCountDocids,
        "field-id-wordcount-docids",
    );

@@ -173,17 +173,13 @@ pub(crate) fn data_from_obkv_documents(
    spawn_extraction_task::<
        _,
        _,
        Vec<(
            grenad::Reader<BufReader<File>>,
            grenad::Reader<BufReader<File>>,
            grenad::Reader<BufReader<File>>,
        )>,
        Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
    >(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
        move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
            TypedChunk::WordDocids {
                word_docids_reader,

@@ -194,32 +190,32 @@ pub(crate) fn data_from_obkv_documents(
        "word-docids",
    );

    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
        extract_word_position_docids,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordPositionDocids,
        "word-position-docids",
    );

    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        fid_docid_facet_strings_chunks,
        indexer,
        lmdb_writer_sx.clone(),
        extract_facet_string_docids,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        TypedChunk::FieldIdFacetStringDocids,
        "field-id-facet-string-docids",
    );

    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
        fid_docid_facet_numbers_chunks,
        indexer,
        lmdb_writer_sx,
        extract_facet_number_docids,
        merge_deladd_cbo_roaring_bitmaps,
        merge_cbo_roaring_bitmaps,
        TypedChunk::FieldIdFacetNumberDocids,
        "field-id-facet-number-docids",
    );

@@ -269,10 +265,11 @@ fn spawn_extraction_task<FE, FS, M>(
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
    original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
    original_documents_chunk: Result<grenad::Reader<File>>,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    vectors_field_id: Option<FieldId>,
    primary_key_id: FieldId,
) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

@@ -281,7 +278,12 @@ fn send_original_documents_data(
    let documents_chunk_cloned = original_documents_chunk.clone();
    let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
    rayon::spawn(move || {
        let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
        let result = extract_vector_points(
            documents_chunk_cloned,
            indexer,
            primary_key_id,
            vectors_field_id,
        );
        let _ = match result {
            Ok(vector_points) => {
                lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))

@@ -305,7 +307,7 @@ fn send_original_documents_data(
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn send_and_extract_flattened_documents_data(
    flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
    flattened_documents_chunk: Result<grenad::Reader<File>>,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    searchable_fields: &Option<HashSet<FieldId>>,

@@ -322,10 +324,7 @@ fn send_and_extract_flattened_documents_data(
    grenad::Reader<CursorClonableMmap>,
    (
        grenad::Reader<CursorClonableMmap>,
        (
            grenad::Reader<BufReader<File>>,
            (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
        ),
        (grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
    ),
)> {

@@ -348,7 +347,7 @@ fn send_and_extract_flattened_documents_data(
    let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
                let (docid_word_positions_chunk, script_language_pair) =
                let (documents_ids, docid_word_positions_chunk, script_language_pair) =
                    extract_docid_word_positions(
                        flattened_documents_chunk.clone(),
                        indexer,

@@ -359,6 +358,9 @@ fn send_and_extract_flattened_documents_data(
                    max_positions_per_attributes,
                )?;

                // send documents_ids to DB writer
                let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));

                // send docid_word_positions_chunk to DB writer
                let docid_word_positions_chunk =
                    unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
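Stepping back from the hunks: `data_from_obkv_documents` fans each extractor out onto the rayon pool and funnels every finished chunk through one crossbeam channel to the single LMDB writer. A simplified, self-contained sketch of that pattern (the `Chunk` enum and payloads are illustrative, not milli's `TypedChunk`):

```rust
// Sketch of the extraction fan-out: many rayon tasks, one writer loop.
use crossbeam_channel::{unbounded, Sender};

#[derive(Debug)]
enum Chunk {
    WordDocids(Vec<u8>),
    WordPositionDocids(Vec<u8>),
}

fn spawn_extractor(sx: Sender<Chunk>, make: fn(Vec<u8>) -> Chunk) {
    rayon::spawn(move || {
        let data = vec![/* ... extracted and merged data ... */];
        // Sending may fail if the writer already hung up; that is fine here.
        let _ = sx.send(make(data));
    });
}

fn main() {
    let (sx, rx) = unbounded();
    spawn_extractor(sx.clone(), Chunk::WordDocids);
    spawn_extractor(sx.clone(), Chunk::WordPositionDocids);
    // The original sender must be dropped, otherwise the receive loop below
    // would wait forever -- the same reason as the `drop(lmdb_writer_sx)` above.
    drop(sx);
    for chunk in rx {
        println!("writing {chunk:?} into LMDB");
    }
}
```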
@@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Seek};
use std::io::{self, Seek};
use std::time::Instant;

use grenad::{CompressionType, Sorter};

@@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
) -> grenad::Writer<BufWriter<R>> {
) -> grenad::Writer<R> {
    let mut builder = grenad::Writer::builder();
    builder.compression_type(typ);
    if let Some(level) = level {
        builder.compression_level(level);
    }
    builder.build(BufWriter::new(file))
    builder.build(file)
}

pub fn create_sorter(

@@ -47,14 +47,13 @@ pub fn create_sorter(
        builder.allow_realloc(false);
    }
    builder.sort_algorithm(sort_algorithm);
    builder.sort_in_parallel(true);
    builder.build()
}

pub fn sorter_into_reader(
    sorter: grenad::Sorter<MergeFn>,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
    puffin::profile_function!();
    let mut writer = create_writer(
        indexer.chunk_compression_type,

@@ -66,18 +65,16 @@ pub fn sorter_into_reader(
    writer_into_reader(writer)
}

pub fn writer_into_reader(
    writer: grenad::Writer<BufWriter<File>>,
) -> Result<grenad::Reader<BufReader<File>>> {
    let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
    let mut file = writer.into_inner()?;
    file.rewind()?;
    grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
    grenad::Reader::new(file).map_err(Into::into)
}
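The buffering change visible in this file has a simple rationale: wrapping the inner `File` in `BufWriter` while writing and `BufReader` while reading avoids one syscall per tiny key/value, at the cost of the unwrap dance `writer_into_reader` performs (flush the `BufWriter`, rewind the file, reopen it buffered). A standalone illustration using only the standard library plus the `tempfile` crate (the file source is an assumption for the example):

```rust
// Illustration of the flush-rewind-reread pattern from writer_into_reader.
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Read, Seek, Write};

fn write_then_read(file: File) -> io::Result<Vec<u8>> {
    let mut writer = BufWriter::new(file);
    for i in 0u32..1_000 {
        writer.write_all(&i.to_be_bytes())?; // buffered: few actual syscalls
    }
    // into_inner flushes and returns the File; its error wraps the io::Error.
    let mut file = writer.into_inner().map_err(|err| err.into_error())?;
    file.rewind()?; // seek back to the start before reading

    let mut contents = Vec::new();
    BufReader::new(file).read_to_end(&mut contents)?;
    Ok(contents)
}

fn main() -> io::Result<()> {
    let file = tempfile::tempfile()?; // assumes the tempfile crate is available
    assert_eq!(write_then_read(file)?.len(), 4_000);
    Ok(())
}
```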
pub unsafe fn as_cloneable_grenad(
    reader: &grenad::Reader<BufReader<File>>,
    reader: &grenad::Reader<File>,
) -> Result<grenad::Reader<CursorClonableMmap>> {
    let file = reader.get_ref().get_ref();
    let file = reader.get_ref();
    let mmap = memmap2::Mmap::map(file)?;
    let cursor = io::Cursor::new(ClonableMmap::from(mmap));
    let reader = grenad::Reader::new(cursor)?;

@@ -93,8 +90,8 @@ where
    fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
}

impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
    type Output = grenad::Reader<BufReader<File>>;
impl MergeableReader for Vec<grenad::Reader<File>> {
    type Output = grenad::Reader<File>;

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut merger = MergerBuilder::new(merge_fn);

@@ -103,8 +100,8 @@ impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
    }
}

impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
    type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
    type Output = (grenad::Reader<File>, grenad::Reader<File>);

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut m1 = MergerBuilder::new(merge_fn);

@@ -117,18 +114,8 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu
    }
}

impl MergeableReader
    for Vec<(
        grenad::Reader<BufReader<File>>,
        grenad::Reader<BufReader<File>>,
        grenad::Reader<BufReader<File>>,
    )>
{
    type Output = (
        grenad::Reader<BufReader<File>>,
        grenad::Reader<BufReader<File>>,
        grenad::Reader<BufReader<File>>,
    );
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
    type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut m1 = MergerBuilder::new(merge_fn);

@@ -155,7 +142,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
        Ok(())
    }

    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
        let merger = self.0.build();
        let mut writer = create_writer(
            params.chunk_compression_type,

@@ -206,7 +193,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
    reader: grenad::Reader<R>,
    indexer: GrenadParameters,
    documents_chunk_size: usize,
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
    let mut continue_reading = true;
    let mut cursor = reader.into_cursor()?;

@@ -223,13 +210,11 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
    );

    while let Some((document_id, obkv)) = cursor.move_on_next()? {
        if !obkv.is_empty() {
            obkv_documents.insert(document_id, obkv)?;
            current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
        obkv_documents.insert(document_id, obkv)?;
        current_chunk_size += document_id.len() as u64 + obkv.len() as u64;

            if current_chunk_size >= documents_chunk_size as u64 {
                return writer_into_reader(obkv_documents).map(Some);
            }
        if current_chunk_size >= documents_chunk_size as u64 {
            return writer_into_reader(obkv_documents).map(Some);
        }
    }
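The chunking policy in `grenad_obkv_into_chunks` is easy to model in isolation: documents accumulate into the current chunk until a byte budget is reached, then the chunk is sealed and a fresh one starts. A minimal sketch under that reading (indices stand in for the sealed grenad readers):

```rust
// Sketch of the byte-budget chunking used by grenad_obkv_into_chunks.
fn chunk_documents(docs: &[(Vec<u8>, Vec<u8>)], chunk_size: u64) -> Vec<Vec<usize>> {
    let mut chunks = Vec::new();
    let mut current = Vec::new();
    let mut current_chunk_size = 0u64;
    for (i, (document_id, obkv)) in docs.iter().enumerate() {
        current.push(i);
        // Both the key and the value count against the budget.
        current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
        if current_chunk_size >= chunk_size {
            chunks.push(std::mem::take(&mut current)); // seal the chunk
            current_chunk_size = 0;
        }
    }
    if !current.is_empty() {
        chunks.push(current); // last, possibly undersized, chunk
    }
    chunks
}

fn main() {
    let docs = vec![(vec![0; 4], vec![0; 60]); 10]; // 64 bytes per document
    assert_eq!(chunk_documents(&docs, 128).len(), 5); // two documents per chunk
}
```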
@@ -157,7 +157,7 @@ fn inner_merge_del_add_obkvs<'a>(
    let mut acc = newest[1..].to_vec();
    let mut buffer = Vec::new();
    // reverse iter from the most recent to the oldest.
    for current in obkvs.iter().rev() {
    for current in obkvs.into_iter().rev() {
        // if in the previous iteration there was a complete deletion,
        // stop the merge process.
        if acc_operation_type == Operation::Deletion as u8 {
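The merge loop in that hunk replays operations from most recent to oldest and stops as soon as a complete deletion is seen, because nothing older can survive it. A hedged, self-contained model of that control flow (the `Operation` enum here is a stand-in for milli's, which also carries obkv payloads):

```rust
// Model of inner_merge_del_add_obkvs' early-exit walk over stacked operations.
#[derive(Clone, Copy, PartialEq)]
enum Operation {
    Deletion,
    Addition,
}

fn effective_operations(obkvs: &[Operation]) -> Vec<Operation> {
    let mut kept = Vec::new();
    // reverse iter from the most recent to the oldest.
    for current in obkvs.iter().rev() {
        kept.push(*current);
        // a complete deletion wipes everything older: stop the merge process.
        if *current == Operation::Deletion {
            break;
        }
    }
    kept.reverse();
    kept
}

fn main() {
    use Operation::*;
    // The addition that precedes the deletion is dead weight and is skipped.
    assert_eq!(effective_operations(&[Addition, Deletion, Addition]).len(), 2);
}
```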
@@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
    WordPrefixIntegerDocids, WordsPrefixesFst,
    self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
    WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{CboRoaringBitmapCodec, Index, Result};

@@ -89,6 +89,7 @@ pub struct IndexDocumentsConfig {
    pub words_positions_level_group_size: Option<NonZeroU32>,
    pub words_positions_min_level_size: Option<NonZeroU32>,
    pub update_method: IndexDocumentsMethod,
    pub deletion_strategy: DeletionStrategy,
    pub autogenerate_docids: bool,
}
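A sketch of how the `deletion_strategy` knob slots into that config struct. Only `AlwaysHard` is actually exercised by the tests further down; the other variants and the default shown here are assumptions about milli's enum, not confirmed by this diff:

```rust
// Hypothetical shape of the deletion-strategy configuration.
#[derive(Debug, Clone, Copy, Default)]
enum DeletionStrategy {
    #[default]
    Dynamic,    // assumption: let the engine pick soft or hard deletion
    AlwaysSoft, // assumption: only mark documents as deleted
    AlwaysHard, // purge documents from every database immediately (used in tests)
}

#[derive(Debug, Default)]
struct IndexDocumentsConfig {
    deletion_strategy: DeletionStrategy,
    autogenerate_docids: bool,
}

fn main() {
    let mut config = IndexDocumentsConfig::default();
    // Mirrors `index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;`
    config.deletion_strategy = DeletionStrategy::AlwaysHard;
    println!("{config:?}");
}
```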
@@ -180,7 +181,6 @@ where

    // Early return when there is no document to add
    if to_delete.is_empty() {
        // Maintains Invariant: remove documents actually always returns Ok for the inner result
        return Ok((self, Ok(0)));
    }

@@ -193,7 +193,6 @@ where

    self.deleted_documents += deleted_documents;

    // Maintains Invariant: remove documents actually always returns Ok for the inner result
    Ok((self, Ok(deleted_documents)))
}

@@ -201,7 +200,7 @@ where
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
    puffin::profile_function!();

    if self.added_documents == 0 && self.deleted_documents == 0 {
    if self.added_documents == 0 {
        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
        return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
    }

@@ -245,6 +244,9 @@ where
    primary_key,
    fields_ids_map,
    field_distribution,
    new_external_documents_ids,
    new_documents_ids,
    replaced_documents_ids,
    documents_count,
    original_documents,
    flattened_documents,

@@ -368,12 +370,29 @@ where
        let _ = lmdb_writer_sx.send(Err(e));
    }

    // needs to be dropped to avoid channel waiting lock.
    // needs to be droped to avoid channel waiting lock.
    drop(lmdb_writer_sx)
});

let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
// We delete the documents that this document addition replaces. This way we are
// able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() {
    let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
    deletion_builder.strategy(self.config.deletion_strategy);
    debug!("documents to delete {:?}", replaced_documents_ids);
    deletion_builder.delete_documents(&replaced_documents_ids);
    let deleted_documents_result = deletion_builder.execute_inner()?;
    debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
}

let index_documents_ids = self.index.documents_ids(self.wtxn)?;
let index_is_empty = index_documents_ids.is_empty();
let mut final_documents_ids = RoaringBitmap::new();
let mut word_pair_proximity_docids = None;
let mut word_position_docids = None;
let mut word_fid_docids = None;
let mut word_docids = None;
let mut exact_word_docids = None;

let mut databases_seen = 0;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {

@@ -386,9 +405,38 @@ where
    return Err(Error::InternalError(InternalError::AbortedIndexation));
}

let typed_chunk = result?;
let typed_chunk = match result? {
    TypedChunk::WordDocids {
        word_docids_reader,
        exact_word_docids_reader,
        word_fid_docids_reader,
    } => {
        let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
        word_docids = Some(cloneable_chunk);
        let cloneable_chunk =
            unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
        exact_word_docids = Some(cloneable_chunk);
        let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
        word_fid_docids = Some(cloneable_chunk);
        TypedChunk::WordDocids {
            word_docids_reader,
            exact_word_docids_reader,
            word_fid_docids_reader,
        }
    }
    TypedChunk::WordPairProximityDocids(chunk) => {
        let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
        word_pair_proximity_docids = Some(cloneable_chunk);
        TypedChunk::WordPairProximityDocids(chunk)
    }
    TypedChunk::WordPositionDocids(chunk) => {
        let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
        word_position_docids = Some(cloneable_chunk);
        TypedChunk::WordPositionDocids(chunk)
    }
    otherwise => otherwise,
};

// FIXME: return newly added as well as newly deleted documents
let (docids, is_merged_database) =
    write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
if !docids.is_empty() {

@@ -418,6 +466,15 @@ where
// We write the primary key field id into the main database
self.index.put_primary_key(self.wtxn, &primary_key)?;

// We write the external documents ids into the main database.
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
external_documents_ids.insert_ids(&new_external_documents_ids)?;
let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;

let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;

// TODO: reactivate prefix DB with diff-indexing
// self.execute_prefix_databases(
//     word_docids,

@@ -427,7 +484,7 @@ where
//     word_fid_docids,
// )?;

self.index.number_of_documents(self.wtxn)
Ok(all_documents_ids.len())
}

#[logging_timer::time("IndexDocuments::{}")]
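One line in that hunk carries the whole bookkeeping: `let all_documents_ids = index_documents_ids | new_documents_ids;`. The final document set is the union of the ids already stored and the newly added ones, and `RoaringBitmap` implements that union as the `|` operator. A tiny standalone sketch:

```rust
// The bitmap arithmetic behind the final documents_ids write.
use roaring::RoaringBitmap;

fn main() {
    let index_documents_ids: RoaringBitmap = (0..4).collect(); // already indexed
    let new_documents_ids: RoaringBitmap = (2..6).collect();   // just added

    // Union by reference leaves both operands usable afterwards.
    let all_documents_ids = &index_documents_ids | &new_documents_ids;
    assert_eq!(all_documents_ids.len(), 6); // 0..6, overlaps counted once
}
```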
@@ -661,15 +718,14 @@ fn execute_word_prefix_docids(
#[cfg(test)]
mod tests {
    use big_s::S;
    use fst::IntoStreamer;
    use heed::RwTxn;
    use maplit::hashset;

    use super::*;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::search::TermsMatchingStrategy;
    use crate::{db_snap, Filter, Search, BEU16};
    use crate::update::DeleteDocuments;
    use crate::{db_snap, BEU16};

    #[test]
    fn simple_document_replacement() {

@@ -760,10 +816,11 @@ mod tests {
        assert_eq!(count, 1);

        // Check that we get only one document from the database.
        let docs = index.documents(&rtxn, Some(0)).unwrap();
        // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
        let docs = index.documents(&rtxn, Some(1)).unwrap();
        assert_eq!(docs.len(), 1);
        let (id, doc) = docs[0];
        assert_eq!(id, 0);
        assert_eq!(id, 1);

        // Check that this document is equal to the last one sent.
        let mut doc_iter = doc.iter();

@@ -824,7 +881,7 @@ mod tests {
        assert_eq!(count, 3);

        // the document 0 has been deleted and reinserted with the id 3
        let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
        let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
        let kevin_position =
            docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
        assert_eq!(kevin_position, 2);

@@ -970,6 +1027,7 @@ mod tests {
        assert_eq!(count, 6);

        db_snap!(index, word_docids, "updated");
        db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");

        drop(rtxn);
    }

@@ -1072,15 +1130,17 @@ mod tests {
            { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
        ]))
        .unwrap();
        let mut wtxn = index.write_txn().unwrap();
        assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));

        // Delete not all of the documents but some of them.
        index.delete_document("30");
        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.delete_external_id("30");
        builder.execute().unwrap();

        let txn = index.read_txn().unwrap();
        assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));

        let external_documents_ids = index.external_documents_ids();
        assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
        let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
        assert!(external_documents_ids.get("30").is_none());
        wtxn.commit().unwrap();

        index
            .add_documents(documents!([

@@ -1089,8 +1149,8 @@ mod tests {
            .unwrap();

        let wtxn = index.write_txn().unwrap();
        let external_documents_ids = index.external_documents_ids();
        assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
        let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
        assert!(external_documents_ids.get("30").is_some());
        wtxn.commit().unwrap();

        index

@@ -1384,10 +1444,8 @@ mod tests {
        index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap();

        let rtxn = index.read_txn().unwrap();
        let all_documents_count = index.all_documents(&rtxn).unwrap().count();
        assert_eq!(all_documents_count, 1);
        let external_documents_ids = index.external_documents_ids();
        assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
        let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
        assert!(external_documents_ids.get("1").is_some());
    }

    #[test]

@@ -1652,7 +1710,7 @@ mod tests {

        let wtxn = index.read_txn().unwrap();

        let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
        let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
        let ids = map.values().collect::<HashSet<_>>();

        assert_eq!(ids.len(), map.len());
@@ -2464,8 +2522,17 @@ mod tests {
        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");

        let mut wtxn = index.write_txn().unwrap();

        // Delete not all of the documents but some of them.
        index.delete_documents(vec!["0".into(), "3".into()]);
        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
        builder.strategy(DeletionStrategy::AlwaysHard);
        builder.delete_external_id("0");
        builder.delete_external_id("3");
        let result = builder.execute().unwrap();
        println!("{result:?}");

        wtxn.commit().unwrap();

        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");

@@ -2520,7 +2587,8 @@ mod tests {
            ),
        ]
        */
        let index = TempIndex::new();
        let mut index = TempIndex::new();
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;

        // START OF BATCH

@@ -2560,7 +2628,8 @@ mod tests {
        {"id":1,"doggo":"bernese"}
        "###);
        db_snap!(index, external_documents_ids, @r###"
        docids:
        soft:
        hard:
        1                        0
        "###);

@@ -2605,10 +2674,13 @@ mod tests {
        "###);

        db_snap!(index, external_documents_ids, @r###"
        docids:
        soft:
        hard:
        0                        1
        "###);

        db_snap!(index, soft_deleted_documents_ids, @"[]");

        // BATCH 3

        println!("--- ENTERING BATCH 3");

@@ -2650,537 +2722,4 @@ mod tests {
        let res = index.search(&rtxn).execute().unwrap();
        index.documents(&rtxn, res.documents_ids).unwrap();
    }

    fn delete_documents<'t>(
        wtxn: &mut RwTxn<'t, '_>,
        index: &'t TempIndex,
        external_ids: &[&str],
    ) -> Vec<u32> {
        let external_document_ids = index.external_documents_ids();
        let ids_to_delete: Vec<u32> = external_ids
            .iter()
            .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
            .collect();

        // Delete some documents.
        index.delete_documents_using_wtxn(
            wtxn,
            external_ids.iter().map(ToString::to_string).collect(),
        );

        ids_to_delete
    }

    #[test]
    fn delete_documents_with_numbers_as_primary_key() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
                    { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
                    { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
                ]),
            )
            .unwrap();

        // delete those documents, ids are synchronous therefore 0, 1, and 2.
        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);

        wtxn.commit().unwrap();

        // All these snapshots should be empty since the database was cleared
        db_snap!(index, documents_ids);
        db_snap!(index, word_docids);
        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, facet_id_exists_docids);

        let rtxn = index.read_txn().unwrap();

        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
    }

    #[test]
    fn delete_documents_with_strange_primary_key() {
        let index = TempIndex::new();

        index
            .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
            .unwrap();

        let mut wtxn = index.write_txn().unwrap();
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "mysuperid": 0, "name": "kevin" },
                    { "mysuperid": 1, "name": "kevina" },
                    { "mysuperid": 2, "name": "benoit" }
                ]),
            )
            .unwrap();
        wtxn.commit().unwrap();

        let mut wtxn = index.write_txn().unwrap();

        // Delete not all of the documents but some of them.
        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);

        wtxn.commit().unwrap();

        db_snap!(index, documents_ids);
        db_snap!(index, word_docids);
        db_snap!(index, word_pair_proximity_docids);
    }

    #[test]
    fn filtered_placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
                settings.set_filterable_fields(hashset! { S("label"), S("label2") });
            })
            .unwrap();

        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "docid": "1_4", "label": ["sign"] },
                    { "docid": "1_5", "label": ["letter"] },
                    { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                    { "docid": "1_39", "label": ["abstract"] },
                    { "docid": "1_40", "label": ["cartoon"] },
                    { "docid": "1_41", "label": ["art","drawing"] },
                    { "docid": "1_42", "label": ["art","pattern"] },
                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                    { "docid": "1_44", "label": ["drawing"] },
                    { "docid": "1_45", "label": ["art"] },
                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                    { "docid": "1_47", "label": ["abstract","pattern"] },
                    { "docid": "1_52", "label": ["abstract","cartoon"] },
                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                    { "docid": "1_68", "label": ["design"] },
                    { "docid": "1_69", "label": ["geometry"] },
                    { "docid": "1_70", "label2": ["geometry", 1.2] },
                    { "docid": "1_71", "label2": ["design", 2.2] },
                    { "docid": "1_72", "label2": ["geometry", 1.2] }
                ]),
            )
            .unwrap();

        delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);

        // Placeholder search with filter
        let filter = Filter::from_str("label = sign").unwrap().unwrap();
        let results = index.search(&wtxn).filter(filter).execute().unwrap();
        assert!(results.documents_ids.is_empty());

        wtxn.commit().unwrap();

        db_snap!(index, word_docids);
        db_snap!(index, facet_id_f64_docids);
        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, facet_id_exists_docids);
        db_snap!(index, facet_id_string_docids);
    }

    #[test]
    fn placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "docid": "1_4", "label": ["sign"] },
                    { "docid": "1_5", "label": ["letter"] },
                    { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                    { "docid": "1_39", "label": ["abstract"] },
                    { "docid": "1_40", "label": ["cartoon"] },
                    { "docid": "1_41", "label": ["art","drawing"] },
                    { "docid": "1_42", "label": ["art","pattern"] },
                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                    { "docid": "1_44", "label": ["drawing"] },
                    { "docid": "1_45", "label": ["art"] },
                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                    { "docid": "1_47", "label": ["abstract","pattern"] },
                    { "docid": "1_52", "label": ["abstract","cartoon"] },
                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                    { "docid": "1_68", "label": ["design"] },
                    { "docid": "1_69", "label": ["geometry"] },
                    { "docid": "1_70", "label2": ["geometry", 1.2] },
                    { "docid": "1_71", "label2": ["design", 2.2] },
                    { "docid": "1_72", "label2": ["geometry", 1.2] }
                ]),
            )
            .unwrap();

        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);

        // Placeholder search
        let results = index.search(&wtxn).execute().unwrap();
        assert!(!results.documents_ids.is_empty());
        for id in results.documents_ids.iter() {
            assert!(
                !deleted_internal_ids.contains(id),
                "The document {} was supposed to be deleted",
                id
            );
        }

        wtxn.commit().unwrap();
    }

    #[test]
    fn search_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "docid": "1_4", "label": ["sign"] },
                    { "docid": "1_5", "label": ["letter"] },
                    { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                    { "docid": "1_39", "label": ["abstract"] },
                    { "docid": "1_40", "label": ["cartoon"] },
                    { "docid": "1_41", "label": ["art","drawing"] },
                    { "docid": "1_42", "label": ["art","pattern"] },
                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                    { "docid": "1_44", "label": ["drawing"] },
                    { "docid": "1_45", "label": ["art"] },
                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                    { "docid": "1_47", "label": ["abstract","pattern"] },
                    { "docid": "1_52", "label": ["abstract","cartoon"] },
                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                    { "docid": "1_68", "label": ["design"] },
                    { "docid": "1_69", "label": ["geometry"] },
                    { "docid": "1_70", "label2": ["geometry", 1.2] },
                    { "docid": "1_71", "label2": ["design", 2.2] },
                    { "docid": "1_72", "label2": ["geometry", 1.2] }
                ]),
            )
            .unwrap();

        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);

        // search for abstract
        let results = index.search(&wtxn).query("abstract").execute().unwrap();
        assert!(!results.documents_ids.is_empty());
        for id in results.documents_ids.iter() {
            assert!(
                !deleted_internal_ids.contains(id),
                "The document {} was supposed to be deleted",
                id
            );
        }

        wtxn.commit().unwrap();
    }

    #[test]
    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("id"));
                settings.set_filterable_fields(hashset!(S("_geo")));
                settings.set_sortable_fields(hashset!(S("_geo")));
            })
            .unwrap();

        index.add_documents_using_wtxn(&mut wtxn, documents!([
            { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
            { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
            { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
            { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
            { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
            { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
            { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
            { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
            { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
            { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
            { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
            { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
            { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
            { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
            { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
            { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
            { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
            { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
            { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
            { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
        ])).unwrap();

        let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);

        // Placeholder search with geo filter
        let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
        let results = index.search(&wtxn).filter(filter).execute().unwrap();
        assert!(!results.documents_ids.is_empty());
        for id in results.documents_ids.iter() {
            assert!(
                !deleted_internal_ids.contains(id),
                "The document {} was supposed to be deleted",
                id
            );
        }

        wtxn.commit().unwrap();

        db_snap!(index, facet_id_f64_docids);
        db_snap!(index, facet_id_string_docids);
    }

    #[test]
    fn get_documents_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "docid": "1_4", "label": ["sign"] },
                    { "docid": "1_5", "label": ["letter"] },
                    { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] },
                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
                    { "docid": "1_39", "label": ["abstract"] },
                    { "docid": "1_40", "label": ["cartoon"] },
                    { "docid": "1_41", "label": ["art","drawing"] },
                    { "docid": "1_42", "label": ["art","pattern"] },
                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
                    { "docid": "1_44", "label": ["drawing"] },
                    { "docid": "1_45", "label": ["art"] },
                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
                    { "docid": "1_47", "label": ["abstract","pattern"] },
                    { "docid": "1_52", "label": ["abstract","cartoon"] },
                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
                    { "docid": "1_68", "label": ["design"] },
                    { "docid": "1_69", "label": ["geometry"] },
                    { "docid": "1_70", "label2": ["geometry", 1.2] },
                    { "docid": "1_71", "label2": ["design", 2.2] },
                    { "docid": "1_72", "label2": ["geometry", 1.2] }
                ]),
            )
            .unwrap();

        let deleted_external_ids = ["1_7", "1_52"];
        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);

        // list all documents
        let results = index.all_documents(&wtxn).unwrap();
        for result in results {
            let (id, _) = result.unwrap();
            assert!(
                !deleted_internal_ids.contains(&id),
                "The document {} was supposed to be deleted",
                id
            );
        }

        // list internal document ids
        let results = index.documents_ids(&wtxn).unwrap();
        for id in results {
            assert!(
                !deleted_internal_ids.contains(&id),
                "The document {} was supposed to be deleted",
                id
            );
        }
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();

        // get internal docids from deleted external document ids
        let results = index.external_documents_ids();
        for id in deleted_external_ids {
            assert!(
                results.get(&rtxn, id).unwrap().is_none(),
                "The document {} was supposed to be deleted",
                id
            );
        }
        drop(rtxn);
    }

    #[test]
    fn stats_should_not_return_deleted_documents() {
        let index = TempIndex::new();

        let mut wtxn = index.write_txn().unwrap();

        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
                settings.set_primary_key(S("docid"));
            })
            .unwrap();

        index.add_documents_using_wtxn(&mut wtxn, documents!([
            { "docid": "1_4", "label": ["sign"]},
            { "docid": "1_5", "label": ["letter"]},
            { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
            { "docid": "1_39", "label": ["abstract"]},
            { "docid": "1_40", "label": ["cartoon"]},
            { "docid": "1_41", "label": ["art","drawing"]},
            { "docid": "1_42", "label": ["art","pattern"]},
            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
            { "docid": "1_45", "label": ["art"]},
            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
            { "docid": "1_47", "label": ["abstract","pattern"]},
            { "docid": "1_52", "label": ["abstract","cartoon"]},
            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
            { "docid": "1_68", "label": ["design"]},
            { "docid": "1_69", "label": ["geometry"]}
        ])).unwrap();

        delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);

        // count internal documents
        let results = index.number_of_documents(&wtxn).unwrap();
        assert_eq!(18, results);

        // count field distribution
        let results = index.field_distribution(&wtxn).unwrap();
        assert_eq!(Some(&18), results.get("label"));
        assert_eq!(Some(&1), results.get("title"));
        assert_eq!(Some(&2), results.get("number"));

        wtxn.commit().unwrap();
    }

    #[test]
    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
        use charabia::{Language, Script};
        let index = TempIndex::new();
        let mut wtxn = index.write_txn().unwrap();
        index
            .add_documents_using_wtxn(
                &mut wtxn,
                documents!([
                    { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
                    { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
                    { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
                    { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
                    { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
                    { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
                ]))
            .unwrap();

        let key_cmn = (Script::Cj, Language::Cmn);
        let cj_cmn_docs =
            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
        let mut expected_cj_cmn_docids = RoaringBitmap::new();
        expected_cj_cmn_docids.push(1);
        expected_cj_cmn_docids.push(5);
        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);

        delete_documents(&mut wtxn, &index, &["1"]);
        wtxn.commit().unwrap();

        let rtxn = index.read_txn().unwrap();
        let cj_cmn_docs =
            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
        let mut expected_cj_cmn_docids = RoaringBitmap::new();
        expected_cj_cmn_docids.push(5);
        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
    }

    #[test]
    fn delete_words_exact_attributes() {
        let index = TempIndex::new();

        index
            .update_settings(|settings| {
                settings.set_primary_key(S("id"));
                settings.set_searchable_fields(vec![S("text"), S("exact")]);
                settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
            })
            .unwrap();

        index
            .add_documents(documents!([
                { "id": 0, "text": "hello" },
                { "id": 1, "exact": "hello"}
            ]))
            .unwrap();
        db_snap!(index, word_docids, 1, @r###"
        hello            [0, ]
        "###);
        db_snap!(index, exact_word_docids, 1, @r###"
        hello            [1, ]
        "###);
        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

        let mut wtxn = index.write_txn().unwrap();
        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
        wtxn.commit().unwrap();

        db_snap!(index, word_docids, 2, @r###"
        hello            [0, ]
        "###);
        db_snap!(index, exact_word_docids, 2, @"");
        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");

        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
        let txn = index.read_txn().unwrap();
        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);

        let mut s = Search::new(&txn, &index);
        s.query("hello");
        let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
    }
}
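A recurring thread in those tests is the external-to-internal id lookup: external ids (the user-facing primary keys, e.g. "1_52") resolve to internal u32 docids, and a deleted document must stop resolving. A hedged mock of that contract, deliberately ignoring the transaction plumbing the two API shapes in the diff disagree on:

```rust
// Mock of the external_documents_ids lookup the deletion tests assert on.
use std::collections::HashMap;

struct ExternalDocumentsIds(HashMap<String, u32>);

impl ExternalDocumentsIds {
    fn get(&self, external_id: &str) -> Option<u32> {
        self.0.get(external_id).copied()
    }
    fn delete(&mut self, external_id: &str) -> Option<u32> {
        self.0.remove(external_id)
    }
}

fn main() {
    let mut ids = ExternalDocumentsIds(HashMap::from([("30".into(), 0), ("31".into(), 1)]));
    assert_eq!(ids.get("30"), Some(0));
    ids.delete("30");
    assert!(ids.get("30").is_none()); // mirrors the post-deletion asserts above
}
```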
@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---
[]
@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---
[2, ]
@@ -1,5 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---
benoit [2, ]

@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,5 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---
2 0 2.2 1 [21, ]

@@ -1,4 +0,0 @@
---
source: milli/src/update/index_documents/mod.rs
---

@@ -1,56 +1,60 @@
---
source: milli/src/update/index_documents/mod.rs
---
0 [1, ]
0 [1, 7, ]
1 [2, ]
10 [1, ]
12 [0, ]
10 [1, 7, ]
12 [0, 8, ]
1344 [3, ]
1813 [0, ]
2 [0, ]
1813 [8, ]
2 [0, 8, ]
23 [5, ]
25 [2, ]
3 [0, ]
3 [0, 8, ]
35 [5, ]
4 [4, ]
42 [0, 5, ]
456 [1, ]
5 [0, ]
4 [4, 6, ]
42 [0, 5, 8, ]
456 [1, 7, ]
5 [0, 8, ]
99 [2, ]
adams [5, ]
adventure [1, ]
adventure [1, 7, ]
alice [2, ]
and [0, 4, ]
antoine [1, ]
austen [0, ]
blood [4, ]
and [0, 4, 6, 8, ]
antoine [1, 7, ]
austen [8, ]
austin [0, ]
blood [4, 6, ]
carroll [2, ]
de [1, ]
de [1, 7, ]
douglas [5, ]
exupery [1, ]
fantasy [2, 3, 4, ]
exupery [1, 7, ]
fantasy [2, 3, 4, 6, ]
galaxy [5, ]
guide [5, ]
half [4, ]
harry [4, ]
half [4, 6, ]
harry [4, 6, ]
hitchhiker [5, ]
hobbit [3, ]
in [2, ]
j [0, 3, 4, ]
k [4, ]
j [3, 4, 6, 8, ]
jane [0, ]
k [4, 6, ]
le [1, ]
lewis [2, ]
little [1, ]
potter [4, ]
prejudice [0, ]
pride [0, ]
prince [1, ]
princess [4, ]
little [7, ]
petit [1, ]
potter [4, 6, ]
prejudice [0, 8, ]
pride [0, 8, ]
prince [1, 4, 7, ]
princess [6, ]
r [3, ]
romance [0, ]
rowling [4, ]
romance [0, 8, ]
rowling [4, 6, ]
s [5, ]
saint [1, ]
the [1, 3, 4, 5, ]
saint [1, 7, ]
the [3, 4, 5, 6, 7, ]
to [5, ]
tolkien [3, ]
wonderland [2, ]
Some files were not shown because too many files have changed in this diff.