mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-21 22:00:59 +00:00
Compare commits
224 Commits
tracing-fi
...
v1.7.3
Author | SHA1 | Date | |
---|---|---|---|
414fc14426 | |||
3b8e8b7f1a | |||
f2f1367ec3 | |||
18f17ed728 | |||
d49250358d | |||
567194b925 | |||
bd74cce86a | |||
f85c80d059 | |||
e8516f00c4 | |||
29e71eedc7 | |||
10d053cd2f | |||
a302e258bd | |||
29840473b4 | |||
f4037c1a95 | |||
13cc62728b | |||
f84bcb09e1 | |||
5c95b5c933 | |||
0b7bebeeb6 | |||
d2f77e88bd | |||
1d8c13f595 | |||
7f3c495f5c | |||
abd954755d | |||
f3fc2bd01f | |||
6fa3872268 | |||
6c9823d7bb | |||
e0dac5a22f | |||
b918b55c6b | |||
07b1d0edaf | |||
306b25ad3a | |||
9f7a4fbfeb | |||
5ed7b6a0b2 | |||
ae67d5eef0 | |||
88bc9556a9 | |||
ca4876fd10 | |||
d3a95ea2f6 | |||
69c118ef76 | |||
d44e20aa89 | |||
7b670a4afa | |||
fde209b7b6 | |||
904b82a61d | |||
8ec3e30d2b | |||
0a59cb9734 | |||
f053c280e1 | |||
ee3076d5ba | |||
ab1224bfa7 | |||
eefc1c421e | |||
4d42a7af7c | |||
7408db2a46 | |||
663629a9d6 | |||
15c38dca78 | |||
7ee20b0895 | |||
0c216048b5 | |||
36d17110d8 | |||
bdd428c22e | |||
b130917933 | |||
25f64ce7df | |||
adcd848809 | |||
84ae0cd456 | |||
eee46b7537 | |||
55f60a3638 | |||
c608b3f9b5 | |||
86ce843f3d | |||
b11df7ec34 | |||
6862caef64 | |||
f75c7ac979 | |||
f07069094b | |||
eada6de261 | |||
d3004d8040 | |||
f4a6261dea | |||
9806a3e5f6 | |||
a96b45dda7 | |||
452a343a2b | |||
b87485e80d | |||
147a67dc82 | |||
716ffc07ee | |||
b005eb3289 | |||
9e664d87eb | |||
6dcb5219a0 | |||
5e83bac448 | |||
0562818c2a | |||
a478392b7a | |||
bbf3fb88ca | |||
60510e037b | |||
36c27a18a1 | |||
1eb1c043b5 | |||
507739bd98 | |||
eb25b07390 | |||
938149f814 | |||
066a7a3cde | |||
55796406c5 | |||
eb90f0b4fb | |||
c2e2003a80 | |||
91cdd502f8 | |||
a493a50825 | |||
9d1f489a37 | |||
693ba8dd15 | |||
e1a3eed1eb | |||
05ae291989 | |||
6ba9994916 | |||
01ae46dd80 | |||
12f5389ba7 | |||
9ee4f55e6c | |||
865b415b3f | |||
5ee6aaddc4 | |||
4148d391b8 | |||
88c6165e20 | |||
d097431113 | |||
1f8af81ba9 | |||
5d3bad4120 | |||
d34692e30b | |||
024de0dcf8 | |||
a081da0d90 | |||
78e04520fc | |||
72c1674a31 | |||
03bb6372af | |||
3beda8833d | |||
3b6544db6d | |||
55e942cd45 | |||
48026aa75c | |||
e5e811e2c9 | |||
55de96f74e | |||
82b43e9a7f | |||
15dafde21d | |||
290f6d15e7 | |||
39c83cb3d9 | |||
7efb1cae11 | |||
7877788510 | |||
c02d585f5b | |||
be1b054b05 | |||
023c2d755f | |||
407ad753ed | |||
285aa15d2f | |||
bf43a3f60a | |||
2c88131bb1 | |||
35aa9d5904 | |||
cfb3e6b51f | |||
1502382316 | |||
ef994d84d0 | |||
1b74010e9e | |||
08af0e690c | |||
d71b77f18b | |||
c443ed7e3f | |||
db722d201a | |||
91eb67e981 | |||
902d700a24 | |||
f70a615ed9 | |||
7ff722b72e | |||
bcf7909bba | |||
ceb211c515 | |||
f3c34d5b8c | |||
4de2db6786 | |||
661baa716b | |||
02dcaf07db | |||
d78ada07b5 | |||
bc097d90cb | |||
b393823f36 | |||
e773dfa9ba | |||
f158e96fe7 | |||
e23ec4886d | |||
7793ba67a4 | |||
80774148fd | |||
bf5cea8b10 | |||
38e1c40f38 | |||
afc0585c1c | |||
0e7a411d4d | |||
0f327f2821 | |||
77254765e8 | |||
ce6e6ec2c5 | |||
91a8f74763 | |||
abaa72e2bf | |||
3c3a258a22 | |||
73e66d5a97 | |||
b8da117b9c | |||
5e52107474 | |||
bcf1c4dae5 | |||
50f84d43f5 | |||
f76cc0806e | |||
2f1abd2c03 | |||
dedc91e2cf | |||
a61d8c59ff | |||
6e23040464 | |||
8febbf64ce | |||
b141c82a04 | |||
cc79cd0b04 | |||
256538ccb9 | |||
ca8990394e | |||
83fb2949c3 | |||
6cf703387d | |||
771861599b | |||
7e47cea0c4 | |||
5d7061682e | |||
02e6c8a440 | |||
89401d097b | |||
72ebac1fbb | |||
a616a1d37b | |||
3e120619fa | |||
a1caac9bfb | |||
88d03c56ab | |||
32ee05ccef | |||
74c180267e | |||
517f5332d6 | |||
9ac5750096 | |||
7ae4013478 | |||
fb705116a6 | |||
053306c0e7 | |||
84235a63df | |||
29f8300ac7 | |||
05edd85d75 | |||
9eeb75d501 | |||
4792651462 | |||
58c3501b54 | |||
ff76d8f21a | |||
698ea5139d | |||
880e790bff | |||
fbf5f2a392 | |||
1555870088 | |||
9f8f3105d5 | |||
318843aacd | |||
6d111139b5 | |||
dff2707471 | |||
c57f7f7379 | |||
b968616a99 | |||
c1bf33a112 | |||
ddc2b7129a |
@ -1,2 +1,2 @@
|
||||
[alias]
|
||||
xtask = "run --package xtask --"
|
||||
xtask = "run --release --package xtask --"
|
||||
|
30
.github/workflows/bench-manual.yml
vendored
Normal file
30
.github/workflows/bench-manual.yml
vendored
Normal file
@ -0,0 +1,30 @@
|
||||
name: Bench (manual)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
workload:
|
||||
description: 'The path to the workloads to execute (workloads/...)'
|
||||
required: true
|
||||
default: 'workloads/movies.json'
|
||||
|
||||
env:
|
||||
WORKLOAD_NAME: ${{ github.event.inputs.workload }}
|
||||
|
||||
jobs:
|
||||
benchmarks:
|
||||
name: Run and upload benchmarks
|
||||
runs-on: benchmarks
|
||||
timeout-minutes: 180 # 3h
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }}
|
||||
run: |
|
||||
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME}
|
||||
|
46
.github/workflows/bench-pr.yml
vendored
Normal file
46
.github/workflows/bench-pr.yml
vendored
Normal file
@ -0,0 +1,46 @@
|
||||
name: Bench (PR)
|
||||
on:
|
||||
issue_comment:
|
||||
types: [created]
|
||||
|
||||
permissions:
|
||||
issues: write
|
||||
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
|
||||
|
||||
jobs:
|
||||
run-benchmarks-on-comment:
|
||||
if: startsWith(github.event.comment.body, '/bench')
|
||||
name: Run and upload benchmarks
|
||||
runs-on: benchmarks
|
||||
timeout-minutes: 180 # 3h
|
||||
steps:
|
||||
- name: Check for Command
|
||||
id: command
|
||||
uses: xt0rted/slash-command-action@v2
|
||||
with:
|
||||
command: bench
|
||||
reaction-type: "rocket"
|
||||
repo-token: ${{ env.GH_TOKEN }}
|
||||
|
||||
- uses: xt0rted/pull-request-comment-branch@v2
|
||||
id: comment-branch
|
||||
with:
|
||||
repo_token: ${{ env.GH_TOKEN }}
|
||||
|
||||
- uses: actions/checkout@v3
|
||||
if: success()
|
||||
with:
|
||||
fetch-depth: 0 # fetch full history to be able to get main commit sha
|
||||
ref: ${{ steps.comment-branch.outputs.head_ref }}
|
||||
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
- name: Run benchmarks on PR ${{ github.event.issue.id }}
|
||||
run: |
|
||||
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.url }}) on [#${{github.event.issue.id}}](${{ github.event.issue.url }})" -- ${{ steps.command.outputs.command-arguments }}
|
25
.github/workflows/bench-push-indexing.yml
vendored
Normal file
25
.github/workflows/bench-push-indexing.yml
vendored
Normal file
@ -0,0 +1,25 @@
|
||||
name: Indexing bench (push)
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
jobs:
|
||||
benchmarks:
|
||||
name: Run and upload benchmarks
|
||||
runs-on: benchmarks
|
||||
timeout-minutes: 180 # 3h
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: stable
|
||||
override: true
|
||||
|
||||
# Run benchmarks
|
||||
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }}
|
||||
run: |
|
||||
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json
|
||||
|
2
.github/workflows/publish-docker-images.yml
vendored
2
.github/workflows/publish-docker-images.yml
vendored
@ -97,7 +97,7 @@ jobs:
|
||||
- name: Send CI information to Cloud team
|
||||
# Do not send if nightly build (i.e. 'schedule' or 'workflow_dispatch' event)
|
||||
if: github.event_name == 'push'
|
||||
uses: peter-evans/repository-dispatch@v2
|
||||
uses: peter-evans/repository-dispatch@v3
|
||||
with:
|
||||
token: ${{ secrets.MEILI_BOT_GH_PAT }}
|
||||
repository: meilisearch/meilisearch-cloud
|
||||
|
7
.github/workflows/test-suite.yml
vendored
7
.github/workflows/test-suite.yml
vendored
@ -31,17 +31,10 @@ jobs:
|
||||
apt-get update && apt-get install -y curl
|
||||
apt-get install build-essential -y
|
||||
- name: Setup test with Rust stable
|
||||
if: github.event_name != 'schedule'
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Setup test with Rust nightly
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -9,6 +9,8 @@
|
||||
/data.ms
|
||||
/snapshots
|
||||
/dumps
|
||||
/bench
|
||||
/_xtask_benchmark.ms
|
||||
|
||||
# Snapshots
|
||||
## ... large
|
||||
|
1360
Cargo.lock
generated
1360
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -17,11 +17,11 @@ members = [
|
||||
"benchmarks",
|
||||
"fuzzers",
|
||||
"tracing-trace",
|
||||
"xtask",
|
||||
"xtask", "build-info",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.6.0"
|
||||
version = "1.7.3"
|
||||
authors = [
|
||||
"Quentin de Quelen <quentin@dequelen.me>",
|
||||
"Clément Renault <clement@meilisearch.com>",
|
||||
|
@ -8,7 +8,7 @@ WORKDIR /
|
||||
ARG COMMIT_SHA
|
||||
ARG COMMIT_DATE
|
||||
ARG GIT_TAG
|
||||
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG}
|
||||
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_DESCRIBE=${GIT_TAG}
|
||||
ENV RUSTFLAGS="-C target-feature=-crt-static"
|
||||
|
||||
COPY . .
|
||||
|
@ -41,10 +41,10 @@ Meilisearch helps you shape a delightful search experience in a snap, offering f
|
||||
## ✨ Features
|
||||
|
||||
- **Search-as-you-type:** find search results in less than 50 milliseconds
|
||||
- **[Typo tolerance](https://www.meilisearch.com/docs/learn/getting_started/customizing_relevancy?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features#typo-tolerance):** get relevant matches even when queries contain typos and misspellings
|
||||
- **[Typo tolerance](https://www.meilisearch.com/docs/learn/configuration/typo_tolerance?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** get relevant matches even when queries contain typos and misspellings
|
||||
- **[Filtering](https://www.meilisearch.com/docs/learn/fine_tuning_results/filtering?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features) and [faceted search](https://www.meilisearch.com/docs/learn/fine_tuning_results/faceted_search?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** enhance your users' search experience with custom filters and build a faceted search interface in a few lines of code
|
||||
- **[Sorting](https://www.meilisearch.com/docs/learn/fine_tuning_results/sorting?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** sort results based on price, date, or pretty much anything else your users need
|
||||
- **[Synonym support](https://www.meilisearch.com/docs/learn/getting_started/customizing_relevancy?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features#synonyms):** configure synonyms to include more relevant content in your search results
|
||||
- **[Synonym support](https://www.meilisearch.com/docs/learn/configuration/synonyms?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** configure synonyms to include more relevant content in your search results
|
||||
- **[Geosearch](https://www.meilisearch.com/docs/learn/fine_tuning_results/geosearch?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** filter and sort documents based on geographic data
|
||||
- **[Extensive language support](https://www.meilisearch.com/docs/learn/what_is_meilisearch/language?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** search datasets in any language, with optimized support for Chinese, Japanese, Hebrew, and languages using the Latin alphabet
|
||||
- **[Security management](https://www.meilisearch.com/docs/learn/security/master_api_keys?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=features):** control which users can access what data with API keys that allow fine-grained permissions handling
|
||||
@ -61,8 +61,6 @@ You can consult Meilisearch's documentation at [https://www.meilisearch.com/docs
|
||||
|
||||
For basic instructions on how to set up Meilisearch, add documents to an index, and search for documents, take a look at our [Quick Start](https://www.meilisearch.com/docs/learn/getting_started/quick_start?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=get-started) guide.
|
||||
|
||||
You may also want to check out [Meilisearch 101](https://www.meilisearch.com/docs/learn/getting_started/filtering_and_sorting?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=get-started) for an introduction to some of Meilisearch's most popular features.
|
||||
|
||||
## ⚡ Supercharge your Meilisearch experience
|
||||
|
||||
Say goodbye to server deployment and manual updates with [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). No credit card required.
|
||||
@ -101,7 +99,7 @@ Meilisearch is a search engine created by [Meili](https://www.welcometothejungle
|
||||
|
||||
- For feature requests, please visit our [product repository](https://github.com/meilisearch/product/discussions)
|
||||
- Found a bug? Open an [issue](https://github.com/meilisearch/meilisearch/issues)!
|
||||
- Want to be part of our Discord community? [Join us!](https://discord.gg/meilisearch)
|
||||
- Want to be part of our Discord community? [Join us!](https://discord.meilisearch.com/?utm_campaign=oss&utm_source=github&utm_medium=meilisearch&utm_content=contact)
|
||||
|
||||
Thank you for your support!
|
||||
|
||||
|
18
build-info/Cargo.toml
Normal file
18
build-info/Cargo.toml
Normal file
@ -0,0 +1,18 @@
|
||||
[package]
|
||||
name = "build-info"
|
||||
version.workspace = true
|
||||
authors.workspace = true
|
||||
description.workspace = true
|
||||
homepage.workspace = true
|
||||
readme.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
time = { version = "0.3.34", features = ["parsing"] }
|
||||
|
||||
[build-dependencies]
|
||||
anyhow = "1.0.80"
|
||||
vergen-git2 = "1.0.0-beta.2"
|
22
build-info/build.rs
Normal file
22
build-info/build.rs
Normal file
@ -0,0 +1,22 @@
|
||||
fn main() {
|
||||
if let Err(err) = emit_git_variables() {
|
||||
println!("cargo:warning=vergen: {}", err);
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_git_variables() -> anyhow::Result<()> {
|
||||
// Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them
|
||||
// in the corresponding GitHub workflow (publish_docker.yml).
|
||||
// This is due to the Dockerfile building the binary outside of the git directory.
|
||||
let mut builder = vergen_git2::Git2Builder::default();
|
||||
|
||||
builder.branch(true);
|
||||
builder.commit_timestamp(true);
|
||||
builder.commit_message(true);
|
||||
builder.describe(true, true, None);
|
||||
builder.sha(false);
|
||||
|
||||
let git2 = builder.build()?;
|
||||
|
||||
vergen_git2::Emitter::default().fail_on_error().add_instructions(&git2)?.emit()
|
||||
}
|
203
build-info/src/lib.rs
Normal file
203
build-info/src/lib.rs
Normal file
@ -0,0 +1,203 @@
|
||||
use time::format_description::well_known::Iso8601;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct BuildInfo {
|
||||
pub branch: Option<&'static str>,
|
||||
pub describe: Option<DescribeResult>,
|
||||
pub commit_sha1: Option<&'static str>,
|
||||
pub commit_msg: Option<&'static str>,
|
||||
pub commit_timestamp: Option<time::OffsetDateTime>,
|
||||
}
|
||||
|
||||
impl BuildInfo {
|
||||
pub fn from_build() -> Self {
|
||||
let branch: Option<&'static str> = option_env!("VERGEN_GIT_BRANCH");
|
||||
let describe = DescribeResult::from_build();
|
||||
let commit_sha1 = option_env!("VERGEN_GIT_SHA");
|
||||
let commit_msg = option_env!("VERGEN_GIT_COMMIT_MESSAGE");
|
||||
let commit_timestamp = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP");
|
||||
|
||||
let commit_timestamp = commit_timestamp.and_then(|commit_timestamp| {
|
||||
time::OffsetDateTime::parse(commit_timestamp, &Iso8601::DEFAULT).ok()
|
||||
});
|
||||
|
||||
Self { branch, describe, commit_sha1, commit_msg, commit_timestamp }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum DescribeResult {
|
||||
Prototype { name: &'static str },
|
||||
Release { version: &'static str, major: u64, minor: u64, patch: u64 },
|
||||
Prerelease { version: &'static str, major: u64, minor: u64, patch: u64, rc: u64 },
|
||||
NotATag { describe: &'static str },
|
||||
}
|
||||
|
||||
impl DescribeResult {
|
||||
pub fn new(describe: &'static str) -> Self {
|
||||
if let Some(name) = prototype_name(describe) {
|
||||
Self::Prototype { name }
|
||||
} else if let Some(release) = release_version(describe) {
|
||||
release
|
||||
} else if let Some(prerelease) = prerelease_version(describe) {
|
||||
prerelease
|
||||
} else {
|
||||
Self::NotATag { describe }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_build() -> Option<Self> {
|
||||
let describe: &'static str = option_env!("VERGEN_GIT_DESCRIBE")?;
|
||||
Some(Self::new(describe))
|
||||
}
|
||||
|
||||
pub fn as_tag(&self) -> Option<&'static str> {
|
||||
match self {
|
||||
DescribeResult::Prototype { name } => Some(name),
|
||||
DescribeResult::Release { version, .. } => Some(version),
|
||||
DescribeResult::Prerelease { version, .. } => Some(version),
|
||||
DescribeResult::NotATag { describe: _ } => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_prototype(&self) -> Option<&'static str> {
|
||||
match self {
|
||||
DescribeResult::Prototype { name } => Some(name),
|
||||
DescribeResult::Release { .. }
|
||||
| DescribeResult::Prerelease { .. }
|
||||
| DescribeResult::NotATag { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses the input as a prototype name.
|
||||
///
|
||||
/// Returns `Some(prototype_name)` if the following conditions are met on this value:
|
||||
///
|
||||
/// 1. starts with `prototype-`,
|
||||
/// 2. ends with `-<some_number>`,
|
||||
/// 3. does not end with `<some_number>-<some_number>`.
|
||||
///
|
||||
/// Otherwise, returns `None`.
|
||||
fn prototype_name(describe: &'static str) -> Option<&'static str> {
|
||||
if !describe.starts_with("prototype-") {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut rsplit_prototype = describe.rsplit('-');
|
||||
// last component MUST be a number
|
||||
rsplit_prototype.next()?.parse::<u64>().ok()?;
|
||||
// before than last component SHALL NOT be a number
|
||||
rsplit_prototype.next()?.parse::<u64>().err()?;
|
||||
|
||||
Some(describe)
|
||||
}
|
||||
|
||||
fn release_version(describe: &'static str) -> Option<DescribeResult> {
|
||||
if !describe.starts_with('v') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// full release version don't contain a `-`
|
||||
if describe.contains('-') {
|
||||
return None;
|
||||
}
|
||||
|
||||
// full release version parse as vX.Y.Z, with X, Y, Z numbers.
|
||||
let mut dots = describe[1..].split('.');
|
||||
let major: u64 = dots.next()?.parse().ok()?;
|
||||
let minor: u64 = dots.next()?.parse().ok()?;
|
||||
let patch: u64 = dots.next()?.parse().ok()?;
|
||||
|
||||
if dots.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(DescribeResult::Release { version: describe, major, minor, patch })
|
||||
}
|
||||
|
||||
fn prerelease_version(describe: &'static str) -> Option<DescribeResult> {
|
||||
// prerelease version is in the shape vM.N.P-rc.C
|
||||
let mut hyphen = describe.rsplit('-');
|
||||
let prerelease = hyphen.next()?;
|
||||
if !prerelease.starts_with("rc.") {
|
||||
return None;
|
||||
}
|
||||
|
||||
let rc: u64 = prerelease[3..].parse().ok()?;
|
||||
|
||||
let release = hyphen.next()?;
|
||||
|
||||
let DescribeResult::Release { version: _, major, minor, patch } = release_version(release)?
|
||||
else {
|
||||
return None;
|
||||
};
|
||||
|
||||
Some(DescribeResult::Prerelease { version: describe, major, minor, patch, rc })
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::DescribeResult;
|
||||
|
||||
fn assert_not_a_tag(describe: &'static str) {
|
||||
assert_eq!(DescribeResult::NotATag { describe }, DescribeResult::new(describe))
|
||||
}
|
||||
|
||||
fn assert_proto(describe: &'static str) {
|
||||
assert_eq!(DescribeResult::Prototype { name: describe }, DescribeResult::new(describe))
|
||||
}
|
||||
|
||||
fn assert_release(describe: &'static str, major: u64, minor: u64, patch: u64) {
|
||||
assert_eq!(
|
||||
DescribeResult::Release { version: describe, major, minor, patch },
|
||||
DescribeResult::new(describe)
|
||||
)
|
||||
}
|
||||
|
||||
fn assert_prerelease(describe: &'static str, major: u64, minor: u64, patch: u64, rc: u64) {
|
||||
assert_eq!(
|
||||
DescribeResult::Prerelease { version: describe, major, minor, patch, rc },
|
||||
DescribeResult::new(describe)
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn not_a_tag() {
|
||||
assert_not_a_tag("whatever-fuzzy");
|
||||
assert_not_a_tag("whatever-fuzzy-5-ggg-dirty");
|
||||
assert_not_a_tag("whatever-fuzzy-120-ggg-dirty");
|
||||
|
||||
// technically a tag, but not a proto nor a version, so not parsed as a tag
|
||||
assert_not_a_tag("whatever");
|
||||
|
||||
// dirty version
|
||||
assert_not_a_tag("v1.7.0-1-ggga-dirty");
|
||||
assert_not_a_tag("v1.7.0-rc.1-1-ggga-dirty");
|
||||
|
||||
// after version
|
||||
assert_not_a_tag("v1.7.0-1-ggga");
|
||||
assert_not_a_tag("v1.7.0-rc.1-1-ggga");
|
||||
|
||||
// after proto
|
||||
assert_not_a_tag("protoype-tag-0-1-ggga");
|
||||
assert_not_a_tag("protoype-tag-0-1-ggga-dirty");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prototype() {
|
||||
assert_proto("prototype-tag-0");
|
||||
assert_proto("prototype-tag-10");
|
||||
assert_proto("prototype-long-name-tag-10");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn release() {
|
||||
assert_release("v1.7.2", 1, 7, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn prerelease() {
|
||||
assert_prerelease("v1.7.2-rc.3", 1, 7, 2, 3);
|
||||
}
|
||||
}
|
@ -1,4 +1,3 @@
|
||||
use std::convert::TryInto;
|
||||
use std::str::FromStr;
|
||||
|
||||
use time::OffsetDateTime;
|
||||
|
@ -13,6 +13,7 @@ license.workspace = true
|
||||
[dependencies]
|
||||
tempfile = "3.9.0"
|
||||
thiserror = "1.0.56"
|
||||
tracing = "0.1.40"
|
||||
uuid = { version = "1.6.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::fs::File as StdFile;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
|
||||
@ -22,20 +22,6 @@ pub enum Error {
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
|
||||
impl Deref for File {
|
||||
type Target = NamedTempFile;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.file
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for File {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.file
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FileStore {
|
||||
path: PathBuf,
|
||||
@ -56,7 +42,7 @@ impl FileStore {
|
||||
let file = NamedTempFile::new_in(&self.path)?;
|
||||
let uuid = Uuid::new_v4();
|
||||
let path = self.path.join(uuid.to_string());
|
||||
let update_file = File { file, path };
|
||||
let update_file = File { file: Some(file), path };
|
||||
|
||||
Ok((uuid, update_file))
|
||||
}
|
||||
@ -67,7 +53,7 @@ impl FileStore {
|
||||
let file = NamedTempFile::new_in(&self.path)?;
|
||||
let uuid = Uuid::from_u128(uuid);
|
||||
let path = self.path.join(uuid.to_string());
|
||||
let update_file = File { file, path };
|
||||
let update_file = File { file: Some(file), path };
|
||||
|
||||
Ok((uuid, update_file))
|
||||
}
|
||||
@ -75,7 +61,13 @@ impl FileStore {
|
||||
/// Returns the file corresponding to the requested uuid.
|
||||
pub fn get_update(&self, uuid: Uuid) -> Result<StdFile> {
|
||||
let path = self.get_update_path(uuid);
|
||||
let file = StdFile::open(path)?;
|
||||
let file = match StdFile::open(path) {
|
||||
Ok(file) => file,
|
||||
Err(e) => {
|
||||
tracing::error!("Can't access update file {uuid}: {e}");
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
Ok(file)
|
||||
}
|
||||
|
||||
@ -110,8 +102,12 @@ impl FileStore {
|
||||
|
||||
pub fn delete(&self, uuid: Uuid) -> Result<()> {
|
||||
let path = self.path.join(uuid.to_string());
|
||||
std::fs::remove_file(path)?;
|
||||
Ok(())
|
||||
if let Err(e) = std::fs::remove_file(path) {
|
||||
tracing::error!("Can't delete file {uuid}: {e}");
|
||||
Err(e.into())
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// List the Uuids of the files in the FileStore
|
||||
@ -136,16 +132,40 @@ impl FileStore {
|
||||
|
||||
pub struct File {
|
||||
path: PathBuf,
|
||||
file: NamedTempFile,
|
||||
file: Option<NamedTempFile>,
|
||||
}
|
||||
|
||||
impl File {
|
||||
pub fn dry_file() -> Result<Self> {
|
||||
Ok(Self { path: PathBuf::new(), file: None })
|
||||
}
|
||||
|
||||
pub fn persist(self) -> Result<()> {
|
||||
self.file.persist(&self.path)?;
|
||||
if let Some(file) = self.file {
|
||||
file.persist(&self.path)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for File {
|
||||
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
if let Some(file) = self.file.as_mut() {
|
||||
file.write(buf)
|
||||
} else {
|
||||
Ok(buf.len())
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
if let Some(file) = self.file.as_mut() {
|
||||
file.flush()
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::io::Write;
|
||||
|
@ -23,6 +23,7 @@ meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
page_size = "0.5.0"
|
||||
puffin = { version = "0.16.0", features = ["serialization"] }
|
||||
rayon = "1.8.1"
|
||||
roaring = { version = "0.10.2", features = ["serde"] }
|
||||
serde = { version = "1.0.195", features = ["derive"] }
|
||||
serde_json = { version = "1.0.111", features = ["preserve_order"] }
|
||||
|
@ -142,22 +142,28 @@ pub(crate) enum IndexOperation {
|
||||
|
||||
impl Batch {
|
||||
/// Return the task ids associated with this batch.
|
||||
pub fn ids(&self) -> Vec<TaskId> {
|
||||
pub fn ids(&self) -> RoaringBitmap {
|
||||
match self {
|
||||
Batch::TaskCancelation { task, .. }
|
||||
| Batch::Dump(task)
|
||||
| Batch::IndexCreation { task, .. }
|
||||
| Batch::IndexUpdate { task, .. } => vec![task.uid],
|
||||
| Batch::IndexUpdate { task, .. } => {
|
||||
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
|
||||
}
|
||||
Batch::SnapshotCreation(tasks)
|
||||
| Batch::TaskDeletions(tasks)
|
||||
| Batch::IndexDeletion { tasks, .. } => tasks.iter().map(|task| task.uid).collect(),
|
||||
| Batch::IndexDeletion { tasks, .. } => {
|
||||
RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid))
|
||||
}
|
||||
Batch::IndexOperation { op, .. } => match op {
|
||||
IndexOperation::DocumentOperation { tasks, .. }
|
||||
| IndexOperation::Settings { tasks, .. }
|
||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid))
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => {
|
||||
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
|
||||
IndexOperation::SettingsAndDocumentOperation {
|
||||
document_import_tasks: tasks,
|
||||
settings_tasks: other,
|
||||
@ -167,9 +173,11 @@ impl Batch {
|
||||
cleared_tasks: tasks,
|
||||
settings_tasks: other,
|
||||
..
|
||||
} => tasks.iter().chain(other).map(|task| task.uid).collect(),
|
||||
} => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)),
|
||||
},
|
||||
Batch::IndexSwap { task } => vec![task.uid],
|
||||
Batch::IndexSwap { task } => {
|
||||
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -48,6 +48,8 @@ impl From<DateField> for Code {
|
||||
pub enum Error {
|
||||
#[error("{1}")]
|
||||
WithCustomErrorCode(Code, Box<Self>),
|
||||
#[error("Received bad task id: {received} should be >= to {expected}.")]
|
||||
BadTaskId { received: TaskId, expected: TaskId },
|
||||
#[error("Index `{0}` not found.")]
|
||||
IndexNotFound(String),
|
||||
#[error("Index `{0}` already exists.")]
|
||||
@ -161,6 +163,7 @@ impl Error {
|
||||
match self {
|
||||
Error::IndexNotFound(_)
|
||||
| Error::WithCustomErrorCode(_, _)
|
||||
| Error::BadTaskId { .. }
|
||||
| Error::IndexAlreadyExists(_)
|
||||
| Error::SwapDuplicateIndexFound(_)
|
||||
| Error::SwapDuplicateIndexesFound(_)
|
||||
@ -205,6 +208,7 @@ impl ErrorCode for Error {
|
||||
fn error_code(&self) -> Code {
|
||||
match self {
|
||||
Error::WithCustomErrorCode(code, _) => *code,
|
||||
Error::BadTaskId { .. } => Code::BadRequest,
|
||||
Error::IndexNotFound(_) => Code::IndexNotFound,
|
||||
Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists,
|
||||
Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound,
|
||||
|
@ -30,19 +30,6 @@ impl RoFeatures {
|
||||
self.runtime
|
||||
}
|
||||
|
||||
pub fn check_score_details(&self) -> Result<()> {
|
||||
if self.runtime.score_details {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Computing score details",
|
||||
feature: "score details",
|
||||
issue_link: "https://github.com/meilisearch/product/discussions/674",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_metrics(&self) -> Result<()> {
|
||||
if self.runtime.metrics {
|
||||
Ok(())
|
||||
@ -61,7 +48,7 @@ impl RoFeatures {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "getting logs through the `/logs/stream` route",
|
||||
disabled_action: "Modifying logs through the `/logs/*` routes",
|
||||
feature: "logs route",
|
||||
issue_link: "https://github.com/orgs/meilisearch/discussions/721",
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
|
||||
|
||||
let IndexScheduler {
|
||||
autobatching_enabled,
|
||||
cleanup_enabled: _,
|
||||
must_stop_processing: _,
|
||||
processing_tasks,
|
||||
file_store,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,90 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
[
|
||||
{
|
||||
"uid": 0,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "succeeded",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 1,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": {
|
||||
"message": "Index `doggo` already exists.",
|
||||
"code": "index_already_exists",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#index_already_exists"
|
||||
},
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "failed",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 2,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "enqueued",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 3,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "enqueued",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
@ -0,0 +1,90 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
[
|
||||
{
|
||||
"uid": 0,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "succeeded",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 1,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": {
|
||||
"message": "Index `doggo` already exists.",
|
||||
"code": "index_already_exists",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#index_already_exists"
|
||||
},
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "failed",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 2,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "enqueued",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"uid": 3,
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]",
|
||||
"error": null,
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"IndexInfo": {
|
||||
"primary_key": null
|
||||
}
|
||||
},
|
||||
"status": "enqueued",
|
||||
"kind": {
|
||||
"indexCreation": {
|
||||
"index_uid": "doggo",
|
||||
"primary_key": null
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
@ -1,5 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use uuid::Uuid;
|
||||
|
@ -1,7 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Reverse;
|
||||
use std::collections::HashSet;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::create_dir_all;
|
||||
use std::path::Path;
|
||||
use std::result::Result as StdResult;
|
||||
|
@ -54,3 +54,5 @@ thai = ["milli/thai"]
|
||||
greek = ["milli/greek"]
|
||||
# allow khmer specialized tokenization
|
||||
khmer = ["milli/khmer"]
|
||||
# allow vietnamese specialized tokenization
|
||||
vietnamese = ["milli/vietnamese"]
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::fmt::{self, Debug, Display};
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek, Write};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use memmap2::MmapOptions;
|
||||
@ -104,8 +104,8 @@ impl ErrorCode for DocumentFormatError {
|
||||
}
|
||||
|
||||
/// Reads CSV from input and writes an obkv batch to writer.
|
||||
pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||
pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref());
|
||||
builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?;
|
||||
@ -116,9 +116,9 @@ pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result
|
||||
Ok(count as u64)
|
||||
}
|
||||
|
||||
/// Reads JSON from temporary file and writes an obkv batch to writer.
|
||||
pub fn read_json(file: &File, writer: impl Write + Seek) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||
/// Reads JSON from temporary file and writes an obkv batch to writer.
|
||||
pub fn read_json(file: &File, writer: impl Write) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
let mut deserializer = serde_json::Deserializer::from_slice(&mmap);
|
||||
|
||||
@ -151,8 +151,8 @@ pub fn read_json(file: &File, writer: impl Write + Seek) -> Result<u64> {
|
||||
}
|
||||
|
||||
/// Reads NDJSON from temporary file and writes an obkv batch to writer.
|
||||
pub fn read_ndjson(file: &File, writer: impl Write + Seek) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(writer);
|
||||
pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
|
||||
for result in serde_json::Deserializer::from_slice(&mmap).into_iter() {
|
||||
|
@ -349,6 +349,9 @@ impl ErrorCode for milli::Error {
|
||||
UserError::InvalidFieldForSource { .. }
|
||||
| UserError::MissingFieldForSource { .. }
|
||||
| UserError::InvalidOpenAiModel { .. }
|
||||
| UserError::InvalidOpenAiModelDimensions { .. }
|
||||
| UserError::InvalidOpenAiModelDimensionsMax { .. }
|
||||
| UserError::InvalidSettingsDimensions { .. }
|
||||
| UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders,
|
||||
|
@ -3,7 +3,6 @@ use serde::{Deserialize, Serialize};
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq)]
|
||||
#[serde(rename_all = "camelCase", default)]
|
||||
pub struct RuntimeTogglableFeatures {
|
||||
pub score_details: bool,
|
||||
pub vector_store: bool,
|
||||
pub metrics: bool,
|
||||
pub logs_route: bool,
|
||||
|
@ -104,9 +104,10 @@ serde_urlencoded = "0.7.1"
|
||||
termcolor = "1.4.1"
|
||||
url = { version = "2.5.0", features = ["serde"] }
|
||||
tracing = "0.1.40"
|
||||
tracing-subscriber = "0.3.18"
|
||||
tracing-subscriber = { version = "0.3.18", features = ["json"] }
|
||||
tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
|
||||
tracing-actix-web = "0.7.9"
|
||||
build-info = { version = "1.7.0", path = "../build-info" }
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.9.0"
|
||||
@ -131,7 +132,6 @@ reqwest = { version = "0.11.23", features = [
|
||||
sha-1 = { version = "0.10.1", optional = true }
|
||||
static-files = { version = "0.2.3", optional = true }
|
||||
tempfile = { version = "3.9.0", optional = true }
|
||||
vergen = { version = "7.5.1", default-features = false, features = ["git"] }
|
||||
zip = { version = "0.6.6", optional = true }
|
||||
|
||||
[features]
|
||||
@ -154,7 +154,8 @@ japanese = ["meilisearch-types/japanese"]
|
||||
thai = ["meilisearch-types/thai"]
|
||||
greek = ["meilisearch-types/greek"]
|
||||
khmer = ["meilisearch-types/khmer"]
|
||||
vietnamese = ["meilisearch-types/vietnamese"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.12/build.zip"
|
||||
sha1 = "acfe9a018c93eb0604ea87ee87bff7df5474e18e"
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
|
||||
sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"
|
||||
|
@ -1,17 +1,4 @@
|
||||
use vergen::{vergen, Config, SemverKind};
|
||||
|
||||
fn main() {
|
||||
// Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them
|
||||
// in the corresponding GitHub workflow (publish_docker.yml).
|
||||
// This is due to the Dockerfile building the binary outside of the git directory.
|
||||
let mut config = Config::default();
|
||||
// allow using non-annotated tags
|
||||
*config.git_mut().semver_kind_mut() = SemverKind::Lightweight;
|
||||
|
||||
if let Err(e) = vergen(config) {
|
||||
println!("cargo:warning=vergen: {}", e);
|
||||
}
|
||||
|
||||
#[cfg(feature = "mini-dashboard")]
|
||||
mini_dashboard::setup_mini_dashboard().expect("Could not load the mini-dashboard assets");
|
||||
}
|
||||
|
@ -28,7 +28,9 @@ use super::{
|
||||
config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH,
|
||||
};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot};
|
||||
use crate::option::{
|
||||
default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot,
|
||||
};
|
||||
use crate::routes::indexes::documents::UpdateDocumentsQuery;
|
||||
use crate::routes::indexes::facet_search::FacetSearchQuery;
|
||||
use crate::routes::tasks::TasksFilterQuery;
|
||||
@ -250,9 +252,12 @@ impl super::Analytics for SegmentAnalytics {
|
||||
struct Infos {
|
||||
env: String,
|
||||
experimental_enable_metrics: bool,
|
||||
experimental_logs_mode: LogMode,
|
||||
experimental_replication_parameters: bool,
|
||||
experimental_enable_logs_route: bool,
|
||||
experimental_reduce_indexing_memory_usage: bool,
|
||||
experimental_max_number_of_batched_tasks: usize,
|
||||
gpu_enabled: bool,
|
||||
db_path: bool,
|
||||
import_dump: bool,
|
||||
dump_dir: bool,
|
||||
@ -288,6 +293,8 @@ impl From<Opt> for Infos {
|
||||
let Opt {
|
||||
db_path,
|
||||
experimental_enable_metrics,
|
||||
experimental_logs_mode,
|
||||
experimental_replication_parameters,
|
||||
experimental_enable_logs_route,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
@ -335,8 +342,11 @@ impl From<Opt> for Infos {
|
||||
Self {
|
||||
env,
|
||||
experimental_enable_metrics,
|
||||
experimental_logs_mode,
|
||||
experimental_replication_parameters,
|
||||
experimental_enable_logs_route,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
|
||||
db_path: db_path != PathBuf::from("./data.ms"),
|
||||
import_dump: import_dump.is_some(),
|
||||
dump_dir: dump_dir != PathBuf::from("dumps/"),
|
||||
@ -463,7 +473,9 @@ impl Segment {
|
||||
create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default())
|
||||
{
|
||||
// Replace the version number with the prototype name if any.
|
||||
let version = if let Some(prototype) = crate::prototype_name() {
|
||||
let version = if let Some(prototype) = build_info::DescribeResult::from_build()
|
||||
.and_then(|describe| describe.as_prototype())
|
||||
{
|
||||
prototype
|
||||
} else {
|
||||
env!("CARGO_PKG_VERSION")
|
||||
|
@ -131,6 +131,7 @@ gen_seq! { SeqFromRequestFut3; A B C }
|
||||
gen_seq! { SeqFromRequestFut4; A B C D }
|
||||
gen_seq! { SeqFromRequestFut5; A B C D E }
|
||||
gen_seq! { SeqFromRequestFut6; A B C D E F }
|
||||
gen_seq! { SeqFromRequestFut7; A B C D E F G }
|
||||
|
||||
pin_project! {
|
||||
#[project = ExtractProj]
|
||||
|
@ -38,7 +38,7 @@ use meilisearch_types::versioning::{check_version_file, create_version_file};
|
||||
use meilisearch_types::{compression, milli, VERSION_FILE_NAME};
|
||||
pub use option::Opt;
|
||||
use option::ScheduleSnapshot;
|
||||
use tracing::error;
|
||||
use tracing::{error, info_span};
|
||||
use tracing_subscriber::filter::Targets;
|
||||
|
||||
use crate::error::MeilisearchHttpError;
|
||||
@ -97,11 +97,25 @@ pub type LogRouteType = tracing_subscriber::filter::Filtered<
|
||||
tracing_subscriber::Registry,
|
||||
>;
|
||||
|
||||
pub type SubscriberForSecondLayer = tracing_subscriber::layer::Layered<
|
||||
tracing_subscriber::reload::Layer<LogRouteType, tracing_subscriber::Registry>,
|
||||
tracing_subscriber::Registry,
|
||||
>;
|
||||
|
||||
pub type LogStderrHandle =
|
||||
tracing_subscriber::reload::Handle<LogStderrType, SubscriberForSecondLayer>;
|
||||
|
||||
pub type LogStderrType = tracing_subscriber::filter::Filtered<
|
||||
Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
|
||||
Targets,
|
||||
SubscriberForSecondLayer,
|
||||
>;
|
||||
|
||||
pub fn create_app(
|
||||
index_scheduler: Data<IndexScheduler>,
|
||||
auth_controller: Data<AuthController>,
|
||||
opt: Opt,
|
||||
logs: LogRouteHandle,
|
||||
logs: (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Arc<dyn Analytics>,
|
||||
enable_dashboard: bool,
|
||||
) -> actix_web::App<
|
||||
@ -136,11 +150,49 @@ pub fn create_app(
|
||||
.allow_any_method()
|
||||
.max_age(86_400), // 24h
|
||||
)
|
||||
.wrap(tracing_actix_web::TracingLogger::default())
|
||||
.wrap(tracing_actix_web::TracingLogger::<AwebTracingLogger>::new())
|
||||
.wrap(actix_web::middleware::Compress::default())
|
||||
.wrap(actix_web::middleware::NormalizePath::new(actix_web::middleware::TrailingSlash::Trim))
|
||||
}
|
||||
|
||||
struct AwebTracingLogger;
|
||||
|
||||
impl tracing_actix_web::RootSpanBuilder for AwebTracingLogger {
|
||||
fn on_request_start(request: &actix_web::dev::ServiceRequest) -> tracing::Span {
|
||||
use tracing::field::Empty;
|
||||
|
||||
let conn_info = request.connection_info();
|
||||
let headers = request.headers();
|
||||
let user_agent = headers
|
||||
.get(http::header::USER_AGENT)
|
||||
.map(|value| String::from_utf8_lossy(value.as_bytes()).into_owned())
|
||||
.unwrap_or_default();
|
||||
info_span!("HTTP request", method = %request.method(), host = conn_info.host(), route = %request.path(), query_parameters = %request.query_string(), %user_agent, status_code = Empty, error = Empty)
|
||||
}
|
||||
|
||||
fn on_request_end<B: MessageBody>(
|
||||
span: tracing::Span,
|
||||
outcome: &Result<ServiceResponse<B>, actix_web::Error>,
|
||||
) {
|
||||
match &outcome {
|
||||
Ok(response) => {
|
||||
let code: i32 = response.response().status().as_u16().into();
|
||||
span.record("status_code", code);
|
||||
|
||||
if let Some(error) = response.response().error() {
|
||||
// use the status code already constructed for the outgoing HTTP response
|
||||
span.record("error", &tracing::field::display(error.as_response_error()));
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let code: i32 = error.error_response().status().as_u16().into();
|
||||
span.record("status_code", code);
|
||||
span.record("error", &tracing::field::display(error.as_response_error()));
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
enum OnFailure {
|
||||
RemoveDb,
|
||||
KeepDb,
|
||||
@ -213,7 +265,9 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc<IndexScheduler>, Arc<
|
||||
.name(String::from("register-snapshot-tasks"))
|
||||
.spawn(move || loop {
|
||||
thread::sleep(snapshot_delay);
|
||||
if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation) {
|
||||
if let Err(e) =
|
||||
index_scheduler.register(KindWithContent::SnapshotCreation, None, false)
|
||||
{
|
||||
error!("Error while registering snapshot: {}", e);
|
||||
}
|
||||
})
|
||||
@ -248,6 +302,7 @@ fn open_or_create_database_unchecked(
|
||||
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
|
||||
indexer_config: (&opt.indexer_options).try_into()?,
|
||||
autobatching_enabled: true,
|
||||
cleanup_enabled: !opt.experimental_replication_parameters,
|
||||
max_number_of_tasks: 1_000_000,
|
||||
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
|
||||
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
|
||||
@ -371,6 +426,9 @@ fn import_dump(
|
||||
let reader = BufReader::new(file);
|
||||
let reader = DocumentsBatchReader::from_reader(reader)?;
|
||||
|
||||
let embedder_configs = index.embedding_configs(&wtxn)?;
|
||||
let embedders = index_scheduler.embedders(embedder_configs)?;
|
||||
|
||||
let builder = milli::update::IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
@ -383,6 +441,8 @@ fn import_dump(
|
||||
|| false,
|
||||
)?;
|
||||
|
||||
let builder = builder.with_embedders(embedders);
|
||||
|
||||
let (builder, user_result) = builder.add_documents(reader)?;
|
||||
let user_result = user_result?;
|
||||
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
|
||||
@ -406,7 +466,7 @@ pub fn configure_data(
|
||||
index_scheduler: Data<IndexScheduler>,
|
||||
auth: Data<AuthController>,
|
||||
opt: &Opt,
|
||||
logs: LogRouteHandle,
|
||||
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Arc<dyn Analytics>,
|
||||
) {
|
||||
let http_payload_size_limit = opt.http_payload_size_limit.get_bytes() as usize;
|
||||
@ -414,7 +474,9 @@ pub fn configure_data(
|
||||
.app_data(index_scheduler)
|
||||
.app_data(auth)
|
||||
.app_data(web::Data::from(analytics))
|
||||
.app_data(web::Data::new(logs))
|
||||
.app_data(web::Data::new(logs_route))
|
||||
.app_data(web::Data::new(logs_stderr))
|
||||
.app_data(web::Data::new(opt.clone()))
|
||||
.app_data(
|
||||
web::JsonConfig::default()
|
||||
.limit(http_payload_size_limit)
|
||||
@ -474,30 +536,3 @@ pub fn dashboard(config: &mut web::ServiceConfig, enable_frontend: bool) {
|
||||
pub fn dashboard(config: &mut web::ServiceConfig, _enable_frontend: bool) {
|
||||
config.service(web::resource("/").route(web::get().to(routes::running)));
|
||||
}
|
||||
|
||||
/// Interprets the compile-time value of
/// [`VERGEN_GIT_SEMVER_LIGHTWEIGHT`](https://docs.rs/vergen/latest/vergen/struct.Git.html#instructions)
/// as a prototype name.
///
/// The value is returned as `Some(prototype_name)` only when it:
///
/// 1. starts with `prototype-`,
/// 2. ends with `-<some_number>`,
/// 3. does not end with `<some_number>-<some_number>`.
///
/// In every other case, including when the variable was not set at compile
/// time, `None` is returned.
pub fn prototype_name() -> Option<&'static str> {
    let candidate: &'static str = option_env!("VERGEN_GIT_SEMVER_LIGHTWEIGHT")?;

    if !candidate.starts_with("prototype-") {
        return None;
    }

    // Walk the `-`-separated components from the right.
    let mut components = candidate.rsplit('-');
    let last = components.next()?;
    let before_last = components.next()?;

    // the last component MUST be a number
    let last_is_number = last.parse::<u64>().is_ok();
    // the component before the last one MUST NOT be a number
    let before_last_is_number = before_last.parse::<u64>().is_ok();

    if last_is_number && !before_last_is_number {
        Some(candidate)
    } else {
        None
    }
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::env;
|
||||
use std::io::{stderr, Write};
|
||||
use std::io::{stderr, LineWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@ -10,8 +10,10 @@ use actix_web::HttpServer;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use is_terminal::IsTerminal;
|
||||
use meilisearch::analytics::Analytics;
|
||||
use meilisearch::option::LogMode;
|
||||
use meilisearch::{
|
||||
analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, Opt,
|
||||
analytics, create_app, setup_meilisearch, LogRouteHandle, LogRouteType, LogStderrHandle,
|
||||
LogStderrType, Opt, SubscriberForSecondLayer,
|
||||
};
|
||||
use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE};
|
||||
use mimalloc::MiMalloc;
|
||||
@ -23,28 +25,44 @@ use tracing_subscriber::Layer;
|
||||
#[global_allocator]
|
||||
static ALLOC: MiMalloc = MiMalloc;
|
||||
|
||||
fn default_layer() -> LogRouteType {
|
||||
fn default_log_route_layer() -> LogRouteType {
|
||||
None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF))
|
||||
}
|
||||
|
||||
fn default_log_stderr_layer(opt: &Opt) -> LogStderrType {
|
||||
let layer = tracing_subscriber::fmt::layer()
|
||||
.with_writer(|| LineWriter::new(std::io::stderr()))
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
|
||||
|
||||
let layer = match opt.experimental_logs_mode {
|
||||
LogMode::Human => Box::new(layer)
|
||||
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
|
||||
LogMode::Json => Box::new(layer.json())
|
||||
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
|
||||
};
|
||||
|
||||
layer.with_filter(
|
||||
tracing_subscriber::filter::Targets::new()
|
||||
.with_target("", LevelFilter::from_str(&opt.log_level.to_string()).unwrap()),
|
||||
)
|
||||
}
|
||||
|
||||
/// does all the setup before meilisearch is launched
|
||||
fn setup(opt: &Opt) -> anyhow::Result<LogRouteHandle> {
|
||||
let (route_layer, route_layer_handle) = tracing_subscriber::reload::Layer::new(default_layer());
|
||||
fn setup(opt: &Opt) -> anyhow::Result<(LogRouteHandle, LogStderrHandle)> {
|
||||
let (route_layer, route_layer_handle) =
|
||||
tracing_subscriber::reload::Layer::new(default_log_route_layer());
|
||||
let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer;
|
||||
|
||||
let subscriber = tracing_subscriber::registry().with(route_layer).with(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::NEW)
|
||||
.with_filter(
|
||||
tracing_subscriber::filter::LevelFilter::from_str(&opt.log_level.to_string())
|
||||
.unwrap(),
|
||||
),
|
||||
);
|
||||
let (stderr_layer, stderr_layer_handle) =
|
||||
tracing_subscriber::reload::Layer::new(default_log_stderr_layer(opt));
|
||||
let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer;
|
||||
|
||||
let subscriber = tracing_subscriber::registry().with(route_layer).with(stderr_layer);
|
||||
|
||||
// set the subscriber as the default for the application
|
||||
tracing::subscriber::set_global_default(subscriber).unwrap();
|
||||
|
||||
Ok(route_layer_handle)
|
||||
Ok((route_layer_handle, stderr_layer_handle))
|
||||
}
|
||||
|
||||
fn on_panic(info: &std::panic::PanicInfo) {
|
||||
@ -110,7 +128,7 @@ async fn run_http(
|
||||
index_scheduler: Arc<IndexScheduler>,
|
||||
auth_controller: Arc<AuthController>,
|
||||
opt: Opt,
|
||||
logs: LogRouteHandle,
|
||||
logs: (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Arc<dyn Analytics>,
|
||||
) -> anyhow::Result<()> {
|
||||
let enable_dashboard = &opt.env == "development";
|
||||
@ -145,8 +163,8 @@ pub fn print_launch_resume(
|
||||
analytics: Arc<dyn Analytics>,
|
||||
config_read_from: Option<PathBuf>,
|
||||
) {
|
||||
let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown");
|
||||
let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown");
|
||||
let build_info = build_info::BuildInfo::from_build();
|
||||
|
||||
let protocol =
|
||||
if opt.ssl_cert_path.is_some() && opt.ssl_key_path.is_some() { "https" } else { "http" };
|
||||
let ascii_name = r#"
|
||||
@ -171,10 +189,18 @@ pub fn print_launch_resume(
|
||||
eprintln!("Database path:\t\t{:?}", opt.db_path);
|
||||
eprintln!("Server listening on:\t\"{}://{}\"", protocol, opt.http_addr);
|
||||
eprintln!("Environment:\t\t{:?}", opt.env);
|
||||
eprintln!("Commit SHA:\t\t{:?}", commit_sha.to_string());
|
||||
eprintln!("Commit date:\t\t{:?}", commit_date.to_string());
|
||||
eprintln!("Commit SHA:\t\t{:?}", build_info.commit_sha1.unwrap_or("unknown"));
|
||||
eprintln!(
|
||||
"Commit date:\t\t{:?}",
|
||||
build_info
|
||||
.commit_timestamp
|
||||
.and_then(|commit_timestamp| commit_timestamp
|
||||
.format(&time::format_description::well_known::Rfc3339)
|
||||
.ok())
|
||||
.unwrap_or("unknown".into())
|
||||
);
|
||||
eprintln!("Package version:\t{:?}", env!("CARGO_PKG_VERSION").to_string());
|
||||
if let Some(prototype) = prototype_name() {
|
||||
if let Some(prototype) = build_info.describe.and_then(|describe| describe.as_prototype()) {
|
||||
eprintln!("Prototype:\t\t{:?}", prototype);
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,3 @@
|
||||
use std::convert::TryFrom;
|
||||
use std::env::VarError;
|
||||
use std::ffi::OsStr;
|
||||
use std::fmt::Display;
|
||||
@ -51,6 +50,8 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP";
|
||||
const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS";
|
||||
const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR";
|
||||
const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
|
||||
const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE";
|
||||
const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS";
|
||||
const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE";
|
||||
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
|
||||
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
|
||||
@ -79,6 +80,39 @@ const DEFAULT_LOG_EVERY_N: usize = 100_000;
|
||||
pub const INDEX_SIZE: u64 = 2 * 1024 * 1024 * 1024 * 1024; // 2 TiB
|
||||
pub const TASK_DB_SIZE: u64 = 20 * 1024 * 1024 * 1024; // 20 GiB
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "UPPERCASE")]
|
||||
pub enum LogMode {
|
||||
#[default]
|
||||
Human,
|
||||
Json,
|
||||
}
|
||||
|
||||
impl Display for LogMode {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
LogMode::Human => Display::fmt("HUMAN", f),
|
||||
LogMode::Json => Display::fmt("JSON", f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for LogMode {
|
||||
type Err = LogModeError;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s.trim().to_lowercase().as_str() {
|
||||
"human" => Ok(LogMode::Human),
|
||||
"json" => Ok(LogMode::Json),
|
||||
_ => Err(LogModeError(s.to_owned())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("Unsupported log mode level `{0}`. Supported values are `HUMAN` and `JSON`.")]
|
||||
pub struct LogModeError(String);
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "UPPERCASE")]
|
||||
pub enum LogLevel {
|
||||
@ -310,13 +344,30 @@ pub struct Opt {
|
||||
#[serde(default)]
|
||||
pub experimental_enable_metrics: bool,
|
||||
|
||||
/// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
|
||||
///
|
||||
/// Change the mode of the logs on the console.
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)]
|
||||
#[serde(default)]
|
||||
pub experimental_logs_mode: LogMode,
|
||||
|
||||
/// Experimental logs route feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/721>
|
||||
///
|
||||
/// Enables the log route on the `POST /logs/stream` endpoint and the `DELETE /logs/stream` to stop receiving logs.
|
||||
/// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, and the `DELETE /logs/stream` to stop receiving logs.
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE)]
|
||||
#[serde(default)]
|
||||
pub experimental_enable_logs_route: bool,
|
||||
|
||||
/// Enable multiple features that helps you to run meilisearch in a replicated context.
|
||||
/// For more information, see: <https://github.com/orgs/meilisearch/discussions/725>
|
||||
///
|
||||
/// - /!\ Disable the automatic clean up of old processed tasks, you're in charge of that now
|
||||
/// - Lets you specify a custom task ID upon registering a task
|
||||
/// - Lets you execute dry-register a task (get an answer from the route but nothing is actually registered in meilisearch and it won't be processed)
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS)]
|
||||
#[serde(default)]
|
||||
pub experimental_replication_parameters: bool,
|
||||
|
||||
/// Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)]
|
||||
#[serde(default)]
|
||||
@ -422,7 +473,9 @@ impl Opt {
|
||||
#[cfg(feature = "analytics")]
|
||||
no_analytics,
|
||||
experimental_enable_metrics,
|
||||
experimental_logs_mode,
|
||||
experimental_enable_logs_route,
|
||||
experimental_replication_parameters,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
} = self;
|
||||
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
|
||||
@ -479,6 +532,14 @@ impl Opt {
|
||||
MEILI_EXPERIMENTAL_ENABLE_METRICS,
|
||||
experimental_enable_metrics.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_LOGS_MODE,
|
||||
experimental_logs_mode.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS,
|
||||
experimental_replication_parameters.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE,
|
||||
experimental_enable_logs_route.to_string(),
|
||||
|
@ -10,7 +10,7 @@ use meilisearch_types::deserr::query_params::Param;
|
||||
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::error::{Code, ResponseError};
|
||||
use meilisearch_types::keys::{Action, CreateApiKey, Key, PatchApiKey};
|
||||
use meilisearch_types::keys::{CreateApiKey, Key, PatchApiKey};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use time::OffsetDateTime;
|
||||
use uuid::Uuid;
|
||||
|
@ -11,7 +11,8 @@ use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::SummarizedTaskView;
|
||||
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::Opt;
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
|
||||
@ -21,6 +22,7 @@ pub async fn create_dump(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
|
||||
auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.publish("Dump Created".to_string(), json!({}), Some(&req));
|
||||
@ -29,8 +31,12 @@ pub async fn create_dump(
|
||||
keys: auth_controller.list_keys()?,
|
||||
instance_uid: analytics.instance_uid().cloned(),
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Create dump");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
|
@ -41,8 +41,6 @@ async fn get_features(
|
||||
#[derive(Debug, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct RuntimeTogglableFeatures {
|
||||
#[deserr(default)]
|
||||
pub score_details: Option<bool>,
|
||||
#[deserr(default)]
|
||||
pub vector_store: Option<bool>,
|
||||
#[deserr(default)]
|
||||
@ -67,7 +65,6 @@ async fn patch_features(
|
||||
|
||||
let old_features = features.runtime_features();
|
||||
let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
|
||||
score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
|
||||
vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
|
||||
metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
|
||||
logs_route: new_features.0.logs_route.unwrap_or(old_features.logs_route),
|
||||
@ -81,7 +78,6 @@ async fn patch_features(
|
||||
// the it renames to camelCase, which we don't want for analytics.
|
||||
// **Do not** ignore fields with `..` or `_` here, because we want to add them in the future.
|
||||
let meilisearch_types::features::RuntimeTogglableFeatures {
|
||||
score_details,
|
||||
vector_store,
|
||||
metrics,
|
||||
logs_route,
|
||||
@ -91,7 +87,6 @@ async fn patch_features(
|
||||
analytics.publish(
|
||||
"Experimental features Updated".to_string(),
|
||||
json!({
|
||||
"score_details": score_details,
|
||||
"vector_store": vector_store,
|
||||
"metrics": metrics,
|
||||
"logs_route": logs_route,
|
||||
|
@ -7,7 +7,7 @@ use bstr::ByteSlice as _;
|
||||
use deserr::actix_web::{AwebJson, AwebQueryParameter};
|
||||
use deserr::Deserr;
|
||||
use futures::StreamExt;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use index_scheduler::{IndexScheduler, TaskId};
|
||||
use meilisearch_types::deserr::query_params::Param;
|
||||
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
|
||||
use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType};
|
||||
@ -36,8 +36,11 @@ use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::payload::Payload;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::{PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
||||
use crate::routes::{
|
||||
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
|
||||
};
|
||||
use crate::search::parse_filter;
|
||||
use crate::Opt;
|
||||
|
||||
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
|
||||
vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
|
||||
@ -119,6 +122,7 @@ pub async fn delete_document(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
||||
path: web::Path<DocumentParam>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let DocumentParam { index_uid, document_id } = path.into_inner();
|
||||
@ -130,9 +134,13 @@ pub async fn delete_document(
|
||||
index_uid: index_uid.to_string(),
|
||||
documents_ids: vec![document_id],
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
debug!(returns = ?task, "Delete document");
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
debug!("returns: {:?}", task);
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
||||
@ -267,6 +275,7 @@ pub async fn replace_documents(
|
||||
params: AwebQueryParameter<UpdateDocumentsQuery, DeserrQueryParamError>,
|
||||
body: Payload,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
@ -277,6 +286,8 @@ pub async fn replace_documents(
|
||||
analytics.add_documents(&params, index_scheduler.index(&index_uid).is_err(), &req);
|
||||
|
||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task = document_addition(
|
||||
extract_mime_type(&req)?,
|
||||
index_scheduler,
|
||||
@ -285,6 +296,8 @@ pub async fn replace_documents(
|
||||
params.csv_delimiter,
|
||||
body,
|
||||
IndexDocumentsMethod::ReplaceDocuments,
|
||||
uid,
|
||||
dry_run,
|
||||
allow_index_creation,
|
||||
)
|
||||
.await?;
|
||||
@ -299,6 +312,7 @@ pub async fn update_documents(
|
||||
params: AwebQueryParameter<UpdateDocumentsQuery, DeserrQueryParamError>,
|
||||
body: Payload,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
@ -309,6 +323,8 @@ pub async fn update_documents(
|
||||
analytics.update_documents(&params, index_scheduler.index(&index_uid).is_err(), &req);
|
||||
|
||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task = document_addition(
|
||||
extract_mime_type(&req)?,
|
||||
index_scheduler,
|
||||
@ -317,6 +333,8 @@ pub async fn update_documents(
|
||||
params.csv_delimiter,
|
||||
body,
|
||||
IndexDocumentsMethod::UpdateDocuments,
|
||||
uid,
|
||||
dry_run,
|
||||
allow_index_creation,
|
||||
)
|
||||
.await?;
|
||||
@ -334,6 +352,8 @@ async fn document_addition(
|
||||
csv_delimiter: Option<u8>,
|
||||
mut body: Payload,
|
||||
method: IndexDocumentsMethod,
|
||||
task_id: Option<TaskId>,
|
||||
dry_run: bool,
|
||||
allow_index_creation: bool,
|
||||
) -> Result<SummarizedTaskView, MeilisearchHttpError> {
|
||||
let format = match (
|
||||
@ -366,7 +386,7 @@ async fn document_addition(
|
||||
}
|
||||
};
|
||||
|
||||
let (uuid, mut update_file) = index_scheduler.create_update_file()?;
|
||||
let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?;
|
||||
|
||||
let temp_file = match tempfile() {
|
||||
Ok(file) => file,
|
||||
@ -405,11 +425,9 @@ async fn document_addition(
|
||||
let read_file = buffer.into_inner().into_std().await;
|
||||
let documents_count = tokio::task::spawn_blocking(move || {
|
||||
let documents_count = match format {
|
||||
PayloadType::Json => read_json(&read_file, update_file.as_file_mut())?,
|
||||
PayloadType::Csv { delimiter } => {
|
||||
read_csv(&read_file, update_file.as_file_mut(), delimiter)?
|
||||
}
|
||||
PayloadType::Ndjson => read_ndjson(&read_file, update_file.as_file_mut())?,
|
||||
PayloadType::Json => read_json(&read_file, &mut update_file)?,
|
||||
PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?,
|
||||
PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?,
|
||||
};
|
||||
// we NEED to persist the file here because we moved the `udpate_file` in another task.
|
||||
update_file.persist()?;
|
||||
@ -450,7 +468,9 @@ async fn document_addition(
|
||||
};
|
||||
|
||||
let scheduler = index_scheduler.clone();
|
||||
let task = match tokio::task::spawn_blocking(move || scheduler.register(task)).await? {
|
||||
let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id, dry_run))
|
||||
.await?
|
||||
{
|
||||
Ok(task) => task,
|
||||
Err(e) => {
|
||||
index_scheduler.delete_update_file(uuid)?;
|
||||
@ -466,6 +486,7 @@ pub async fn delete_documents_batch(
|
||||
index_uid: web::Path<String>,
|
||||
body: web::Json<Vec<Value>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Delete documents by batch");
|
||||
@ -480,8 +501,12 @@ pub async fn delete_documents_batch(
|
||||
|
||||
let task =
|
||||
KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Delete documents by batch");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
@ -499,6 +524,7 @@ pub async fn delete_documents_by_filter(
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Delete documents by filter");
|
||||
@ -516,8 +542,12 @@ pub async fn delete_documents_by_filter(
|
||||
.map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?;
|
||||
let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter };
|
||||
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Delete documents by filter");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
@ -527,14 +557,19 @@ pub async fn clear_all_documents(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
analytics.delete_documents(DocumentDeletionKind::ClearAll, &req);
|
||||
|
||||
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Delete all documents");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
|
@ -17,11 +17,13 @@ use serde_json::json;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
||||
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::is_dry_run;
|
||||
use crate::Opt;
|
||||
|
||||
pub mod documents;
|
||||
pub mod facet_search;
|
||||
@ -123,6 +125,7 @@ pub async fn create_index(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
|
||||
body: AwebJson<IndexCreateRequest, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Create index");
|
||||
@ -137,8 +140,12 @@ pub async fn create_index(
|
||||
);
|
||||
|
||||
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
debug!(returns = ?task, "Create index");
|
||||
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
@ -190,6 +197,7 @@ pub async fn update_index(
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Update index");
|
||||
@ -206,8 +214,12 @@ pub async fn update_index(
|
||||
primary_key: body.primary_key,
|
||||
};
|
||||
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Update index");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
@ -216,11 +228,17 @@ pub async fn update_index(
|
||||
pub async fn delete_index(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_DELETE }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
debug!(returns = ?task, "Delete index");
|
||||
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
|
@ -15,7 +15,8 @@ use tracing::debug;
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::routes::SummarizedTaskView;
|
||||
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::Opt;
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! make_setting_route {
|
||||
@ -34,7 +35,8 @@ macro_rules! make_setting_route {
|
||||
use $crate::extractors::authentication::policies::*;
|
||||
use $crate::extractors::authentication::GuardedData;
|
||||
use $crate::extractors::sequential_extractor::SeqHandler;
|
||||
use $crate::routes::SummarizedTaskView;
|
||||
use $crate::Opt;
|
||||
use $crate::routes::{is_dry_run, get_task_id, SummarizedTaskView};
|
||||
|
||||
pub async fn delete(
|
||||
index_scheduler: GuardedData<
|
||||
@ -42,6 +44,8 @@ macro_rules! make_setting_route {
|
||||
Data<IndexScheduler>,
|
||||
>,
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
@ -56,8 +60,10 @@ macro_rules! make_setting_route {
|
||||
is_deletion: true,
|
||||
allow_index_creation,
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task))
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
@ -73,6 +79,7 @@ macro_rules! make_setting_route {
|
||||
index_uid: actix_web::web::Path<String>,
|
||||
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
$analytics_var: web::Data<dyn Analytics>,
|
||||
) -> std::result::Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
@ -105,8 +112,10 @@ macro_rules! make_setting_route {
|
||||
is_deletion: false,
|
||||
allow_index_creation,
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task))
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
@ -652,6 +661,7 @@ pub async fn update_all(
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
@ -767,8 +777,12 @@ pub async fn update_all(
|
||||
is_deletion: false,
|
||||
allow_index_creation,
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Update all settings");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
@ -790,6 +804,8 @@ pub async fn get_all(
|
||||
pub async fn delete_all(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_UPDATE }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
@ -803,8 +819,12 @@ pub async fn delete_all(
|
||||
is_deletion: true,
|
||||
allow_index_creation,
|
||||
};
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Delete all settings");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
|
@ -22,21 +22,23 @@ use crate::error::MeilisearchHttpError;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::LogRouteHandle;
|
||||
use crate::{LogRouteHandle, LogStderrHandle};
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(
|
||||
web::resource("stream")
|
||||
.route(web::post().to(SeqHandler(get_logs)))
|
||||
.route(web::delete().to(SeqHandler(cancel_logs))),
|
||||
);
|
||||
)
|
||||
.service(web::resource("stderr").route(web::post().to(SeqHandler(update_stderr_target))));
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy, Deserr, PartialEq, Eq)]
|
||||
#[deserr(rename_all = lowercase)]
|
||||
#[deserr(rename_all = camelCase)]
|
||||
pub enum LogMode {
|
||||
#[default]
|
||||
Fmt,
|
||||
Human,
|
||||
Json,
|
||||
Profile,
|
||||
}
|
||||
|
||||
@ -50,7 +52,7 @@ enum MyParseError {
|
||||
#[error(transparent)]
|
||||
ParseError(#[from] tracing_subscriber::filter::ParseError),
|
||||
#[error(
|
||||
"Empty string is not a valid target. If you want to get no logs use `OFF`. Usage: `info`, `info:meilisearch`, or you can write multiple filters in one target: `index_scheduler=info,milli=trace`"
|
||||
"Empty string is not a valid target. If you want to get no logs use `OFF`. Usage: `info`, `meilisearch=info`, or you can write multiple filters in one target: `index_scheduler=info,milli=trace`"
|
||||
)]
|
||||
Example,
|
||||
}
|
||||
@ -160,12 +162,23 @@ fn make_layer<
|
||||
) -> (Box<dyn Layer<S> + Send + Sync>, PinnedByteStream) {
|
||||
let guard = HandleGuard { logs: logs.into_inner() };
|
||||
match opt.mode {
|
||||
LogMode::Fmt => {
|
||||
LogMode::Human => {
|
||||
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_writer(move || LogWriter { sender: sender.clone() })
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE);
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
|
||||
|
||||
let stream = byte_stream(receiver, guard);
|
||||
(Box::new(fmt_layer) as Box<dyn Layer<S> + Send + Sync>, Box::pin(stream))
|
||||
}
|
||||
LogMode::Json => {
|
||||
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let fmt_layer = tracing_subscriber::fmt::layer()
|
||||
.with_writer(move || LogWriter { sender: sender.clone() })
|
||||
.json()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
|
||||
|
||||
let stream = byte_stream(receiver, guard);
|
||||
(Box::new(fmt_layer) as Box<dyn Layer<S> + Send + Sync>, Box::pin(stream))
|
||||
@ -279,3 +292,27 @@ pub async fn cancel_logs(
|
||||
|
||||
Ok(HttpResponse::NoContent().finish())
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct UpdateStderrLogs {
|
||||
#[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError<BadRequest>)]
|
||||
target: MyTargets,
|
||||
}
|
||||
|
||||
pub async fn update_stderr_target(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
|
||||
logs: Data<LogStderrHandle>,
|
||||
body: AwebJson<UpdateStderrLogs, DeserrJsonError>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
index_scheduler.features().check_logs_route()?;
|
||||
|
||||
let opt = body.into_inner();
|
||||
|
||||
logs.modify(|layer| {
|
||||
*layer.filter_mut() = opt.target.0.clone();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
Ok(HttpResponse::NoContent().finish())
|
||||
}
|
||||
|
@ -4,7 +4,7 @@ use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::error::{Code, ResponseError};
|
||||
use meilisearch_types::settings::{Settings, Unchecked};
|
||||
use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@ -15,6 +15,7 @@ use tracing::debug;
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::Opt;
|
||||
|
||||
const PAGINATION_DEFAULT_LIMIT: usize = 20;
|
||||
|
||||
@ -45,6 +46,56 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
.service(web::scope("/experimental-features").configure(features::configure));
|
||||
}
|
||||
|
||||
pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result<Option<TaskId>, ResponseError> {
|
||||
if !opt.experimental_replication_parameters {
|
||||
return Ok(None);
|
||||
}
|
||||
let task_id = req
|
||||
.headers()
|
||||
.get("TaskId")
|
||||
.map(|header| {
|
||||
header.to_str().map_err(|e| {
|
||||
ResponseError::from_msg(
|
||||
format!("TaskId is not a valid utf-8 string: {e}"),
|
||||
Code::BadRequest,
|
||||
)
|
||||
})
|
||||
})
|
||||
.transpose()?
|
||||
.map(|s| {
|
||||
s.parse::<TaskId>().map_err(|e| {
|
||||
ResponseError::from_msg(
|
||||
format!(
|
||||
"Could not parse the TaskId as a {}: {e}",
|
||||
std::any::type_name::<TaskId>(),
|
||||
),
|
||||
Code::BadRequest,
|
||||
)
|
||||
})
|
||||
})
|
||||
.transpose()?;
|
||||
Ok(task_id)
|
||||
}
|
||||
|
||||
pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result<bool, ResponseError> {
|
||||
if !opt.experimental_replication_parameters {
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(req
|
||||
.headers()
|
||||
.get("DryRun")
|
||||
.map(|header| {
|
||||
header.to_str().map_err(|e| {
|
||||
ResponseError::from_msg(
|
||||
format!("DryRun is not a valid utf-8 string: {e}"),
|
||||
Code::BadRequest,
|
||||
)
|
||||
})
|
||||
})
|
||||
.transpose()?
|
||||
.map_or(false, |s| s.to_lowercase() == "true"))
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct SummarizedTaskView {
|
||||
@ -308,12 +359,18 @@ async fn get_version(
|
||||
) -> HttpResponse {
|
||||
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
|
||||
|
||||
let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown");
|
||||
let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown");
|
||||
let build_info = build_info::BuildInfo::from_build();
|
||||
|
||||
HttpResponse::Ok().json(VersionResponse {
|
||||
commit_sha: commit_sha.to_string(),
|
||||
commit_date: commit_date.to_string(),
|
||||
commit_sha: build_info.commit_sha1.unwrap_or("unknown").to_string(),
|
||||
commit_date: build_info
|
||||
.commit_timestamp
|
||||
.and_then(|commit_timestamp| {
|
||||
commit_timestamp
|
||||
.format(&time::format_description::well_known::Iso8601::DEFAULT)
|
||||
.ok()
|
||||
})
|
||||
.unwrap_or("unknown".into()),
|
||||
pkg_version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
})
|
||||
}
|
||||
|
@ -10,7 +10,8 @@ use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::SummarizedTaskView;
|
||||
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::Opt;
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
|
||||
@ -19,13 +20,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
pub async fn create_snapshot(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
|
||||
|
||||
let task = KindWithContent::SnapshotCreation;
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
|
||||
debug!(returns = ?task, "Create snapshot");
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
|
@ -10,12 +10,13 @@ use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::tasks::{IndexSwap, KindWithContent};
|
||||
use serde_json::json;
|
||||
|
||||
use super::SummarizedTaskView;
|
||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::error::MeilisearchHttpError;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::Opt;
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(swap_indexes))));
|
||||
@ -32,6 +33,7 @@ pub async fn swap_indexes(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
|
||||
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
@ -60,7 +62,11 @@ pub async fn swap_indexes(
|
||||
}
|
||||
|
||||
let task = KindWithContent::IndexSwap { swaps };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task: SummarizedTaskView =
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
|
||||
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
|
||||
.await??
|
||||
.into();
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
@ -18,11 +18,12 @@ use time::macros::format_description;
|
||||
use time::{Date, Duration, OffsetDateTime, Time};
|
||||
use tokio::task;
|
||||
|
||||
use super::SummarizedTaskView;
|
||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::Opt;
|
||||
|
||||
const DEFAULT_LIMIT: u32 = 20;
|
||||
|
||||
@ -161,6 +162,7 @@ async fn cancel_tasks(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
|
||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
@ -197,7 +199,11 @@ async fn cancel_tasks(
|
||||
let task_cancelation =
|
||||
KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks };
|
||||
|
||||
let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation)).await??;
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task =
|
||||
task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid, dry_run))
|
||||
.await??;
|
||||
let task: SummarizedTaskView = task.into();
|
||||
|
||||
Ok(HttpResponse::Ok().json(task))
|
||||
@ -207,6 +213,7 @@ async fn delete_tasks(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_DELETE }>, Data<IndexScheduler>>,
|
||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
@ -242,7 +249,10 @@ async fn delete_tasks(
|
||||
let task_deletion =
|
||||
KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks };
|
||||
|
||||
let task = task::spawn_blocking(move || index_scheduler.register(task_deletion)).await??;
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
let dry_run = is_dry_run(&req, &opt)?;
|
||||
let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid, dry_run))
|
||||
.await??;
|
||||
let task: SummarizedTaskView = task.into();
|
||||
|
||||
Ok(HttpResponse::Ok().json(task))
|
||||
|
@ -441,10 +441,6 @@ fn prepare_search<'t>(
|
||||
ScoringStrategy::Skip
|
||||
});
|
||||
|
||||
if query.show_ranking_score_details {
|
||||
features.check_score_details()?;
|
||||
}
|
||||
|
||||
if let Some(HybridQuery { embedder: Some(embedder), .. }) = &query.hybrid {
|
||||
search.embedder_name(embedder);
|
||||
}
|
||||
|
@ -100,16 +100,11 @@ impl Index<'_> {
|
||||
pub async fn raw_add_documents(
|
||||
&self,
|
||||
payload: &str,
|
||||
content_type: Option<&str>,
|
||||
headers: Vec<(&str, &str)>,
|
||||
query_parameter: &str,
|
||||
) -> (Value, StatusCode) {
|
||||
let url = format!("/indexes/{}/documents{}", urlencode(self.uid.as_ref()), query_parameter);
|
||||
|
||||
if let Some(content_type) = content_type {
|
||||
self.service.post_str(url, payload, vec![("Content-Type", content_type)]).await
|
||||
} else {
|
||||
self.service.post_str(url, payload, Vec::new()).await
|
||||
}
|
||||
self.service.post_str(url, payload, headers).await
|
||||
}
|
||||
|
||||
pub async fn update_documents(
|
||||
|
@ -9,7 +9,7 @@ use actix_web::http::StatusCode;
|
||||
use byte_unit::{Byte, ByteUnit};
|
||||
use clap::Parser;
|
||||
use meilisearch::option::{IndexerOpts, MaxMemory, Opt};
|
||||
use meilisearch::{analytics, create_app, setup_meilisearch};
|
||||
use meilisearch::{analytics, create_app, setup_meilisearch, SubscriberForSecondLayer};
|
||||
use once_cell::sync::Lazy;
|
||||
use tempfile::TempDir;
|
||||
use tokio::time::sleep;
|
||||
@ -87,12 +87,20 @@ impl Server {
|
||||
tracing_subscriber::reload::Layer::new(None.with_filter(
|
||||
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
|
||||
));
|
||||
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
|
||||
(Box::new(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
|
||||
)
|
||||
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
|
||||
.with_filter(tracing_subscriber::filter::Targets::new()),
|
||||
);
|
||||
|
||||
actix_web::test::init_service(create_app(
|
||||
self.service.index_scheduler.clone().into(),
|
||||
self.service.auth.clone().into(),
|
||||
self.service.options.clone(),
|
||||
route_layer_handle,
|
||||
(route_layer_handle, stderr_layer_handle),
|
||||
analytics::MockAnalytics::new(&self.service.options),
|
||||
true,
|
||||
))
|
||||
|
@ -5,7 +5,7 @@ use actix_web::http::StatusCode;
|
||||
use actix_web::test;
|
||||
use actix_web::test::TestRequest;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch::{analytics, create_app, Opt};
|
||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
||||
use meilisearch_auth::AuthController;
|
||||
use tracing::level_filters::LevelFilter;
|
||||
use tracing_subscriber::Layer;
|
||||
@ -111,12 +111,20 @@ impl Service {
|
||||
tracing_subscriber::reload::Layer::new(None.with_filter(
|
||||
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
|
||||
));
|
||||
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
|
||||
(Box::new(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
|
||||
)
|
||||
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
|
||||
.with_filter(tracing_subscriber::filter::Targets::new()),
|
||||
);
|
||||
|
||||
let app = test::init_service(create_app(
|
||||
self.index_scheduler.clone().into(),
|
||||
self.auth.clone().into(),
|
||||
self.options.clone(),
|
||||
route_layer_handle,
|
||||
(route_layer_handle, stderr_layer_handle),
|
||||
analytics::MockAnalytics::new(&self.options),
|
||||
true,
|
||||
))
|
||||
|
@ -1,10 +1,11 @@
|
||||
use actix_web::test;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{GetAllDocumentsOptions, Server, Value};
|
||||
use crate::common::{default_settings, GetAllDocumentsOptions, Server, Value};
|
||||
use crate::json;
|
||||
|
||||
/// This is the basic usage of our API and every other tests uses the content-type application/json
|
||||
@ -2157,3 +2158,49 @@ async fn batch_several_documents_addition() {
|
||||
assert_eq!(code, 200, "failed with `{}`", response);
|
||||
assert_eq!(response["results"].as_array().unwrap().len(), 120);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn dry_register_file() {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
|
||||
let options =
|
||||
Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) };
|
||||
let server = Server::new_with_options(options).await.unwrap();
|
||||
let index = server.index("tamo");
|
||||
|
||||
let documents = r#"
|
||||
{
|
||||
"id": "12",
|
||||
"doggo": "kefir"
|
||||
}
|
||||
"#;
|
||||
|
||||
let (response, code) = index
|
||||
.raw_add_documents(
|
||||
documents,
|
||||
vec![("Content-Type", "application/json"), ("DryRun", "true")],
|
||||
"",
|
||||
)
|
||||
.await;
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"taskUid": 0,
|
||||
"indexUid": "tamo",
|
||||
"status": "enqueued",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"202 Accepted");
|
||||
|
||||
let (response, code) = index.get_task(response.uid()).await;
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Task `0` not found.",
|
||||
"code": "task_not_found",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#task_not_found"
|
||||
}
|
||||
"###);
|
||||
snapshot!(code, @"404 Not Found");
|
||||
}
|
||||
|
@ -209,7 +209,8 @@ async fn replace_documents_missing_payload() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = index.raw_add_documents("", Some("application/json"), "").await;
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", vec![("Content-Type", "application/json")], "").await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -220,7 +221,8 @@ async fn replace_documents_missing_payload() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index.raw_add_documents("", Some("application/x-ndjson"), "").await;
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "").await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -231,7 +233,8 @@ async fn replace_documents_missing_payload() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index.raw_add_documents("", Some("text/csv"), "").await;
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", vec![("Content-Type", "text/csv")], "").await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -287,7 +290,7 @@ async fn replace_documents_missing_content_type() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = index.raw_add_documents("", None, "").await;
|
||||
let (response, code) = index.raw_add_documents("", Vec::new(), "").await;
|
||||
snapshot!(code, @"415 Unsupported Media Type");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -299,7 +302,7 @@ async fn replace_documents_missing_content_type() {
|
||||
"###);
|
||||
|
||||
// even with a csv delimiter specified this error is triggered first
|
||||
let (response, code) = index.raw_add_documents("", None, "?csvDelimiter=;").await;
|
||||
let (response, code) = index.raw_add_documents("", Vec::new(), "?csvDelimiter=;").await;
|
||||
snapshot!(code, @"415 Unsupported Media Type");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -345,7 +348,7 @@ async fn replace_documents_bad_content_type() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = index.raw_add_documents("", Some("doggo"), "").await;
|
||||
let (response, code) = index.raw_add_documents("", vec![("Content-Type", "doggo")], "").await;
|
||||
snapshot!(code, @"415 Unsupported Media Type");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -379,8 +382,9 @@ async fn replace_documents_bad_csv_delimiter() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", Some("application/json"), "?csvDelimiter").await;
|
||||
let (response, code) = index
|
||||
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter")
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -391,8 +395,9 @@ async fn replace_documents_bad_csv_delimiter() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", Some("application/json"), "?csvDelimiter=doggo").await;
|
||||
let (response, code) = index
|
||||
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=doggo")
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -404,7 +409,11 @@ async fn replace_documents_bad_csv_delimiter() {
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.raw_add_documents("", Some("application/json"), &format!("?csvDelimiter={}", encode("🍰")))
|
||||
.raw_add_documents(
|
||||
"",
|
||||
vec![("Content-Type", "application/json")],
|
||||
&format!("?csvDelimiter={}", encode("🍰")),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
@ -469,8 +478,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", Some("application/json"), "?csvDelimiter=a").await;
|
||||
let (response, code) = index
|
||||
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=a")
|
||||
.await;
|
||||
snapshot!(code, @"415 Unsupported Media Type");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
@ -481,8 +491,9 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() {
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) =
|
||||
index.raw_add_documents("", Some("application/x-ndjson"), "?csvDelimiter=a").await;
|
||||
let (response, code) = index
|
||||
.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "?csvDelimiter=a")
|
||||
.await;
|
||||
snapshot!(code, @"415 Unsupported Media Type");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
|
@ -1,4 +1,4 @@
|
||||
use meili_snap::snapshot;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{GetAllDocumentsOptions, Server};
|
||||
@ -209,3 +209,93 @@ async fn error_update_documents_missing_document_id() {
|
||||
"https://docs.meilisearch.com/errors#missing_document_id"
|
||||
);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn update_faceted_document() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"rankingRules": ["facet:asc"],
|
||||
}))
|
||||
.await;
|
||||
assert_eq!("202", code.as_str(), "{:?}", response);
|
||||
index.wait_task(0).await;
|
||||
|
||||
let documents: Vec<_> = (0..1000)
|
||||
.map(|id| {
|
||||
json!({
|
||||
"doc_id": id,
|
||||
"facet": (id/3),
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (_response, code) = index.add_documents(documents.into(), None).await;
|
||||
assert_eq!(code, 202);
|
||||
|
||||
index.wait_task(1).await;
|
||||
|
||||
let documents = json!([
|
||||
{
|
||||
"doc_id": 9,
|
||||
"facet": 1.5,
|
||||
}
|
||||
]);
|
||||
|
||||
let (response, code) = index.update_documents(documents, None).await;
|
||||
assert_eq!(code, 202, "response: {}", response);
|
||||
|
||||
index.wait_task(2).await;
|
||||
|
||||
index
|
||||
.search(json!({"limit": 10}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"doc_id": 0,
|
||||
"facet": 0
|
||||
},
|
||||
{
|
||||
"doc_id": 1,
|
||||
"facet": 0
|
||||
},
|
||||
{
|
||||
"doc_id": 2,
|
||||
"facet": 0
|
||||
},
|
||||
{
|
||||
"doc_id": 3,
|
||||
"facet": 1
|
||||
},
|
||||
{
|
||||
"doc_id": 4,
|
||||
"facet": 1
|
||||
},
|
||||
{
|
||||
"doc_id": 5,
|
||||
"facet": 1
|
||||
},
|
||||
{
|
||||
"doc_id": 9,
|
||||
"facet": 1.5
|
||||
},
|
||||
{
|
||||
"doc_id": 6,
|
||||
"facet": 2
|
||||
},
|
||||
{
|
||||
"doc_id": 7,
|
||||
"facet": 2
|
||||
},
|
||||
{
|
||||
"doc_id": 8,
|
||||
"facet": 2
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
@ -1845,7 +1845,6 @@ async fn import_dump_v6_containing_experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": false,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
|
@ -18,7 +18,6 @@ async fn experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": false,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -31,7 +30,6 @@ async fn experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -44,7 +42,6 @@ async fn experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -58,7 +55,6 @@ async fn experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -72,7 +68,6 @@ async fn experimental_features() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -93,7 +88,6 @@ async fn experimental_feature_metrics() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": false,
|
||||
"metrics": true,
|
||||
"logsRoute": false,
|
||||
@ -152,7 +146,7 @@ async fn errors() {
|
||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `logsRoute`, `exportPuffinReports`",
|
||||
"message": "Unknown field `NotAFeature`: expected one of `vectorStore`, `metrics`, `logsRoute`, `exportPuffinReports`",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
|
@ -2,9 +2,10 @@ use actix_web::http::header::ContentType;
|
||||
use actix_web::test;
|
||||
use http::header::ACCEPT_ENCODING;
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
|
||||
use crate::common::encoder::Encoder;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::common::{default_settings, Server, Value};
|
||||
use crate::json;
|
||||
|
||||
#[actix_rt::test]
|
||||
@ -199,3 +200,79 @@ async fn error_create_with_invalid_index_uid() {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn send_task_id() {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
|
||||
let options =
|
||||
Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) };
|
||||
let server = Server::new_with_options(options).await.unwrap();
|
||||
|
||||
let app = server.init_web_app().await;
|
||||
let index = server.index("catto");
|
||||
let (response, code) = index.create(None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
|
||||
{
|
||||
"taskUid": 0,
|
||||
"indexUid": "catto",
|
||||
"status": "enqueued",
|
||||
"type": "indexCreation",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let body = serde_json::to_string(&json!({
|
||||
"uid": "doggo",
|
||||
"primaryKey": None::<&str>,
|
||||
}))
|
||||
.unwrap();
|
||||
let req = test::TestRequest::post()
|
||||
.uri("/indexes")
|
||||
.insert_header(("TaskId", "25"))
|
||||
.insert_header(ContentType::json())
|
||||
.set_payload(body)
|
||||
.to_request();
|
||||
|
||||
let res = test::call_service(&app, req).await;
|
||||
snapshot!(res.status(), @"202 Accepted");
|
||||
|
||||
let bytes = test::read_body(res).await;
|
||||
let response = serde_json::from_slice::<Value>(&bytes).expect("Expecting valid json");
|
||||
snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
|
||||
{
|
||||
"taskUid": 25,
|
||||
"indexUid": "doggo",
|
||||
"status": "enqueued",
|
||||
"type": "indexCreation",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let body = serde_json::to_string(&json!({
|
||||
"uid": "girafo",
|
||||
"primaryKey": None::<&str>,
|
||||
}))
|
||||
.unwrap();
|
||||
let req = test::TestRequest::post()
|
||||
.uri("/indexes")
|
||||
.insert_header(("TaskId", "12"))
|
||||
.insert_header(ContentType::json())
|
||||
.set_payload(body)
|
||||
.to_request();
|
||||
|
||||
let res = test::call_service(&app, req).await;
|
||||
snapshot!(res.status(), @"400 Bad Request");
|
||||
|
||||
let bytes = test::read_body(res).await;
|
||||
let response = serde_json::from_slice::<Value>(&bytes).expect("Expecting valid json");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
{
|
||||
"message": "Received bad task id: 12 should be >= to 26.",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ async fn logs_stream_bad_target() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value at `.target`: Empty string is not a valid target. If you want to get no logs use `OFF`. Usage: `info`, `info:meilisearch`, or you can write multiple filters in one target: `index_scheduler=info,milli=trace`",
|
||||
"message": "Invalid value at `.target`: Empty string is not a valid target. If you want to get no logs use `OFF`. Usage: `info`, `meilisearch=info`, or you can write multiple filters in one target: `index_scheduler=info,milli=trace`",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
@ -89,7 +89,7 @@ async fn logs_stream_bad_mode() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Unknown value `tamo` at `.mode`: expected one of `fmt`, `profile`",
|
||||
"message": "Unknown value `tamo` at `.mode`: expected one of `human`, `json`, `profile`",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
@ -133,7 +133,7 @@ async fn logs_stream_bad_profile_memory() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value: `profile_memory` can only be used while profiling code and is not compatible with the Fmt mode.",
|
||||
"message": "Invalid value: `profile_memory` can only be used while profiling code and is not compatible with the Human mode.",
|
||||
"code": "invalid_settings_typo_tolerance",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_settings_typo_tolerance"
|
||||
@ -146,10 +146,10 @@ async fn logs_stream_bad_profile_memory() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value: `profile_memory` can only be used while profiling code and is not compatible with the Fmt mode.",
|
||||
"code": "invalid_settings_typo_tolerance",
|
||||
"message": "Unknown value `fmt` at `.mode`: expected one of `human`, `json`, `profile`",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_settings_typo_tolerance"
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
@ -162,7 +162,7 @@ async fn logs_stream_without_enabling_the_route() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
|
||||
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
@ -173,7 +173,18 @@ async fn logs_stream_without_enabling_the_route() {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
|
||||
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = server.service.post("/logs/stderr", json!({})).await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
|
@ -5,7 +5,7 @@ use std::str::FromStr;
|
||||
|
||||
use actix_web::http::header::ContentType;
|
||||
use meili_snap::snapshot;
|
||||
use meilisearch::{analytics, create_app, Opt};
|
||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
||||
use tracing::level_filters::LevelFilter;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::Layer;
|
||||
@ -27,18 +27,25 @@ async fn basic_test_log_stream_route() {
|
||||
tracing_subscriber::reload::Layer::new(None.with_filter(
|
||||
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
|
||||
));
|
||||
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
|
||||
(Box::new(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
|
||||
) as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
|
||||
.with_filter(tracing_subscriber::filter::Targets::new()),
|
||||
);
|
||||
|
||||
let subscriber = tracing_subscriber::registry().with(route_layer).with(
|
||||
tracing_subscriber::fmt::layer()
|
||||
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE)
|
||||
.with_filter(tracing_subscriber::filter::LevelFilter::from_str("INFO").unwrap()),
|
||||
.with_filter(tracing_subscriber::filter::LevelFilter::from_str("OFF").unwrap()),
|
||||
);
|
||||
|
||||
let app = actix_web::test::init_service(create_app(
|
||||
server.service.index_scheduler.clone().into(),
|
||||
server.service.auth.clone().into(),
|
||||
server.service.options.clone(),
|
||||
route_layer_handle,
|
||||
(route_layer_handle, stderr_layer_handle),
|
||||
analytics::MockAnalytics::new(&server.service.options),
|
||||
true,
|
||||
))
|
||||
@ -57,7 +64,7 @@ async fn basic_test_log_stream_route() {
|
||||
.insert_header(ContentType::json())
|
||||
.set_payload(
|
||||
serde_json::to_vec(&json!({
|
||||
"mode": "fmt",
|
||||
"mode": "human",
|
||||
"target": "info",
|
||||
}))
|
||||
.unwrap(),
|
||||
|
@ -13,7 +13,6 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
@ -88,6 +87,52 @@ async fn simple_search() {
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn highlighter() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 0.2},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 0.8},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_semanticScore":0.9472136}]"###);
|
||||
|
||||
// no highlighting on full semantic
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 1.0},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}}]"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn invalid_semantic_ratio() {
|
||||
let server = Server::new().await;
|
||||
|
@ -766,38 +766,14 @@ async fn faceting_max_values_per_facet() {
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn experimental_feature_score_details() {
|
||||
async fn test_score_details() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
|
||||
index.add_documents(json!(documents), None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
index
|
||||
.search(
|
||||
json!({
|
||||
"q": "train dragon",
|
||||
"showRankingScoreDetails": true,
|
||||
}),
|
||||
|response, code| {
|
||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"message": "Computing score details requires enabling the `score details` experimental feature. See https://github.com/meilisearch/product/discussions/674",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let (response, code) = server.set_features(json!({"scoreDetails": true})).await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(response["scoreDetails"], @"true");
|
||||
let res = index.add_documents(json!(documents), None).await;
|
||||
index.wait_task(res.0.uid()).await;
|
||||
|
||||
index
|
||||
.search(
|
||||
|
@ -7,7 +7,7 @@ use std::sync::Arc;
|
||||
use actix_http::body::MessageBody;
|
||||
use actix_web::dev::{ServiceFactory, ServiceResponse};
|
||||
use actix_web::web::{Bytes, Data};
|
||||
use actix_web::{post, App, HttpResponse, HttpServer};
|
||||
use actix_web::{post, App, HttpRequest, HttpResponse, HttpServer};
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
use tokio::sync::mpsc;
|
||||
@ -17,7 +17,17 @@ use crate::common::{default_settings, Server};
|
||||
use crate::json;
|
||||
|
||||
#[post("/")]
|
||||
async fn forward_body(sender: Data<mpsc::UnboundedSender<Vec<u8>>>, body: Bytes) -> HttpResponse {
|
||||
async fn forward_body(
|
||||
req: HttpRequest,
|
||||
sender: Data<mpsc::UnboundedSender<Vec<u8>>>,
|
||||
body: Bytes,
|
||||
) -> HttpResponse {
|
||||
let headers = req.headers();
|
||||
assert_eq!(headers.get("content-type").unwrap(), "application/x-ndjson");
|
||||
assert_eq!(headers.get("transfer-encoding").unwrap(), "chunked");
|
||||
assert_eq!(headers.get("accept-encoding").unwrap(), "gzip");
|
||||
assert_eq!(headers.get("content-encoding").unwrap(), "gzip");
|
||||
|
||||
let body = body.to_vec();
|
||||
sender.send(body).unwrap();
|
||||
HttpResponse::Ok().into()
|
||||
|
@ -1,5 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use uuid::Uuid;
|
||||
|
@ -17,7 +17,7 @@ bincode = "1.3.3"
|
||||
bstr = "1.9.0"
|
||||
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.5.0"
|
||||
charabia = { version = "0.8.5", default-features = false }
|
||||
charabia = { version = "0.8.7", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.11"
|
||||
deserr = "0.6.1"
|
||||
@ -70,13 +70,13 @@ itertools = "0.11.0"
|
||||
# profiling
|
||||
puffin = "0.16.0"
|
||||
|
||||
# logging
|
||||
logging_timer = "1.1.0"
|
||||
csv = "1.3.0"
|
||||
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] }
|
||||
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = [
|
||||
"onig",
|
||||
] }
|
||||
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
|
||||
"online",
|
||||
] }
|
||||
@ -102,7 +102,16 @@ meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
|
||||
all-tokenizations = [
|
||||
"charabia/chinese",
|
||||
"charabia/hebrew",
|
||||
"charabia/japanese",
|
||||
"charabia/thai",
|
||||
"charabia/korean",
|
||||
"charabia/greek",
|
||||
"charabia/khmer",
|
||||
"charabia/vietnamese",
|
||||
]
|
||||
|
||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||
# For more information on this feature, see heed's Cargo.toml
|
||||
@ -130,5 +139,7 @@ greek = ["charabia/greek"]
|
||||
# allow khmer specialized tokenization
|
||||
khmer = ["charabia/khmer"]
|
||||
|
||||
vietnamese = ["charabia/vietnamese"]
|
||||
|
||||
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
||||
cuda = ["candle-core/cuda"]
|
||||
|
@ -227,6 +227,22 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
source_: crate::vector::settings::EmbedderSource,
|
||||
embedder_name: String,
|
||||
},
|
||||
#[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its native dimensions of {expected_dimensions}. Found {dimensions}")]
|
||||
InvalidOpenAiModelDimensions {
|
||||
embedder_name: String,
|
||||
model: &'static str,
|
||||
dimensions: usize,
|
||||
expected_dimensions: usize,
|
||||
},
|
||||
#[error("`.embedders.{embedder_name}.dimensions`: Model `{model}` does not support overriding its dimensions to a value higher than {max_dimensions}. Found {dimensions}")]
|
||||
InvalidOpenAiModelDimensionsMax {
|
||||
embedder_name: String,
|
||||
model: &'static str,
|
||||
dimensions: usize,
|
||||
max_dimensions: usize,
|
||||
},
|
||||
#[error("`.embedders.{embedder_name}.dimensions`: `dimensions` cannot be zero")]
|
||||
InvalidSettingsDimensions { embedder_name: String },
|
||||
}
|
||||
|
||||
impl From<crate::vector::Error> for Error {
|
||||
|
@ -102,7 +102,7 @@ impl ScoreWithRatioResult {
|
||||
}
|
||||
|
||||
SearchResult {
|
||||
matching_words: left.matching_words,
|
||||
matching_words: right.matching_words,
|
||||
candidates: left.candidates | right.candidates,
|
||||
documents_ids,
|
||||
document_scores,
|
||||
|
@ -15,7 +15,7 @@ pub struct BucketSortOutput {
|
||||
|
||||
// TODO: would probably be good to regroup some of these inside of a struct?
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")]
|
||||
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||
|
@ -191,7 +191,7 @@ fn resolve_maximally_reduced_query_graph(
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
#[logging_timer::time]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search")]
|
||||
fn resolve_universe(
|
||||
ctx: &mut SearchContext,
|
||||
initial_universe: &RoaringBitmap,
|
||||
@ -557,7 +557,7 @@ pub fn execute_vector_search(
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search")]
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext,
|
||||
query: Option<&str>,
|
||||
@ -577,6 +577,9 @@ pub fn execute_search(
|
||||
|
||||
let mut located_query_terms = None;
|
||||
let query_terms = if let Some(query) = query {
|
||||
let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
|
||||
let entered = span.enter();
|
||||
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
// this ensures that the query builder is able to properly remove them.
|
||||
let mut tokbuilder = TokenizerBuilder::new();
|
||||
@ -605,7 +608,12 @@ pub fn execute_search(
|
||||
}
|
||||
|
||||
let tokenizer = tokbuilder.build();
|
||||
drop(entered);
|
||||
|
||||
let span = tracing::trace_span!(target: "search::tokens", "tokenize");
|
||||
let entered = span.enter();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
drop(entered);
|
||||
|
||||
let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
|
||||
if query_terms.is_empty() {
|
||||
|
@ -6,9 +6,10 @@ use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use heed::types::DecodeIgnore;
|
||||
|
||||
use super::*;
|
||||
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
|
||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||
use crate::search::new::query_term::TwoTypoTerm;
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::{Lazy, TwoTypoTerm};
|
||||
use crate::search::new::{limits, SearchContext};
|
||||
use crate::search::{build_dfa, get_first};
|
||||
use crate::{Result, MAX_WORD_LENGTH};
|
||||
|
@ -7,7 +7,6 @@ use std::collections::BTreeSet;
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use compute_derivations::partially_initialized_term_from_word;
|
||||
use either::Either;
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
|
||||
|
@ -1,11 +1,15 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use charabia::normalizer::NormalizedTokenIter;
|
||||
use charabia::{SeparatorKind, TokenKind};
|
||||
|
||||
use super::*;
|
||||
use super::compute_derivations::partially_initialized_term_from_word;
|
||||
use super::{LocatedQueryTerm, ZeroTypoTerm};
|
||||
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
#[logging_timer::time]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
|
||||
pub fn located_query_terms_from_tokens(
|
||||
ctx: &mut SearchContext,
|
||||
query: NormalizedTokenIter,
|
||||
@ -225,7 +229,7 @@ pub fn make_ngram(
|
||||
}
|
||||
|
||||
struct PhraseBuilder {
|
||||
words: Vec<Option<Interned<String>>>,
|
||||
words: Vec<Option<crate::search::new::Interned<String>>>,
|
||||
start: u16,
|
||||
end: u16,
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use grenad::{CompressionType, Merger};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
@ -14,6 +14,7 @@ use crate::heed_codec::facet::{
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
|
||||
@ -28,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> {
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
delta_data: Option<grenad::Reader<BufReader<File>>>,
|
||||
delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
@ -36,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
@ -65,7 +66,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time("FacetsUpdateBulk::{}")]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
@ -89,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub delta_data: Option<grenad::Reader<R>>,
|
||||
pub delta_data: Option<Merger<R, MergeFn>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
@ -129,8 +130,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
@ -154,8 +155,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut cursor = delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::Merger;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use heed::{BytesDecode, Error, RoTxn, RwTxn};
|
||||
use obkv::KvReader;
|
||||
@ -14,31 +15,56 @@ use crate::heed_codec::BytesRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
enum InsertionResult {
|
||||
/// Enum used as a return value for the facet incremental indexing.
|
||||
///
|
||||
/// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have
|
||||
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
|
||||
/// of the parent node.
|
||||
///
|
||||
/// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted
|
||||
/// in the addition of a new key in that level, and that therefore the number of children
|
||||
/// of the parent node should be incremented.
|
||||
///
|
||||
/// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the
|
||||
/// number of keys in the level. For example, removing a document id from the facet value `3` could
|
||||
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
|
||||
/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must
|
||||
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
|
||||
///
|
||||
/// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the
|
||||
/// bounds of the keys of the level. For example, removing a document id from the facet value
|
||||
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
|
||||
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
|
||||
/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust
|
||||
/// its left bound as well.
|
||||
///
|
||||
/// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`.
|
||||
/// This case is reachable when a document id is removed from a sub-level node but is still present in another one.
|
||||
/// For example, when removing `2` from a document containing `2` and `3`, the document id will be removed from `level 0` but should remain in the group node [1..4] in `level 1`.
|
||||
enum ModificationResult {
|
||||
InPlace,
|
||||
Expand,
|
||||
Insert,
|
||||
}
|
||||
enum DeletionResult {
|
||||
InPlace,
|
||||
Reduce { next: Option<Vec<u8>> },
|
||||
Remove { next: Option<Vec<u8>> },
|
||||
Nothing,
|
||||
}
|
||||
|
||||
/// Algorithm to incrementally insert and delete elements into the
|
||||
/// `facet_id_(string/f64)_docids` databases.
|
||||
pub struct FacetsUpdateIncremental {
|
||||
inner: FacetsUpdateIncrementalInner,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
}
|
||||
|
||||
impl FacetsUpdateIncremental {
|
||||
pub fn new(
|
||||
index: &Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
max_group_size: u8,
|
||||
@ -61,34 +87,59 @@ impl FacetsUpdateIncremental {
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")]
|
||||
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
|
||||
let mut cursor = self.delta_data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut current_field_id = None;
|
||||
let mut facet_level_may_be_updated = false;
|
||||
let mut iter = self.delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key)
|
||||
.map_err(heed::Error::Encoding)?;
|
||||
let value = KvReader::new(value);
|
||||
|
||||
if facet_level_may_be_updated
|
||||
&& current_field_id.map_or(false, |fid| fid != key.field_id)
|
||||
{
|
||||
// Only add or remove a level after making all the field modifications.
|
||||
self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?;
|
||||
facet_level_may_be_updated = false;
|
||||
}
|
||||
current_field_id = Some(key.field_id);
|
||||
|
||||
let value = KvReader::new(value);
|
||||
let docids_to_delete = value
|
||||
.get(DelAdd::Deletion)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.map_err(heed::Error::Encoding));
|
||||
.map(|o| o.map_err(heed::Error::Encoding))
|
||||
.transpose()?;
|
||||
|
||||
let docids_to_add = value
|
||||
.get(DelAdd::Addition)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
.map(|o| o.map_err(heed::Error::Encoding));
|
||||
.map(|o| o.map_err(heed::Error::Encoding))
|
||||
.transpose()?;
|
||||
|
||||
if let Some(docids_to_delete) = docids_to_delete {
|
||||
let docids_to_delete = docids_to_delete?;
|
||||
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
|
||||
let level_size_changed = self.inner.modify(
|
||||
wtxn,
|
||||
key.field_id,
|
||||
key.left_bound,
|
||||
docids_to_add.as_ref(),
|
||||
docids_to_delete.as_ref(),
|
||||
)?;
|
||||
|
||||
if level_size_changed {
|
||||
// if a node has been added or removed from the highest level,
|
||||
// we may have to update the facet level.
|
||||
facet_level_may_be_updated = true;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(docids_to_add) = docids_to_add {
|
||||
let docids_to_add = docids_to_add?;
|
||||
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
|
||||
if let Some(field_id) = current_field_id {
|
||||
if facet_level_may_be_updated {
|
||||
self.inner.add_or_delete_level(wtxn, field_id)?;
|
||||
}
|
||||
}
|
||||
|
||||
@ -162,138 +213,78 @@ impl FacetsUpdateIncrementalInner {
|
||||
///
|
||||
/// ## Return
|
||||
/// See documentation of `insert_in_level`
|
||||
fn insert_in_level_0(
|
||||
fn modify_in_level_0(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<InsertionResult> {
|
||||
add_docids: Option<&RoaringBitmap>,
|
||||
del_docids: Option<&RoaringBitmap>,
|
||||
) -> Result<ModificationResult> {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
|
||||
let value = FacetGroupValue { bitmap: docids.clone(), size: 1 };
|
||||
|
||||
let mut level0_prefix = vec![];
|
||||
level0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level0_prefix.push(0);
|
||||
|
||||
let mut iter =
|
||||
self.db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, &level0_prefix)?;
|
||||
|
||||
if iter.next().is_none() {
|
||||
drop(iter);
|
||||
self.db.put(txn, &key, &value)?;
|
||||
Ok(InsertionResult::Insert)
|
||||
} else {
|
||||
drop(iter);
|
||||
let old_value = self.db.get(txn, &key)?;
|
||||
match old_value {
|
||||
Some(mut updated_value) => {
|
||||
// now merge the two
|
||||
updated_value.bitmap |= value.bitmap;
|
||||
self.db.put(txn, &key, &updated_value)?;
|
||||
Ok(InsertionResult::InPlace)
|
||||
}
|
||||
None => {
|
||||
let old_value = self.db.get(txn, &key)?;
|
||||
match (old_value, add_docids, del_docids) {
|
||||
// Addition + deletion on an existing value
|
||||
(Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => {
|
||||
let value = FacetGroupValue { bitmap: (bitmap - del_docids) | add_docids, size: 1 };
|
||||
self.db.put(txn, &key, &value)?;
|
||||
Ok(ModificationResult::InPlace)
|
||||
}
|
||||
// Addition on an existing value
|
||||
(Some(FacetGroupValue { bitmap, .. }), Some(add_docids), None) => {
|
||||
let value = FacetGroupValue { bitmap: bitmap | add_docids, size: 1 };
|
||||
self.db.put(txn, &key, &value)?;
|
||||
Ok(ModificationResult::InPlace)
|
||||
}
|
||||
// Addition of a new value (ignore deletion)
|
||||
(None, Some(add_docids), _) => {
|
||||
let value = FacetGroupValue { bitmap: add_docids.clone(), size: 1 };
|
||||
self.db.put(txn, &key, &value)?;
|
||||
Ok(ModificationResult::Insert)
|
||||
}
|
||||
// Deletion on an existing value, fully delete the key if the resulted value is empty.
|
||||
(Some(FacetGroupValue { mut bitmap, .. }), None, Some(del_docids)) => {
|
||||
bitmap -= del_docids;
|
||||
if bitmap.is_empty() {
|
||||
// Full deletion
|
||||
let mut next_key = None;
|
||||
if let Some((next, _)) =
|
||||
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(txn, &key)?
|
||||
{
|
||||
if next.field_id == field_id && next.level == 0 {
|
||||
next_key = Some(next.left_bound.to_vec());
|
||||
}
|
||||
}
|
||||
self.db.delete(txn, &key)?;
|
||||
Ok(ModificationResult::Remove { next: next_key })
|
||||
} else {
|
||||
// Partial deletion
|
||||
let value = FacetGroupValue { bitmap, size: 1 };
|
||||
self.db.put(txn, &key, &value)?;
|
||||
Ok(InsertionResult::Insert)
|
||||
Ok(ModificationResult::InPlace)
|
||||
}
|
||||
}
|
||||
// Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing),
|
||||
// may be unreachable at some point.
|
||||
(None, None, _) | (Some(_), None, None) => Ok(ModificationResult::Nothing),
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
|
||||
/// This function works recursively.
|
||||
/// Split a level node into two balanced nodes.
|
||||
///
|
||||
/// ## Return
|
||||
/// Returns the effect of adding the facet value to the database on the given `level`.
|
||||
///
|
||||
/// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have
|
||||
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
|
||||
/// of the parent node.
|
||||
///
|
||||
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
|
||||
/// in the addition of a new key in that level, and that therefore the number of children
|
||||
/// of the parent node should be incremented.
|
||||
fn insert_in_level(
|
||||
/// # Return
|
||||
/// Returns `ModificationResult::Insert` if the split is successful.
|
||||
fn split_group(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<InsertionResult> {
|
||||
if level == 0 {
|
||||
return self.insert_in_level_0(txn, field_id, facet_value, docids);
|
||||
}
|
||||
|
||||
let max_group_size = self.max_group_size;
|
||||
|
||||
let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?;
|
||||
// level below inserted an element
|
||||
|
||||
let (insertion_key, insertion_value) =
|
||||
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
|
||||
|
||||
match result {
|
||||
// because we know that we inserted in place, the facet_value is not a new one
|
||||
// thus it doesn't extend a group, and thus the insertion key computed above is
|
||||
// still correct
|
||||
InsertionResult::InPlace => {
|
||||
let mut updated_value = insertion_value;
|
||||
updated_value.bitmap |= docids;
|
||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||
|
||||
return Ok(InsertionResult::InPlace);
|
||||
}
|
||||
InsertionResult::Expand => {}
|
||||
InsertionResult::Insert => {}
|
||||
}
|
||||
|
||||
// Here we know that inserting the facet value in the level below resulted in the creation
|
||||
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
|
||||
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
|
||||
// could happen).
|
||||
let (insertion_key, insertion_key_was_modified) = {
|
||||
let mut new_insertion_key = insertion_key.clone();
|
||||
let mut key_should_be_modified = false;
|
||||
|
||||
if facet_value < insertion_key.left_bound.as_slice() {
|
||||
new_insertion_key.left_bound = facet_value.to_vec();
|
||||
key_should_be_modified = true;
|
||||
}
|
||||
if key_should_be_modified {
|
||||
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
|
||||
assert!(is_deleted);
|
||||
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
|
||||
}
|
||||
(new_insertion_key, key_should_be_modified)
|
||||
};
|
||||
// Now we know that the insertion key contains the `facet_value`.
|
||||
|
||||
// We still need to update the insertion value by:
|
||||
// 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`)
|
||||
// 2. Merge the previous docids with the new one
|
||||
let mut updated_value = insertion_value;
|
||||
|
||||
if matches!(result, InsertionResult::Insert) {
|
||||
updated_value.size += 1;
|
||||
}
|
||||
|
||||
if updated_value.size < max_group_size {
|
||||
updated_value.bitmap |= docids;
|
||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||
if insertion_key_was_modified {
|
||||
return Ok(InsertionResult::Expand);
|
||||
} else {
|
||||
return Ok(InsertionResult::InPlace);
|
||||
}
|
||||
}
|
||||
|
||||
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
|
||||
// Therefore it must be split into two nodes.
|
||||
|
||||
let size_left = updated_value.size / 2;
|
||||
let size_right = updated_value.size - size_left;
|
||||
insertion_key: FacetGroupKey<Vec<u8>>,
|
||||
insertion_value: FacetGroupValue,
|
||||
) -> Result<ModificationResult> {
|
||||
let size_left = insertion_value.size / 2;
|
||||
let size_right = insertion_value.size - size_left;
|
||||
|
||||
let level_below = level - 1;
|
||||
|
||||
@ -347,34 +338,228 @@ impl FacetsUpdateIncrementalInner {
|
||||
self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
|
||||
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
|
||||
|
||||
Ok(InsertionResult::Insert)
|
||||
Ok(ModificationResult::Insert)
|
||||
}
|
||||
|
||||
/// Insert the given facet value and corresponding document ids in the database.
|
||||
pub fn insert(
|
||||
/// Remove the docids still present in the related sub-level nodes from the del_docids.
|
||||
///
|
||||
/// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes.
|
||||
fn trim_del_docids<'a>(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
insertion_key: &FacetGroupKey<Vec<u8>>,
|
||||
insertion_value_size: usize,
|
||||
del_docids: &'a RoaringBitmap,
|
||||
) -> Result<std::borrow::Cow<'a, RoaringBitmap>> {
|
||||
let level_below = level - 1;
|
||||
|
||||
let start_key = FacetGroupKey {
|
||||
field_id,
|
||||
level: level_below,
|
||||
left_bound: insertion_key.left_bound.as_slice(),
|
||||
};
|
||||
|
||||
let mut del_docids = std::borrow::Cow::Borrowed(del_docids);
|
||||
let iter = self.db.range(txn, &(start_key..))?.take(insertion_value_size);
|
||||
for next in iter {
|
||||
let (_, value) = next?;
|
||||
// if a sublevel bitmap has docids in common with del_docids,
|
||||
// then these docids shouldn't be removed and so, remove them from the deletion list.
|
||||
if !value.bitmap.is_disjoint(&del_docids) {
|
||||
*del_docids.to_mut() -= value.bitmap;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(del_docids)
|
||||
}
|
||||
|
||||
/// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
|
||||
/// This function works recursively.
|
||||
///
|
||||
/// ## Return
|
||||
/// Returns the effect of modifying the facet value to the database on the given `level`.
|
||||
///
|
||||
fn modify_in_level(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
facet_value: &[u8],
|
||||
add_docids: Option<&RoaringBitmap>,
|
||||
del_docids: Option<&RoaringBitmap>,
|
||||
) -> Result<ModificationResult> {
|
||||
if level == 0 {
|
||||
return self.modify_in_level_0(txn, field_id, facet_value, add_docids, del_docids);
|
||||
}
|
||||
|
||||
let result =
|
||||
self.modify_in_level(txn, field_id, level - 1, facet_value, add_docids, del_docids)?;
|
||||
// level below inserted an element
|
||||
|
||||
if let ModificationResult::Nothing = result {
|
||||
// if the previous level has not been modified,
|
||||
// early return ModificationResult::Nothing.
|
||||
return Ok(ModificationResult::Nothing);
|
||||
}
|
||||
|
||||
let (insertion_key, insertion_value) =
|
||||
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
|
||||
let insertion_value_size = insertion_value.size as usize;
|
||||
|
||||
let mut insertion_value_was_modified = false;
|
||||
let mut updated_value = insertion_value;
|
||||
|
||||
if let ModificationResult::Insert = result {
|
||||
// if a key has been inserted in the sub-level raise the value size.
|
||||
updated_value.size += 1;
|
||||
insertion_value_was_modified = true;
|
||||
} else if let ModificationResult::Remove { .. } = result {
|
||||
if updated_value.size <= 1 {
|
||||
// if the only remaining node is the one to delete,
|
||||
// delete the key instead and early return.
|
||||
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
|
||||
assert!(is_deleted);
|
||||
return Ok(result);
|
||||
} else {
|
||||
// Reduce the value size
|
||||
updated_value.size -= 1;
|
||||
insertion_value_was_modified = true;
|
||||
}
|
||||
}
|
||||
|
||||
let (insertion_key, insertion_key_modification) =
|
||||
if let ModificationResult::InPlace = result {
|
||||
(insertion_key, ModificationResult::InPlace)
|
||||
} else {
|
||||
// Inserting or deleting the facet value in the level below resulted in the creation
|
||||
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
|
||||
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
|
||||
// could happen).
|
||||
let mut new_insertion_key = insertion_key.clone();
|
||||
let mut key_modification = ModificationResult::InPlace;
|
||||
|
||||
if let ModificationResult::Remove { next } | ModificationResult::Reduce { next } =
|
||||
result
|
||||
{
|
||||
// if the deleted facet_value is the left_bound of the current node,
|
||||
// the left_bound should be updated reducing the current node.
|
||||
let reduced_range = facet_value == insertion_key.left_bound;
|
||||
if reduced_range {
|
||||
new_insertion_key.left_bound = next.clone().unwrap();
|
||||
key_modification = ModificationResult::Reduce { next };
|
||||
}
|
||||
} else if facet_value < insertion_key.left_bound.as_slice() {
|
||||
// if the added facet_value is under the left_bound of the current node,
|
||||
// the left_bound should be updated expanding the current node.
|
||||
new_insertion_key.left_bound = facet_value.to_vec();
|
||||
key_modification = ModificationResult::Expand;
|
||||
}
|
||||
|
||||
if matches!(
|
||||
key_modification,
|
||||
ModificationResult::Expand | ModificationResult::Reduce { .. }
|
||||
) {
|
||||
// if the node should be updated, delete it, it will be recreated using a new key later.
|
||||
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
|
||||
assert!(is_deleted);
|
||||
}
|
||||
(new_insertion_key, key_modification)
|
||||
};
|
||||
|
||||
if updated_value.size < self.max_group_size {
|
||||
// If there are docids to delete, trim them avoiding unexpected removal.
|
||||
if let Some(del_docids) = del_docids
|
||||
.map(|ids| {
|
||||
self.trim_del_docids(
|
||||
txn,
|
||||
field_id,
|
||||
level,
|
||||
&insertion_key,
|
||||
insertion_value_size,
|
||||
ids,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
.filter(|ids| !ids.is_empty())
|
||||
{
|
||||
updated_value.bitmap -= &*del_docids;
|
||||
insertion_value_was_modified = true;
|
||||
}
|
||||
|
||||
if let Some(add_docids) = add_docids {
|
||||
updated_value.bitmap |= add_docids;
|
||||
insertion_value_was_modified = true;
|
||||
}
|
||||
|
||||
if insertion_value_was_modified
|
||||
|| matches!(
|
||||
insertion_key_modification,
|
||||
ModificationResult::Expand | ModificationResult::Reduce { .. }
|
||||
)
|
||||
{
|
||||
// if any modification occurred, insert it in the database.
|
||||
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
|
||||
Ok(insertion_key_modification)
|
||||
} else {
|
||||
// this case is reachable when a docid is removed from a sub-level node but is still present in another one.
|
||||
// For instance, a document containing 2 and 3, if 2 is removed, the docid should remain in the group node [1..4].
|
||||
Ok(ModificationResult::Nothing)
|
||||
}
|
||||
} else {
|
||||
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
|
||||
// Therefore it must be split into two nodes.
|
||||
self.split_group(txn, field_id, level, insertion_key, updated_value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Modify the given facet value and corresponding document ids in the database.
|
||||
/// If no more document ids correspond to the facet value, delete it completely.
|
||||
///
|
||||
/// ## Return
|
||||
/// Returns `true` if some tree-nodes of the highest level have been removed or added implying a potential
|
||||
/// addition or deletion of a facet level.
|
||||
/// Otherwise returns `false` if the tree-nodes have been modified in place.
|
||||
pub fn modify(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
if docids.is_empty() {
|
||||
return Ok(());
|
||||
add_docids: Option<&RoaringBitmap>,
|
||||
del_docids: Option<&RoaringBitmap>,
|
||||
) -> Result<bool> {
|
||||
if add_docids.map_or(true, RoaringBitmap::is_empty)
|
||||
&& del_docids.map_or(true, RoaringBitmap::is_empty)
|
||||
{
|
||||
return Ok(false);
|
||||
}
|
||||
let group_size = self.group_size;
|
||||
|
||||
let highest_level = get_highest_level(txn, self.db, field_id)?;
|
||||
|
||||
let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?;
|
||||
let result = self.modify_in_level(
|
||||
txn,
|
||||
field_id,
|
||||
highest_level,
|
||||
facet_value,
|
||||
add_docids,
|
||||
del_docids,
|
||||
)?;
|
||||
match result {
|
||||
InsertionResult::InPlace => return Ok(()),
|
||||
InsertionResult::Expand => return Ok(()),
|
||||
InsertionResult::Insert => {}
|
||||
ModificationResult::InPlace
|
||||
| ModificationResult::Expand
|
||||
| ModificationResult::Nothing
|
||||
| ModificationResult::Reduce { .. } => Ok(false),
|
||||
ModificationResult::Insert | ModificationResult::Remove { .. } => Ok(true),
|
||||
}
|
||||
}
|
||||
|
||||
// Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`.
|
||||
// If it has, we must build an additional level above it.
|
||||
|
||||
/// Check whether the highest level has exceeded `min_level_size` * `self.group_size`.
|
||||
/// If it has, we must build an additional level above it.
|
||||
/// Then check whether the highest level is under `min_level_size`.
|
||||
/// If it is, we must remove the complete level.
|
||||
pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn, field_id: u16) -> Result<()> {
|
||||
let highest_level = get_highest_level(txn, self.db, field_id)?;
|
||||
let mut highest_level_prefix = vec![];
|
||||
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
highest_level_prefix.push(highest_level);
|
||||
@ -382,14 +567,48 @@ impl FacetsUpdateIncrementalInner {
|
||||
let size_highest_level =
|
||||
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?.count();
|
||||
|
||||
if size_highest_level < self.group_size as usize * self.min_level_size as usize {
|
||||
return Ok(());
|
||||
if size_highest_level >= self.group_size as usize * self.min_level_size as usize {
|
||||
self.add_level(txn, field_id, highest_level, &highest_level_prefix, size_highest_level)
|
||||
} else if size_highest_level < self.min_level_size as usize && highest_level != 0 {
|
||||
self.delete_level(txn, &highest_level_prefix)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete a level.
///
/// Removes every database entry whose raw key starts with `highest_level_prefix`.
/// The keys are first collected into an owned list and only deleted once the
/// iteration is over.
///
/// ## Return
/// `Ok(())` once all matching keys have been deleted; any iteration, decoding
/// or deletion error is propagated.
|
||||
fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> {
|
||||
// Owned copies of the keys to remove, filled during the scan below.
let mut to_delete = vec![];
|
||||
// Keys and values are read as raw bytes: only the keys are needed here.
let mut iter =
|
||||
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, highest_level_prefix)?;
|
||||
for el in iter.by_ref() {
|
||||
let (k, _) = el?;
|
||||
to_delete.push(
|
||||
// Decode the raw key and detach it from the iterator's buffer.
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k)
|
||||
.map_err(Error::Encoding)?
|
||||
.into_owned(),
|
||||
);
|
||||
}
|
||||
// The iterator is released before deleting — presumably because it borrows `txn`,
// which the deletions below need mutably (TODO confirm).
drop(iter);
|
||||
for k in to_delete {
|
||||
self.db.delete(txn, &k.as_ref())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Build an additional level for the field id.
|
||||
fn add_level(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
highest_level: u8,
|
||||
highest_level_prefix: &[u8],
|
||||
size_highest_level: usize,
|
||||
) -> Result<()> {
|
||||
let mut groups_iter = self
|
||||
.db
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &highest_level_prefix)?;
|
||||
.prefix_iter(txn, highest_level_prefix)?;
|
||||
|
||||
let nbr_new_groups = size_highest_level / self.group_size as usize;
|
||||
let nbr_leftover_elements = size_highest_level % self.group_size as usize;
|
||||
@ -398,7 +617,7 @@ impl FacetsUpdateIncrementalInner {
|
||||
for _ in 0..nbr_new_groups {
|
||||
let mut first_key = None;
|
||||
let mut values = RoaringBitmap::new();
|
||||
for _ in 0..group_size {
|
||||
for _ in 0..self.group_size {
|
||||
let (key_bytes, value_i) = groups_iter.next().unwrap()?;
|
||||
let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes)
|
||||
.map_err(Error::Encoding)?;
|
||||
@ -413,7 +632,7 @@ impl FacetsUpdateIncrementalInner {
|
||||
level: highest_level + 1,
|
||||
left_bound: first_key.unwrap().left_bound,
|
||||
};
|
||||
let value = FacetGroupValue { size: group_size, bitmap: values };
|
||||
let value = FacetGroupValue { size: self.group_size, bitmap: values };
|
||||
to_add.push((key.into_owned(), value));
|
||||
}
|
||||
// now we add the rest of the level, in case its size is > group_size * min_level_size
|
||||
@ -448,173 +667,6 @@ impl FacetsUpdateIncrementalInner {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete the given document id from the given facet value in the database, from level 0 to the
|
||||
/// given level.
|
||||
///
|
||||
/// ## Return
|
||||
/// Returns the effect of removing the document id from the database on the given `level`.
|
||||
///
|
||||
/// - `DeletionResult::InPlace` means that deleting the document id did not have
|
||||
/// an effect on the keys in that level.
|
||||
///
|
||||
/// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the
|
||||
/// number of keys in the level. For example, removing a document id from the facet value `3` could
|
||||
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
|
||||
/// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must
|
||||
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
|
||||
///
|
||||
/// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the
|
||||
/// bounds of the keys of the level. For example, removing a document id from the facet value
|
||||
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
|
||||
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
|
||||
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
|
||||
/// its left bound as well.
|
||||
fn delete_in_level(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<DeletionResult> {
|
||||
// Level 0 stores the facet values themselves and has its own routine.
if level == 0 {
|
||||
return self.delete_in_level_0(txn, field_id, facet_value, docids);
|
||||
}
|
||||
// The group entry of this level is read *before* the lower levels are modified.
// Despite its name, `bitmap` is the whole group value: it carries both a `size`
// and a `bitmap` field (see their uses below).
let (deletion_key, mut bitmap) =
|
||||
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
|
||||
|
||||
// Recurse first: the outcome on the level below decides what happens here.
let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?;
|
||||
|
||||
let mut decrease_size = false;
|
||||
let next_key = match result {
|
||||
DeletionResult::InPlace => {
|
||||
// No key changed below: only the docids of this group need updating.
// NOTE(review): `docids` is subtracted from the group bitmap unconditionally,
// without checking whether another entry of the group still holds them — confirm.
bitmap.bitmap -= docids;
|
||||
self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
|
||||
return Ok(DeletionResult::InPlace);
|
||||
}
|
||||
DeletionResult::Reduce { next } => next,
|
||||
DeletionResult::Remove { next } => {
|
||||
decrease_size = true;
|
||||
next
|
||||
}
|
||||
};
|
||||
// If either DeletionResult::Reduce or DeletionResult::Remove was returned,
|
||||
// then we may need to adjust the left_bound of the deletion key.
|
||||
|
||||
// If DeletionResult::Remove was returned, then we need to decrease the group
|
||||
// size of the deletion key.
|
||||
let mut updated_value = bitmap;
|
||||
if decrease_size {
|
||||
updated_value.size -= 1;
|
||||
}
|
||||
|
||||
if updated_value.size == 0 {
|
||||
// The group lost its last child: drop the key and let the parent know.
self.db.delete(txn, &deletion_key.as_ref())?;
|
||||
Ok(DeletionResult::Remove { next: next_key })
|
||||
} else {
|
||||
let mut updated_deletion_key = deletion_key.clone();
|
||||
// The group's left bound only moves when the affected facet value was that bound.
let reduced_range = facet_value == deletion_key.left_bound;
|
||||
if reduced_range {
|
||||
// NOTE(review): panics if `next_key` is `None` — presumably a non-empty group
// always has a following key in that case; confirm.
updated_deletion_key.left_bound = next_key.clone().unwrap();
|
||||
}
|
||||
updated_value.bitmap -= docids;
|
||||
// The entry is always deleted and rewritten, even when its key did not change.
let _ = self.db.delete(txn, &deletion_key.as_ref())?;
|
||||
self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
|
||||
if reduced_range {
|
||||
Ok(DeletionResult::Reduce { next: next_key })
|
||||
} else {
|
||||
Ok(DeletionResult::InPlace)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove `docids` from the level-0 entry of `facet_value` for the given field.
///
/// ## Return
/// - `DeletionResult::Remove { next }` when the entry has no document id left and
/// was deleted. `next` is the left bound of the following level-0 key of the same
/// field, or `None` when there is none.
/// - `DeletionResult::InPlace` when the entry still holds document ids and was
/// rewritten with the reduced bitmap.
fn delete_in_level_0(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<DeletionResult> {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
|
||||
// Panics if the key is absent; the public `delete` entry point checks for the
// key's existence before getting here.
let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap;
|
||||
bitmap -= docids;
|
||||
|
||||
if bitmap.is_empty() {
|
||||
let mut next_key = None;
|
||||
// Look up the key that follows this one, ignoring its value.
if let Some((next, _)) =
|
||||
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(txn, &key)?
|
||||
{
|
||||
// Only a level-0 key of the same field counts as a successor.
if next.field_id == field_id && next.level == 0 {
|
||||
next_key = Some(next.left_bound.to_vec());
|
||||
}
|
||||
}
|
||||
self.db.delete(txn, &key)?;
|
||||
Ok(DeletionResult::Remove { next: next_key })
|
||||
} else {
|
||||
// Level-0 entries are written back with a group size of 1.
self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
|
||||
Ok(DeletionResult::InPlace)
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove `docids` from `facet_value` of the given field, across all levels.
///
/// Does nothing when the facet value has no level-0 entry. After the removal, if a
/// key was removed from the highest level and that level (other than level 0) now
/// holds fewer than `min_level_size` entries, the whole level is deleted.
pub fn delete(
|
||||
&self,
|
||||
txn: &mut RwTxn,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
docids: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
// Nothing to do when the facet value is not in the database; this is also what
// makes the `unwrap` in `delete_in_level_0` safe.
if self
|
||||
.db
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })?
|
||||
.is_none()
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
let highest_level = get_highest_level(txn, self.db, field_id)?;
|
||||
|
||||
let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?;
|
||||
// Only the removal of a key from the highest level can shrink that level.
match result {
|
||||
DeletionResult::InPlace => return Ok(()),
|
||||
DeletionResult::Reduce { .. } => return Ok(()),
|
||||
DeletionResult::Remove { .. } => {}
|
||||
}
|
||||
|
||||
// if we removed a key from the highest level, its size may have fallen
|
||||
// below `min_level_size`, in which case we need to remove the entire level
|
||||
|
||||
// Raw key prefix of the highest level: big-endian field id followed by the level byte.
let mut highest_level_prefix = vec![];
|
||||
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
highest_level_prefix.push(highest_level);
|
||||
|
||||
// Level 0 is never removed, and a level that is still large enough is kept.
if highest_level == 0
|
||||
|| self
|
||||
.db
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(txn, &highest_level_prefix)?
|
||||
.count()
|
||||
>= self.min_level_size as usize
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
// Collect owned copies of every key of the level, then delete them once the
// iterator has been dropped.
let mut to_delete = vec![];
|
||||
let mut iter =
|
||||
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?;
|
||||
for el in iter.by_ref() {
|
||||
let (k, _) = el?;
|
||||
to_delete.push(
|
||||
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k)
|
||||
.map_err(Error::Encoding)?
|
||||
.into_owned(),
|
||||
);
|
||||
}
|
||||
drop(iter);
|
||||
for k in to_delete {
|
||||
self.db.delete(txn, &k.as_ref())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FacetGroupKey<&'a [u8]> {
|
||||
|
@ -79,12 +79,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use grenad::{CompressionType, SortAlgorithm};
|
||||
use heed::types::{Bytes, DecodeIgnore, SerdeJson};
|
||||
use heed::BytesEncode;
|
||||
use grenad::Merger;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
@ -93,9 +90,9 @@ use super::FacetsUpdateBulk;
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::index_documents::create_sorter;
|
||||
use crate::update::merge_btreeset_string;
|
||||
use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod incremental;
|
||||
@ -108,16 +105,20 @@ pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
data_size: u64,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: grenad::Reader<BufReader<File>>,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
data_size: u64,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => {
|
||||
@ -135,18 +136,20 @@ impl<'i> FacetsUpdate<'i> {
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data,
|
||||
normalized_delta_data,
|
||||
data_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||
if self.delta_data.is_empty() {
|
||||
if self.data_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
|
||||
if self.data_size >= (self.database.len(wtxn)? / 500) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
@ -170,96 +173,110 @@ impl<'i> FacetsUpdate<'i> {
|
||||
incremental_update.execute(wtxn)?;
|
||||
}
|
||||
|
||||
// We clear the list of normalized-for-search facets
|
||||
// and the previous FSTs to compute everything from scratch
|
||||
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
|
||||
self.index.facet_id_string_fst.clear(wtxn)?;
|
||||
|
||||
// As we can't use the same write transaction to read and write in two different databases
|
||||
// we must create a temporary sorter that we will write into LMDB afterward.
|
||||
// As multiple unnormalized facet values can become the same normalized facet value
|
||||
// we must merge them together.
|
||||
let mut sorter = create_sorter(
|
||||
SortAlgorithm::Unstable,
|
||||
merge_btreeset_string,
|
||||
CompressionType::None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
// We iterate on the list of original, semi-normalized, facet values
|
||||
// and normalize them for search, inserting them in LMDB in any given order.
|
||||
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
|
||||
for result in database.iter(wtxn)? {
|
||||
let (facet_group_key, ()) = result?;
|
||||
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
|
||||
let mut normalized_facet = left_bound.normalize(&options);
|
||||
let normalized_truncated_facet: String;
|
||||
if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
|
||||
normalized_truncated_facet = normalized_facet
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
normalized_facet = normalized_truncated_facet.into();
|
||||
}
|
||||
let set = BTreeSet::from_iter(std::iter::once(left_bound));
|
||||
let key = (field_id, normalized_facet.as_ref());
|
||||
let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
sorter.insert(key, val)?;
|
||||
}
|
||||
match self.normalized_delta_data {
|
||||
Some(data) => index_facet_search(wtxn, data, self.index),
|
||||
None => Ok(()),
|
||||
}
|
||||
|
||||
// In this loop we don't need to take care of merging bitmaps
|
||||
// as the grenad sorter already merged them for us.
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
|
||||
self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
|
||||
wtxn,
|
||||
key_bytes,
|
||||
btreeset_bytes,
|
||||
)?;
|
||||
}
|
||||
|
||||
// We compute one FST by string facet
|
||||
let mut text_fsts = vec![];
|
||||
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||
let database =
|
||||
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||
for result in database.iter(wtxn)? {
|
||||
let ((field_id, normalized_facet), _) = result?;
|
||||
current_fst = match current_fst.take() {
|
||||
Some((fid, fst_builder)) if fid != field_id => {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((fid, fst));
|
||||
Some((field_id, fst::SetBuilder::memory()))
|
||||
}
|
||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||
None => Some((field_id, fst::SetBuilder::memory())),
|
||||
};
|
||||
|
||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||
fst_builder.insert(normalized_facet)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((field_id, fst_builder)) = current_fst {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((field_id, fst));
|
||||
}
|
||||
|
||||
// We write those FSTs in LMDB now
|
||||
for (field_id, fst) in text_fsts {
|
||||
self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the normalized facet-string delta to `facet_id_normalized_string_strings`
/// and rebuild the per-field FSTs stored in `facet_id_string_fst`.
///
/// For each key of `normalized_delta_data`, the stored set of strings is updated as
/// `(stored - deletions) + additions`, where a deletion is only honoured if the
/// string no longer has a level-0 entry in `facet_id_string_docids`. A key whose set
/// becomes empty is deleted. Afterwards `facet_id_string_fst` is cleared and one FST
/// per field id is rebuilt from the keys of `facet_id_normalized_string_strings`.
fn index_facet_search(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
index: &Index,
|
||||
) -> Result<()> {
|
||||
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, delta_bytes)) = iter.next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(delta_bytes);
|
||||
|
||||
// Set currently stored under this key; empty when the key is new.
let database_set = index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.get(wtxn, key_bytes)?
|
||||
.unwrap_or_default();
|
||||
|
||||
// NOTE(review): a JSON decoding failure is swallowed by `.ok()` and treated
// as "nothing to add" — confirm this is intended.
let add_set = deladd_reader
|
||||
.get(DelAdd::Addition)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
// Same silent handling of decoding failures as for the additions above.
let del_set = match deladd_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
{
|
||||
Some(del_set) => {
|
||||
// The field id is read from the leading bytes of the key, big-endian;
// panics if the key is too short to contain one.
let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
let mut set = BTreeSet::new();
|
||||
for facet in del_set {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
|
||||
// Check if the referenced value doesn't exist anymore before deleting it.
|
||||
if index
|
||||
.facet_id_string_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(wtxn, &key)?
|
||||
.is_none()
|
||||
{
|
||||
set.insert(facet);
|
||||
}
|
||||
}
|
||||
set
|
||||
}
|
||||
None => BTreeSet::new(),
|
||||
};
|
||||
|
||||
// New content of the entry: what was stored, minus confirmed deletions, plus additions.
let set: BTreeSet<_> =
|
||||
database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();
|
||||
|
||||
if set.is_empty() {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.delete(wtxn, key_bytes)?;
|
||||
} else {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.put(wtxn, key_bytes, &set)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We clear the FST of normalized-for-search to compute everything from scratch.
|
||||
index.facet_id_string_fst.clear(wtxn)?;
|
||||
// We compute one FST by string facet
|
||||
let mut text_fsts = vec![];
|
||||
// Builder of the field currently being scanned, paired with its field id.
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||
let database = index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||
// NOTE(review): this assumes the iteration yields all keys of a field id
// consecutively and in an order the FST builder accepts — presumably guaranteed
// by the database's key ordering; confirm.
for result in database.iter(wtxn)? {
|
||||
let ((field_id, normalized_facet), _) = result?;
|
||||
current_fst = match current_fst.take() {
|
||||
// The field id changed: finish the previous FST and start a new builder.
Some((fid, fst_builder)) if fid != field_id => {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((fid, fst));
|
||||
Some((field_id, fst::SetBuilder::memory()))
|
||||
}
|
||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||
None => Some((field_id, fst::SetBuilder::memory())),
|
||||
};
|
||||
|
||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||
fst_builder.insert(normalized_facet)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Finish the builder of the last field, if any key was seen at all.
if let Some((field_id, fst_builder)) = current_fst {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((field_id, fst));
|
||||
}
|
||||
|
||||
// We write those FSTs in LMDB now
|
||||
for (field_id, fst) in text_fsts {
|
||||
index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_helpers {
|
||||
use std::cell::Cell;
|
||||
@ -268,6 +285,7 @@ pub(crate) mod test_helpers {
|
||||
use std::marker::PhantomData;
|
||||
use std::rc::Rc;
|
||||
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
@ -280,7 +298,8 @@ pub(crate) mod test_helpers {
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
|
||||
use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
/// Utility function to generate a string whose position in a lexicographically
|
||||
@ -410,7 +429,8 @@ pub(crate) mod test_helpers {
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.insert(wtxn, field_id, &key_bytes, docids).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
pub fn delete_single_docid<'a>(
|
||||
&self,
|
||||
@ -436,7 +456,8 @@ pub(crate) mod test_helpers {
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.delete(wtxn, field_id, &key_bytes, docids).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
|
||||
pub fn bulk_insert<'a, 'b>(
|
||||
@ -463,10 +484,13 @@ pub(crate) mod test_helpers {
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
builder.push(reader.into_cursor().unwrap());
|
||||
let merger = builder.build();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
delta_data: Some(reader),
|
||||
delta_data: Some(merger),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
@ -26,7 +26,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
@ -181,11 +181,11 @@ fn searchable_fields_changed(
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<&[u8]>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
|
@ -1,15 +1,21 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use heed::types::SerdeJson;
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
|
||||
use crate::{FieldId, Result};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
};
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
@ -19,10 +25,11 @@ use crate::{FieldId, Result};
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let options = NormalizerOption { lossy: true, ..Default::default() };
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
@ -30,12 +37,30 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some()
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
@ -44,17 +69,46 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let mut hyper_normalized_value = normalized_value.normalize(&options);
|
||||
let normalized_truncated_facet: String;
|
||||
if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH {
|
||||
normalized_truncated_facet = hyper_normalized_value
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect();
|
||||
hyper_normalized_value = normalized_truncated_facet.into();
|
||||
}
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
@ -257,6 +257,7 @@ fn push_vectors_diff(
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
@ -332,16 +333,15 @@ fn extract_vectors(
|
||||
}
|
||||
}
|
||||
|
||||
#[logging_timer::time]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?;
|
||||
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parellelism
|
||||
puffin::profile_function!();
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
@ -375,11 +375,8 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = rt
|
||||
.block_on(
|
||||
embedder
|
||||
.embed_chunks(std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks))),
|
||||
)
|
||||
let chunked_embeds = embedder
|
||||
.embed_chunks(std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)))
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
@ -396,8 +393,8 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = rt
|
||||
.block_on(embedder.embed_chunks(std::mem::take(&mut chunks)))
|
||||
let chunked_embeds = embedder
|
||||
.embed_chunks(std::mem::take(&mut chunks))
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
@ -410,13 +407,15 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = rt
|
||||
.block_on(embedder.embed(std::mem::take(&mut current_chunk)))
|
||||
let embeds = embedder
|
||||
.embed_chunks(vec![std::mem::take(&mut current_chunk)])
|
||||
.map_err(crate::vector::Error::from)
|
||||
.map_err(crate::Error::from)?;
|
||||
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
if let Some(embeds) = embeds.first() {
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings.as_inner()))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -15,7 +15,6 @@ use std::io::BufReader;
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
use tracing::debug;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
@ -29,10 +28,7 @@ use self::extract_vector_points::{
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{
|
||||
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
|
||||
MergeFn, MergeableReader,
|
||||
};
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
@ -52,7 +48,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
stop_words: Option<fst::Set<&[u8]>>,
|
||||
stop_words: Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
@ -62,227 +58,170 @@ pub(crate) fn data_from_obkv_documents(
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()?;
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (
|
||||
docid_word_positions_chunks,
|
||||
(
|
||||
fid_docid_facet_numbers_chunks,
|
||||
(
|
||||
fid_docid_facet_strings_chunks,
|
||||
(
|
||||
facet_is_null_docids_chunks,
|
||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||
),
|
||||
),
|
||||
),
|
||||
) = result?;
|
||||
|
||||
// merge facet_exists_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!(database = "facet-id-exists-docids", "merge");
|
||||
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// merge facet_is_null_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!(database = "facet-id-is-null-docids", "merge");
|
||||
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// merge facet_is_empty_docids and send them as a typed chunk
|
||||
{
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
debug!(database = "facet-id-is-empty-docids", "merge");
|
||||
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
|
||||
Ok(reader) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>,
|
||||
>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.map(|result| {
|
||||
if let Ok((
|
||||
ref docid_word_positions_chunk,
|
||||
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
|
||||
)) = result
|
||||
{
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
let exact_attributes = exact_attributes.clone();
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| {
|
||||
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
|
||||
},
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
"word-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
docid_word_positions_chunks.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::WordPositionDocids,
|
||||
"word-position-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_strings_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
"field-id-facet-string-docids",
|
||||
);
|
||||
|
||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
|
||||
fid_docid_facet_numbers_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx,
|
||||
extract_facet_number_docids,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docidsdexing::details, ",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn spawn_extraction_task<FE, FS, M>(
|
||||
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
merge_fn: MergeFn,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
|
||||
M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
|
||||
M::Output: Send,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span =
|
||||
tracing::trace_span!(target: "", parent: &current_span, "extract_multiple_chunks");
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
puffin::profile_scope!("extract_multiple_chunksdexing::details, ", name);
|
||||
let chunks: Result<M> =
|
||||
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || match chunks {
|
||||
Ok(chunks) => {
|
||||
let child_span = tracing::trace_span!(target: "", parent: &current_span, "merge_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
debug!(database = name, "merge");
|
||||
puffin::profile_scope!("merge_multiple_chunks", name);
|
||||
let reader = chunks.merge(merge_fn, &indexer);
|
||||
let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
match extract_fn(chunk, indexer) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
})
|
||||
});
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
@ -340,7 +279,7 @@ fn send_original_documents_data(
|
||||
});
|
||||
|
||||
// TODO: create a custom internal error
|
||||
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -360,22 +299,13 @@ fn send_and_extract_flattened_documents_data(
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
stop_words: &Option<fst::Set<&[u8]>>,
|
||||
stop_words: &Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: &Option<&[&str]>,
|
||||
dictionary: &Option<&[&str]>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
),
|
||||
),
|
||||
),
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -446,16 +376,17 @@ fn send_and_extract_flattened_documents_data(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
Ok((
|
||||
fid_docid_facet_numbers_chunk,
|
||||
(
|
||||
fid_docid_facet_strings_chunk,
|
||||
(
|
||||
fid_facet_is_null_docids_chunk,
|
||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||
),
|
||||
),
|
||||
))
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
|
@ -9,6 +9,10 @@ use super::{ClonableMmap, MergeFn};
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
/// This is something reasonable given the fact
|
||||
/// that there is one grenad sorter by thread.
|
||||
const MAX_GRENAD_SORTER_USAGE: usize = 500 * 1024 * 1024; // 500 MiB
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
pub fn create_writer<R: io::Write>(
|
||||
@ -24,6 +28,9 @@ pub fn create_writer<R: io::Write>(
|
||||
builder.build(BufWriter::new(file))
|
||||
}
|
||||
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MergeFn,
|
||||
@ -41,7 +48,7 @@ pub fn create_sorter(
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory);
|
||||
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
@ -83,90 +90,6 @@ pub unsafe fn as_cloneable_grenad(
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
pub trait MergeableReader
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
type Output;
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
|
||||
type Output = grenad::Reader<BufReader<File>>;
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut merger = MergerBuilder::new(merge_fn);
|
||||
self.into_iter().try_for_each(|r| merger.push(r))?;
|
||||
merger.finish(params)
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
let mut m2 = MergerBuilder::new(merge_fn);
|
||||
for (r1, r2) in self.into_iter() {
|
||||
m1.push(r1)?;
|
||||
m2.push(r2)?;
|
||||
}
|
||||
Ok((m1.finish(params)?, m2.finish(params)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeableReader
|
||||
for Vec<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)>
|
||||
{
|
||||
type Output = (
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
);
|
||||
|
||||
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||
let mut m1 = MergerBuilder::new(merge_fn);
|
||||
let mut m2 = MergerBuilder::new(merge_fn);
|
||||
let mut m3 = MergerBuilder::new(merge_fn);
|
||||
for (r1, r2, r3) in self.into_iter() {
|
||||
m1.push(r1)?;
|
||||
m2.push(r2)?;
|
||||
m3.push(r3)?;
|
||||
}
|
||||
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
|
||||
}
|
||||
}
|
||||
|
||||
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||
|
||||
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||
fn new(merge_fn: MergeFn) -> Self {
|
||||
Self(grenad::MergerBuilder::new(merge_fn))
|
||||
}
|
||||
|
||||
fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
|
||||
self.0.push(reader.into_cursor()?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let merger = self.0.build();
|
||||
let mut writer = create_writer(
|
||||
params.chunk_compression_type,
|
||||
params.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
merger.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
@ -188,10 +111,15 @@ impl Default for GrenadParameters {
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function use the number of threads in the current threadpool to compute the value.
|
||||
///
|
||||
/// This should be called inside of a rayon thread pool,
|
||||
/// Otherwise, it will take the global number of threads.
|
||||
/// otherwise, it will take the global number of threads.
|
||||
///
|
||||
/// The max memory cannot exceed a given reasonable value.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads())
|
||||
self.max_memory.map(|max_memory| {
|
||||
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -35,27 +35,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// TODO improve the perf by using a `#[borrow] Cow<str>`.
|
||||
let strings: BTreeSet<String> = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(serde_json::from_slice::<BTreeSet<String>>)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|mut current, new| {
|
||||
for x in new {
|
||||
current.insert(x);
|
||||
}
|
||||
current
|
||||
})
|
||||
.unwrap();
|
||||
Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
@ -243,3 +222,40 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Do a union of BtreeSet on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_btreeset_string<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
@ -10,10 +10,10 @@ use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||
GrenadParameters, MergeableReader,
|
||||
GrenadParameters,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, MergeFn,
|
||||
|
@ -5,29 +5,29 @@ mod transform;
|
||||
mod typed_chunk;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Cursor, Read, Seek};
|
||||
use std::io::{Read, Seek};
|
||||
use std::iter::FromIterator;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Str;
|
||||
use heed::Database;
|
||||
use rand::SeedableRng;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use slice_group_by::GroupBy;
|
||||
use tracing::{debug_span};
|
||||
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
|
||||
use tracing::debug;
|
||||
use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
|
||||
|
||||
use self::enrich::enrich_documents_batch;
|
||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
|
||||
ClonableMmap, MergeFn,
|
||||
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
|
||||
valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
|
||||
};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
@ -95,8 +95,8 @@ pub struct IndexDocumentsConfig {
|
||||
|
||||
impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
FA: Fn() -> bool + Sync,
|
||||
FP: Fn(UpdateIndexingStep) + Sync + Send,
|
||||
FA: Fn() -> bool + Sync + Send,
|
||||
{
|
||||
pub fn new(
|
||||
wtxn: &'t mut heed::RwTxn<'i>,
|
||||
@ -284,7 +284,7 @@ where
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip_all,
|
||||
target = "profile::indexing::details",
|
||||
target = "indexing::details",
|
||||
name = "index_documents_raw"
|
||||
)]
|
||||
pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
|
||||
@ -326,9 +326,6 @@ where
|
||||
}
|
||||
};
|
||||
|
||||
let original_documents = grenad::Reader::new(original_documents)?;
|
||||
let flattened_documents = grenad::Reader::new(flattened_documents)?;
|
||||
|
||||
// create LMDB writer channel
|
||||
let (lmdb_writer_sx, lmdb_writer_rx): (
|
||||
Sender<Result<TypedChunk>>,
|
||||
@ -367,11 +364,7 @@ where
|
||||
|
||||
let stop_words = self.index.stop_words(self.wtxn)?;
|
||||
let separators = self.index.allowed_separators(self.wtxn)?;
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let dictionary = self.index.dictionary(self.wtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
|
||||
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
|
||||
|
||||
@ -381,141 +374,204 @@ where
|
||||
max_memory: self.indexer_config.max_memory,
|
||||
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
|
||||
};
|
||||
let documents_chunk_size =
|
||||
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
|
||||
let documents_chunk_size = match self.indexer_config.documents_chunk_size {
|
||||
Some(chunk_size) => chunk_size,
|
||||
None => {
|
||||
let default_chunk_size = 1024 * 1024 * 4; // 4MiB
|
||||
let min_chunk_size = 1024 * 512; // 512KiB
|
||||
|
||||
// compute the chunk size from the number of available threads and the inputed data size.
|
||||
let total_size = flattened_documents.metadata().map(|m| m.len());
|
||||
let current_num_threads = pool.current_num_threads();
|
||||
// if we have more than 2 thread, create a number of chunk equal to 3/4 threads count
|
||||
let chunk_count = if current_num_threads > 2 {
|
||||
(current_num_threads * 3 / 4).max(2)
|
||||
} else {
|
||||
current_num_threads
|
||||
};
|
||||
total_size
|
||||
.map_or(default_chunk_size, |size| (size as usize) / chunk_count)
|
||||
.max(min_chunk_size)
|
||||
}
|
||||
};
|
||||
|
||||
let original_documents = grenad::Reader::new(original_documents)?;
|
||||
let flattened_documents = grenad::Reader::new(flattened_documents)?;
|
||||
|
||||
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
|
||||
|
||||
let cloned_embedder = self.embedders.clone();
|
||||
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
let mut databases_seen = 0;
|
||||
let mut word_position_docids = None;
|
||||
let mut word_fid_docids = None;
|
||||
let mut word_docids = None;
|
||||
let mut exact_word_docids = None;
|
||||
let mut chunk_accumulator = ChunkAccumulator::default();
|
||||
let mut dimension = HashMap::new();
|
||||
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
|
||||
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
// Run extraction pipeline in parallel.
|
||||
pool.install(|| {
|
||||
let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::details", parent: ¤t_span, "extract_and_send_grenad_chunks");
|
||||
let _enter = child_span.enter();
|
||||
puffin::profile_scope!("extract_and_send_grenad_chunks");
|
||||
// split obkv file into several chunks
|
||||
let original_chunk_iter =
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||
// split obkv file into several chunks
|
||||
let original_chunk_iter =
|
||||
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
|
||||
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
// split obkv file into several chunks
|
||||
let flattened_chunk_iter =
|
||||
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
|
||||
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
// extract all databases from the chunked obkv douments
|
||||
extract::data_from_obkv_documents(
|
||||
original_chunk,
|
||||
flattened_chunk,
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
let result = original_chunk_iter.and_then(|original_chunk| {
|
||||
let flattened_chunk = flattened_chunk_iter?;
|
||||
// extract all databases from the chunked obkv douments
|
||||
extract::data_from_obkv_documents(
|
||||
original_chunk,
|
||||
flattened_chunk,
|
||||
pool_params,
|
||||
lmdb_writer_sx.clone(),
|
||||
searchable_fields,
|
||||
faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
field_id_map,
|
||||
stop_words,
|
||||
separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
max_positions_per_attributes,
|
||||
exact_attributes,
|
||||
proximity_precision,
|
||||
cloned_embedder,
|
||||
)
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
|
||||
// needs to be dropped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx);
|
||||
});
|
||||
|
||||
if let Err(e) = result {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
// needs to be dropped to avoid channel waiting lock.
|
||||
drop(lmdb_writer_sx);
|
||||
});
|
||||
loop {
|
||||
if (self.should_abort)() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
|
||||
let mut final_documents_ids = RoaringBitmap::new();
|
||||
match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
|
||||
Err(status) => {
|
||||
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
|
||||
if !docids.is_empty() {
|
||||
final_documents_ids |= docids;
|
||||
let documents_seen_count = final_documents_ids.len();
|
||||
(self.progress)(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: documents_seen_count as usize,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
debug!(documents = documents_seen_count, total = documents_count, "Seen");
|
||||
}
|
||||
if is_merged_database {
|
||||
databases_seen += 1;
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
}
|
||||
// If no more chunk remains in the chunk accumulator and the channel is disconected, break.
|
||||
} else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
|
||||
break;
|
||||
} else {
|
||||
rayon::yield_now();
|
||||
}
|
||||
}
|
||||
Ok(result) => {
|
||||
let typed_chunk = match result? {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
let word_docids = word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||
let exact_word_docids =
|
||||
exact_word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(
|
||||
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||
)
|
||||
});
|
||||
exact_word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
});
|
||||
word_fid_docids.push(cloneable_chunk.into_cursor()?);
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
}
|
||||
TypedChunk::WordPositionDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
let word_position_docids =
|
||||
word_position_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(
|
||||
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||
)
|
||||
});
|
||||
word_position_docids.push(cloneable_chunk.into_cursor()?);
|
||||
TypedChunk::WordPositionDocids(chunk)
|
||||
}
|
||||
TypedChunk::VectorPoints {
|
||||
expected_dimension,
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
} => {
|
||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||
TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
}
|
||||
}
|
||||
otherwise => otherwise,
|
||||
};
|
||||
|
||||
let mut databases_seen = 0;
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
|
||||
let mut word_position_docids = None;
|
||||
let mut word_fid_docids = None;
|
||||
let mut word_docids = None;
|
||||
let mut exact_word_docids = None;
|
||||
|
||||
let mut dimension = HashMap::new();
|
||||
|
||||
for result in lmdb_writer_rx {
|
||||
if (self.should_abort)() {
|
||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||
}
|
||||
|
||||
let typed_chunk = match result? {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
word_docids = Some(cloneable_chunk);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||
exact_word_docids = Some(cloneable_chunk);
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
word_fid_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
chunk_accumulator.insert(typed_chunk);
|
||||
}
|
||||
}
|
||||
TypedChunk::WordPositionDocids(chunk) => {
|
||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||
word_position_docids = Some(cloneable_chunk);
|
||||
TypedChunk::WordPositionDocids(chunk)
|
||||
}
|
||||
TypedChunk::VectorPoints {
|
||||
expected_dimension,
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
} => {
|
||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||
TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
}
|
||||
}
|
||||
otherwise => otherwise,
|
||||
};
|
||||
}
|
||||
|
||||
let (docids, is_merged_database) =
|
||||
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
|
||||
if !docids.is_empty() {
|
||||
final_documents_ids |= docids;
|
||||
let documents_seen_count = final_documents_ids.len();
|
||||
(self.progress)(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: documents_seen_count as usize,
|
||||
total_documents: documents_count,
|
||||
});
|
||||
debug_span!("Seen", documents = documents_seen_count, total = documents_count);
|
||||
}
|
||||
if is_merged_database {
|
||||
databases_seen += 1;
|
||||
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
|
||||
databases_seen,
|
||||
total_databases: TOTAL_POSTING_DATABASE_COUNT,
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// We write the field distribution into the main database
|
||||
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
|
||||
@ -548,10 +604,10 @@ where
|
||||
}
|
||||
|
||||
self.execute_prefix_databases(
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
word_docids.map(MergerBuilder::build),
|
||||
exact_word_docids.map(MergerBuilder::build),
|
||||
word_position_docids.map(MergerBuilder::build),
|
||||
word_fid_docids.map(MergerBuilder::build),
|
||||
)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
@ -565,10 +621,10 @@ where
|
||||
)]
|
||||
pub fn execute_prefix_databases(
|
||||
self,
|
||||
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
|
||||
word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -751,7 +807,7 @@ where
|
||||
)]
|
||||
fn execute_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn,
|
||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
indexer_config: &IndexerConfig,
|
||||
@ -761,13 +817,12 @@ fn execute_word_prefix_docids(
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
let cursor = reader.into_cursor()?;
|
||||
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
|
||||
builder.chunk_compression_type = indexer_config.chunk_compression_type;
|
||||
builder.chunk_compression_level = indexer_config.chunk_compression_level;
|
||||
builder.max_nb_chunks = indexer_config.max_nb_chunks;
|
||||
builder.max_memory = indexer_config.max_memory;
|
||||
builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
|
||||
builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -5,27 +5,64 @@ use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use charabia::{Language, Script};
|
||||
use grenad::MergerBuilder;
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{PutFlags, RwTxn};
|
||||
use heed::RwTxn;
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
|
||||
valid_lmdb_key, CursorClonableMmap,
|
||||
self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
|
||||
CursorClonableMmap,
|
||||
};
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use super::MergeFn;
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
|
||||
};
|
||||
use crate::{
|
||||
lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
|
||||
};
|
||||
|
||||
/// This struct accumulates and group the TypedChunks
|
||||
/// and is able to give the biggest accumulated group to index them all together
|
||||
/// with a merger.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ChunkAccumulator {
|
||||
inner: Vec<Vec<TypedChunk>>,
|
||||
}
|
||||
|
||||
impl ChunkAccumulator {
|
||||
pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
|
||||
match self.inner.iter().max_by_key(|v| v.len()) {
|
||||
Some(left) => {
|
||||
let position = self.inner.iter().position(|right| left.len() == right.len());
|
||||
position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, chunk: TypedChunk) {
|
||||
match self
|
||||
.inner
|
||||
.iter()
|
||||
.position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right)))
|
||||
{
|
||||
Some(position) => {
|
||||
let v = self.inner.get_mut(position).unwrap();
|
||||
v.push(chunk);
|
||||
}
|
||||
None => self.inner.push(vec![chunk]),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
@ -38,7 +75,7 @@ pub(crate) enum TypedChunk {
|
||||
},
|
||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||
@ -54,6 +91,33 @@ pub(crate) enum TypedChunk {
|
||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
fn mergeable_with(&self, other: &Self) -> bool {
|
||||
use TypedChunk::*;
|
||||
match (self, other) {
|
||||
(FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
|
||||
| (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
|
||||
| (Documents(_), Documents(_))
|
||||
| (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
|
||||
| (WordDocids { .. }, WordDocids { .. })
|
||||
| (WordPositionDocids(_), WordPositionDocids(_))
|
||||
| (WordPairProximityDocids(_), WordPairProximityDocids(_))
|
||||
| (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
|
||||
| (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
|
||||
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
||||
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
||||
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
||||
| (GeoPoints(_), GeoPoints(_))
|
||||
| (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
|
||||
(
|
||||
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
||||
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
||||
) => left == right && left_dim == right_dim,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
pub fn to_debug_string(&self) -> String {
|
||||
match self {
|
||||
@ -85,7 +149,7 @@ impl TypedChunk {
|
||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(grenad) => {
|
||||
TypedChunk::FieldIdFacetStringDocids((grenad, _)) => {
|
||||
format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len())
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(grenad) => {
|
||||
@ -117,23 +181,32 @@ impl TypedChunk {
|
||||
/// Return new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
typed_chunk: TypedChunk,
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
index: &Index,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
puffin::profile_function!(typed_chunk.to_debug_string());
|
||||
puffin::profile_function!(typed_chunks[0].to_debug_string());
|
||||
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunk {
|
||||
TypedChunk::Documents(obkv_documents_iter) => {
|
||||
match typed_chunks[0] {
|
||||
TypedChunk::Documents(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut cursor = obkv_documents_iter.into_cursor()?;
|
||||
while let Some((key, reader)) = cursor.move_on_next()? {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: KvReader<FieldId> = KvReader::new(reader);
|
||||
|
||||
@ -174,59 +247,91 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
external_documents_docids.apply(wtxn, operations)?;
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
|
||||
TypedChunk::FieldIdWordCountDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
fid_word_count_docids_iter,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} => {
|
||||
TypedChunk::WordDocids { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||
let _entered = span.enter();
|
||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
word_docids_iter.clone(),
|
||||
|
||||
let mut word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut exact_word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut word_fid_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
let clonable_exact_word_docids =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_docids_merger,
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
exact_word_docids_iter.clone(),
|
||||
let exact_word_docids_merger = exact_word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
exact_word_docids_merger,
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||
append_entries_into_database(
|
||||
word_fid_docids_iter,
|
||||
let word_fid_docids_merger = word_fid_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_fid_docids_merger,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
|
||||
let fst_merger = fst_merger_builder.build();
|
||||
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
@ -237,98 +342,202 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(word_position_docids_iter) => {
|
||||
TypedChunk::WordPositionDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
word_position_docids_iter,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
||||
TypedChunk::FieldIdFacetNumberDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||
let _entered = span.enter();
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_number_docids.len();
|
||||
builder.push(facet_id_number_docids.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
|
||||
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||
let _entered = span.enter();
|
||||
let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
|
||||
|
||||
let mut facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut normalized_facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetStringDocids((
|
||||
facet_id_string_docids,
|
||||
normalized_facet_id_string_docids,
|
||||
)) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_string_docids.len();
|
||||
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
|
||||
normalized_facet_id_string_builder
|
||||
.push(normalized_facet_id_string_docids.into_cursor()?);
|
||||
}
|
||||
let facet_id_string_merger = facet_id_string_builder.build();
|
||||
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(
|
||||
index,
|
||||
FacetType::String,
|
||||
facet_id_string_merger,
|
||||
Some(normalized_facet_id_string_merger),
|
||||
data_size,
|
||||
);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
|
||||
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
facet_id_exists_docids,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
|
||||
TypedChunk::FieldIdFacetIsNullDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
facet_id_is_null_docids,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
|
||||
let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
facet_id_is_empty_docids,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
|
||||
TypedChunk::WordPairProximityDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
append_entries_into_database(
|
||||
word_pair_proximity_docids_iter,
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
index_is_empty,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => {
|
||||
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_numbers =
|
||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
@ -344,14 +553,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => {
|
||||
TypedChunk::FieldIdDocidFacetStrings(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_strings =
|
||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||
let mut cursor = fid_docid_facet_string.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
@ -367,14 +587,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::GeoPoints(geo_points) => {
|
||||
TypedChunk::GeoPoints(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||
|
||||
let mut cursor = geo_points.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
@ -393,15 +624,38 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
} => {
|
||||
TypedChunk::VectorPoints { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut params = None;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
params = Some((expected_dimension, embedder_name));
|
||||
|
||||
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||
if let Some(embeddings) = embeddings {
|
||||
embeddings_builder.push(embeddings.into_cursor()?);
|
||||
}
|
||||
}
|
||||
|
||||
// typed chunks has always at least 1 chunk.
|
||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
)?;
|
||||
@ -419,8 +673,9 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let writers = writers?;
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let mut cursor = remove_vectors.into_cursor()?;
|
||||
while let Some((key, _)) = cursor.move_on_next()? {
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
for writer in &writers {
|
||||
@ -432,40 +687,39 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
if let Some(embeddings) = embeddings {
|
||||
let mut cursor = embeddings.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings =
|
||||
crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
let merger = embeddings_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(wtxn, std::iter::once(docid))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={docid}")
|
||||
};
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
external_docid,
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||
writer.add_item(wtxn, docid, embedding)?;
|
||||
}
|
||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(wtxn, std::iter::once(docid))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={docid}")
|
||||
};
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
external_docid,
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||
writer.add_item(wtxn, docid, embedding)?;
|
||||
}
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
let mut cursor = manual_vectors.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let merger = manual_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
@ -519,26 +773,30 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
|
||||
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
||||
}
|
||||
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||
TypedChunk::ScriptLanguageDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
|
||||
let _entered = span.enter();
|
||||
for (key, (deletion, addition)) in sl_map {
|
||||
let mut db_key_exists = false;
|
||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||
Some(db_values) => {
|
||||
db_key_exists = true;
|
||||
(db_values - deletion) | addition
|
||||
}
|
||||
None => addition,
|
||||
};
|
||||
|
||||
if final_value.is_empty() {
|
||||
// If the database entry exists, delete it.
|
||||
if db_key_exists {
|
||||
index.script_language_docids.delete(wtxn, &key)?;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
|
||||
for (key, (deletion, addition)) in sl_map {
|
||||
let mut db_key_exists = false;
|
||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||
Some(db_values) => {
|
||||
db_key_exists = true;
|
||||
(db_values - deletion) | addition
|
||||
}
|
||||
None => addition,
|
||||
};
|
||||
|
||||
if final_value.is_empty() {
|
||||
// If the database entry exists, delete it.
|
||||
if db_key_exists {
|
||||
index.script_language_docids.delete(wtxn, &key)?;
|
||||
}
|
||||
} else {
|
||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||
}
|
||||
} else {
|
||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -557,13 +815,9 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
}
|
||||
|
||||
fn merge_word_docids_reader_into_fst(
|
||||
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
|
||||
merger_builder.push(word_docids_iter.into_cursor()?);
|
||||
merger_builder.push(exact_word_docids_iter.into_cursor()?);
|
||||
let mut iter = merger_builder.build().into_stream_merger_iter()?;
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
while let Some((k, _)) = iter.next()? {
|
||||
@ -577,10 +831,9 @@ fn merge_word_docids_reader_into_fst(
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
data: grenad::Reader<R>,
|
||||
merger: Merger<R, MergeFn>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
@ -589,22 +842,17 @@ where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
{
|
||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||
|
||||
puffin::profile_function!();
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut cursor = data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
let value = match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
@ -614,62 +862,5 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
/// All provided entries must be ordered.
|
||||
/// If the index is not empty, write_entries_into_database is called instead.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn append_entries_into_database<R, K, V, FS, FM>(
|
||||
data: grenad::Reader<R>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
K: for<'a> heed::BytesDecode<'a>,
|
||||
{
|
||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||
|
||||
if !index_is_empty {
|
||||
return write_entries_into_database(
|
||||
data,
|
||||
database,
|
||||
wtxn,
|
||||
false,
|
||||
serialize_value,
|
||||
merge_values,
|
||||
);
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut cursor = data.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
debug_assert!(
|
||||
K::bytes_decode(key).is_ok(),
|
||||
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
|
||||
key.len(),
|
||||
&key
|
||||
);
|
||||
buffer.clear();
|
||||
let value = serialize_value(value, &mut buffer)?;
|
||||
unsafe {
|
||||
// safety: We do not keep a reference to anything that lives inside the database
|
||||
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)?
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -3,9 +3,8 @@ pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||
pub use self::index_documents::{
|
||||
merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||
MergeFn,
|
||||
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
|
||||
};
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::settings::{validate_embedding_settings, Setting, Settings};
|
||||
|
@ -979,6 +979,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
crate::vector::settings::EmbeddingSettings::apply_default_source(
|
||||
&mut setting,
|
||||
);
|
||||
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
|
||||
&mut setting,
|
||||
);
|
||||
let setting = validate_embedding_settings(setting, &name)?;
|
||||
changed = true;
|
||||
new_configs.insert(name, setting);
|
||||
@ -1029,6 +1032,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
{
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
let existing_fields: HashSet<_> = self
|
||||
.index
|
||||
.field_distribution(self.wtxn)?
|
||||
.into_iter()
|
||||
.filter_map(|(field, count)| (count != 0).then_some(field))
|
||||
.collect();
|
||||
|
||||
let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
|
||||
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
|
||||
|
||||
@ -1049,7 +1059,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
||||
// index new fields as facets. It means that the distinct attribute,
|
||||
// an Asc/Desc criterion or a filtered attribute as be added or removed.
|
||||
let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
|
||||
let faceted_updated = old_faceted_fields != new_faceted_fields;
|
||||
let faceted_updated =
|
||||
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
|
||||
|
||||
let stop_words_updated = self.update_stop_words()?;
|
||||
let non_separator_tokens_updated = self.update_non_separator_tokens()?;
|
||||
@ -1124,6 +1135,14 @@ pub fn validate_embedding_settings(
|
||||
let Setting::Set(settings) = settings else { return Ok(settings) };
|
||||
let EmbeddingSettings { source, model, revision, api_key, dimensions, document_template } =
|
||||
settings;
|
||||
|
||||
if let Some(0) = dimensions.set() {
|
||||
return Err(crate::error::UserError::InvalidSettingsDimensions {
|
||||
embedder_name: name.to_owned(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
|
||||
let Some(inferred_source) = source.set() else {
|
||||
return Ok(Setting::Set(EmbeddingSettings {
|
||||
source,
|
||||
@ -1137,14 +1156,34 @@ pub fn validate_embedding_settings(
|
||||
match inferred_source {
|
||||
EmbedderSource::OpenAi => {
|
||||
check_unset(&revision, "revision", inferred_source, name)?;
|
||||
check_unset(&dimensions, "dimensions", inferred_source, name)?;
|
||||
if let Setting::Set(model) = &model {
|
||||
crate::vector::openai::EmbeddingModel::from_name(model.as_str()).ok_or(
|
||||
crate::error::UserError::InvalidOpenAiModel {
|
||||
let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str())
|
||||
.ok_or(crate::error::UserError::InvalidOpenAiModel {
|
||||
embedder_name: name.to_owned(),
|
||||
model: model.clone(),
|
||||
},
|
||||
)?;
|
||||
})?;
|
||||
if let Setting::Set(dimensions) = dimensions {
|
||||
if !model.supports_overriding_dimensions()
|
||||
&& dimensions != model.default_dimensions()
|
||||
{
|
||||
return Err(crate::error::UserError::InvalidOpenAiModelDimensions {
|
||||
embedder_name: name.to_owned(),
|
||||
model: model.name(),
|
||||
dimensions,
|
||||
expected_dimensions: model.default_dimensions(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
if dimensions > model.default_dimensions() {
|
||||
return Err(crate::error::UserError::InvalidOpenAiModelDimensionsMax {
|
||||
embedder_name: name.to_owned(),
|
||||
model: model.name(),
|
||||
dimensions,
|
||||
max_dimensions: model.default_dimensions(),
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
EmbedderSource::HuggingFace => {
|
||||
|
@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
|
||||
)]
|
||||
pub fn execute(
|
||||
self,
|
||||
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
|
||||
new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
@ -68,7 +68,8 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
|
||||
let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?;
|
||||
while let Some((word, data)) = new_word_docids_iter.next()? {
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
|
||||
_otherwise => {
|
||||
|
@ -52,7 +52,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
|
||||
)]
|
||||
pub fn execute(
|
||||
self,
|
||||
new_word_integer_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
@ -69,14 +69,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
|
||||
self.max_memory,
|
||||
);
|
||||
|
||||
let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?;
|
||||
|
||||
if !common_prefix_fst_words.is_empty() {
|
||||
// We fetch all the new common prefixes between the previous and new prefix fst.
|
||||
let mut buffer = Vec::new();
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? {
|
||||
let mut new_word_integer_docids_iter =
|
||||
new_word_integer_docids.into_stream_merger_iter()?;
|
||||
while let Some((key, data)) = new_word_integer_docids_iter.next()? {
|
||||
let (word, pos) =
|
||||
StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
|
||||
|
@ -59,14 +59,18 @@ pub enum EmbedErrorKind {
|
||||
OpenAiAuth(OpenAiError),
|
||||
#[error("sent too many requests to OpenAI: {0}")]
|
||||
OpenAiTooManyRequests(OpenAiError),
|
||||
#[error("received internal error from OpenAI: {0}")]
|
||||
OpenAiInternalServerError(OpenAiError),
|
||||
#[error("received internal error from OpenAI: {0:?}")]
|
||||
OpenAiInternalServerError(Option<OpenAiError>),
|
||||
#[error("sent too many tokens in a request to OpenAI: {0}")]
|
||||
OpenAiTooManyTokens(OpenAiError),
|
||||
#[error("received unhandled HTTP status code {0} from OpenAI")]
|
||||
OpenAiUnhandledStatusCode(u16),
|
||||
#[error("attempt to embed the following text in a configuration where embeddings must be user provided: {0:?}")]
|
||||
ManualEmbed(String),
|
||||
#[error("could not initialize asynchronous runtime: {0}")]
|
||||
OpenAiRuntimeInit(std::io::Error),
|
||||
#[error("initializing web client for sending embedding requests failed: {0}")]
|
||||
InitWebClient(reqwest::Error),
|
||||
}
|
||||
|
||||
impl EmbedError {
|
||||
@ -102,7 +106,7 @@ impl EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> EmbedError {
|
||||
pub(crate) fn openai_internal_server_error(inner: Option<OpenAiError>) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
@ -117,6 +121,14 @@ impl EmbedError {
|
||||
pub(crate) fn embed_on_manual_embedder(texts: String) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::ManualEmbed(texts), fault: FaultSource::User }
|
||||
}
|
||||
|
||||
pub(crate) fn openai_runtime_init(inner: std::io::Error) -> EmbedError {
|
||||
Self { kind: EmbedErrorKind::OpenAiRuntimeInit(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn openai_initialize_web_client(inner: reqwest::Error) -> Self {
|
||||
Self { kind: EmbedErrorKind::InitWebClient(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@ -183,10 +195,6 @@ impl NewEmbedderError {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn openai_initialize_web_client(inner: reqwest::Error) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::InitWebClient(inner), fault: FaultSource::Runtime }
|
||||
}
|
||||
|
||||
pub fn openai_invalid_api_key_format(inner: reqwest::header::InvalidHeaderValue) -> Self {
|
||||
Self { kind: NewEmbedderErrorKind::InvalidApiKeyFormat(inner), fault: FaultSource::User }
|
||||
}
|
||||
@ -237,8 +245,6 @@ pub enum NewEmbedderErrorKind {
|
||||
#[error("loading model failed: {0}")]
|
||||
LoadModel(candle_core::Error),
|
||||
// openai
|
||||
#[error("initializing web client for sending embedding requests failed: {0}")]
|
||||
InitWebClient(reqwest::Error),
|
||||
#[error("The API key passed to Authorization error was in an invalid format: {0}")]
|
||||
InvalidApiKeyFormat(reqwest::header::InvalidHeaderValue),
|
||||
}
|
||||
|
@ -151,7 +151,8 @@ impl Embedder {
|
||||
let token_ids = tokens
|
||||
.iter()
|
||||
.map(|tokens| {
|
||||
let tokens = tokens.get_ids().to_vec();
|
||||
let mut tokens = tokens.get_ids().to_vec();
|
||||
tokens.truncate(512);
|
||||
Tensor::new(tokens.as_slice(), &self.model.device).map_err(EmbedError::tensor_shape)
|
||||
})
|
||||
.collect::<Result<Vec<_>, EmbedError>>()?;
|
||||
|
@ -163,18 +163,24 @@ impl Embedder {
|
||||
) -> std::result::Result<Vec<Embeddings<f32>>, EmbedError> {
|
||||
match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed(texts),
|
||||
Embedder::OpenAi(embedder) => embedder.embed(texts).await,
|
||||
Embedder::OpenAi(embedder) => {
|
||||
let client = embedder.new_client()?;
|
||||
embedder.embed(texts, &client).await
|
||||
}
|
||||
Embedder::UserProvided(embedder) => embedder.embed(texts),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn embed_chunks(
|
||||
/// # Panics
|
||||
///
|
||||
/// - if called from an asynchronous context
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),
|
||||
Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks).await,
|
||||
Embedder::OpenAi(embedder) => embedder.embed_chunks(text_chunks),
|
||||
Embedder::UserProvided(embedder) => embedder.embed_chunks(text_chunks),
|
||||
}
|
||||
}
|
||||
@ -255,3 +261,7 @@ impl DistributionShift {
|
||||
score
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn is_cuda_enabled() -> bool {
|
||||
cfg!(feature = "cuda")
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ use super::{DistributionShift, Embedding, Embeddings};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Embedder {
|
||||
client: reqwest::Client,
|
||||
headers: reqwest::header::HeaderMap,
|
||||
tokenizer: tiktoken_rs::CoreBPE,
|
||||
options: EmbedderOptions,
|
||||
}
|
||||
@ -17,6 +17,7 @@ pub struct Embedder {
|
||||
pub struct EmbedderOptions {
|
||||
pub api_key: Option<String>,
|
||||
pub embedding_model: EmbeddingModel,
|
||||
pub dimensions: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(
|
||||
@ -41,34 +42,50 @@ pub enum EmbeddingModel {
|
||||
#[serde(rename = "text-embedding-ada-002")]
|
||||
#[deserr(rename = "text-embedding-ada-002")]
|
||||
TextEmbeddingAda002,
|
||||
|
||||
#[serde(rename = "text-embedding-3-small")]
|
||||
#[deserr(rename = "text-embedding-3-small")]
|
||||
TextEmbedding3Small,
|
||||
|
||||
#[serde(rename = "text-embedding-3-large")]
|
||||
#[deserr(rename = "text-embedding-3-large")]
|
||||
TextEmbedding3Large,
|
||||
}
|
||||
|
||||
impl EmbeddingModel {
|
||||
pub fn supported_models() -> &'static [&'static str] {
|
||||
&["text-embedding-ada-002"]
|
||||
&["text-embedding-ada-002", "text-embedding-3-small", "text-embedding-3-large"]
|
||||
}
|
||||
|
||||
pub fn max_token(&self) -> usize {
|
||||
match self {
|
||||
EmbeddingModel::TextEmbeddingAda002 => 8191,
|
||||
EmbeddingModel::TextEmbedding3Large => 8191,
|
||||
EmbeddingModel::TextEmbedding3Small => 8191,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn dimensions(&self) -> usize {
|
||||
pub fn default_dimensions(&self) -> usize {
|
||||
match self {
|
||||
EmbeddingModel::TextEmbeddingAda002 => 1536,
|
||||
EmbeddingModel::TextEmbedding3Large => 3072,
|
||||
EmbeddingModel::TextEmbedding3Small => 1536,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &'static str {
|
||||
match self {
|
||||
EmbeddingModel::TextEmbeddingAda002 => "text-embedding-ada-002",
|
||||
EmbeddingModel::TextEmbedding3Large => "text-embedding-3-large",
|
||||
EmbeddingModel::TextEmbedding3Small => "text-embedding-3-small",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_name(name: &str) -> Option<Self> {
|
||||
match name {
|
||||
"text-embedding-ada-002" => Some(EmbeddingModel::TextEmbeddingAda002),
|
||||
"text-embedding-3-large" => Some(EmbeddingModel::TextEmbedding3Large),
|
||||
"text-embedding-3-small" => Some(EmbeddingModel::TextEmbedding3Small),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@ -78,6 +95,20 @@ impl EmbeddingModel {
|
||||
EmbeddingModel::TextEmbeddingAda002 => {
|
||||
Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 })
|
||||
}
|
||||
EmbeddingModel::TextEmbedding3Large => {
|
||||
Some(DistributionShift { current_mean: 0.70, current_sigma: 0.1 })
|
||||
}
|
||||
EmbeddingModel::TextEmbedding3Small => {
|
||||
Some(DistributionShift { current_mean: 0.75, current_sigma: 0.1 })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn supports_overriding_dimensions(&self) -> bool {
|
||||
match self {
|
||||
EmbeddingModel::TextEmbeddingAda002 => false,
|
||||
EmbeddingModel::TextEmbedding3Large => true,
|
||||
EmbeddingModel::TextEmbedding3Small => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -86,15 +117,22 @@ pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";
|
||||
|
||||
impl EmbedderOptions {
|
||||
pub fn with_default_model(api_key: Option<String>) -> Self {
|
||||
Self { api_key, embedding_model: Default::default() }
|
||||
Self { api_key, embedding_model: Default::default(), dimensions: None }
|
||||
}
|
||||
|
||||
pub fn with_embedding_model(api_key: Option<String>, embedding_model: EmbeddingModel) -> Self {
|
||||
Self { api_key, embedding_model }
|
||||
Self { api_key, embedding_model, dimensions: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl Embedder {
|
||||
pub fn new_client(&self) -> Result<reqwest::Client, EmbedError> {
|
||||
reqwest::ClientBuilder::new()
|
||||
.default_headers(self.headers.clone())
|
||||
.build()
|
||||
.map_err(EmbedError::openai_initialize_web_client)
|
||||
}
|
||||
|
||||
pub fn new(options: EmbedderOptions) -> Result<Self, NewEmbedderError> {
|
||||
let mut headers = reqwest::header::HeaderMap::new();
|
||||
let mut inferred_api_key = Default::default();
|
||||
@ -111,25 +149,25 @@ impl Embedder {
|
||||
reqwest::header::CONTENT_TYPE,
|
||||
reqwest::header::HeaderValue::from_static("application/json"),
|
||||
);
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.default_headers(headers)
|
||||
.build()
|
||||
.map_err(NewEmbedderError::openai_initialize_web_client)?;
|
||||
|
||||
// looking at the code it is very unclear that this can actually fail.
|
||||
let tokenizer = tiktoken_rs::cl100k_base().unwrap();
|
||||
|
||||
Ok(Self { options, client, tokenizer })
|
||||
Ok(Self { options, headers, tokenizer })
|
||||
}
|
||||
|
||||
pub async fn embed(&self, texts: Vec<String>) -> Result<Vec<Embeddings<f32>>, EmbedError> {
|
||||
pub async fn embed(
|
||||
&self,
|
||||
texts: Vec<String>,
|
||||
client: &reqwest::Client,
|
||||
) -> Result<Vec<Embeddings<f32>>, EmbedError> {
|
||||
let mut tokenized = false;
|
||||
|
||||
for attempt in 0..7 {
|
||||
let result = if tokenized {
|
||||
self.try_embed_tokenized(&texts).await
|
||||
self.try_embed_tokenized(&texts, client).await
|
||||
} else {
|
||||
self.try_embed(&texts).await
|
||||
self.try_embed(&texts, client).await
|
||||
};
|
||||
|
||||
let retry_duration = match result {
|
||||
@ -140,6 +178,8 @@ impl Embedder {
|
||||
retry.into_duration(attempt)
|
||||
}
|
||||
}?;
|
||||
|
||||
let retry_duration = retry_duration.min(std::time::Duration::from_secs(60)); // don't wait more than a minute
|
||||
tracing::warn!(
|
||||
"Attempt #{}, retrying after {}ms.",
|
||||
attempt,
|
||||
@ -149,9 +189,9 @@ impl Embedder {
|
||||
}
|
||||
|
||||
let result = if tokenized {
|
||||
self.try_embed_tokenized(&texts).await
|
||||
self.try_embed_tokenized(&texts, client).await
|
||||
} else {
|
||||
self.try_embed(&texts).await
|
||||
self.try_embed(&texts, client).await
|
||||
};
|
||||
|
||||
result.map_err(Retry::into_error)
|
||||
@ -182,24 +222,12 @@ impl Embedder {
|
||||
error_response.error,
|
||||
)));
|
||||
}
|
||||
StatusCode::INTERNAL_SERVER_ERROR => {
|
||||
let error_response: OpenAiErrorResponse = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(EmbedError::openai_unexpected)
|
||||
.map_err(Retry::retry_later)?;
|
||||
StatusCode::INTERNAL_SERVER_ERROR
|
||||
| StatusCode::BAD_GATEWAY
|
||||
| StatusCode::SERVICE_UNAVAILABLE => {
|
||||
let error_response: Result<OpenAiErrorResponse, _> = response.json().await;
|
||||
return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
|
||||
error_response.error,
|
||||
)));
|
||||
}
|
||||
StatusCode::SERVICE_UNAVAILABLE => {
|
||||
let error_response: OpenAiErrorResponse = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(EmbedError::openai_unexpected)
|
||||
.map_err(Retry::retry_later)?;
|
||||
return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
|
||||
error_response.error,
|
||||
error_response.ok().map(|error_response| error_response.error),
|
||||
)));
|
||||
}
|
||||
StatusCode::BAD_REQUEST => {
|
||||
@ -210,14 +238,14 @@ impl Embedder {
|
||||
.map_err(EmbedError::openai_unexpected)
|
||||
.map_err(Retry::retry_later)?;
|
||||
|
||||
tracing::warn!("OpenAI: input was too long, retrying on tokenized version. For best performance, limit the size of your prompt.");
|
||||
tracing::warn!("OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. For best performance, limit the size of your prompt.");
|
||||
|
||||
return Err(Retry::retry_tokenized(EmbedError::openai_too_many_tokens(
|
||||
error_response.error,
|
||||
)));
|
||||
}
|
||||
code => {
|
||||
return Err(Retry::give_up(EmbedError::openai_unhandled_status_code(
|
||||
return Err(Retry::retry_later(EmbedError::openai_unhandled_status_code(
|
||||
code.as_u16(),
|
||||
)));
|
||||
}
|
||||
@ -229,13 +257,17 @@ impl Embedder {
|
||||
async fn try_embed<S: AsRef<str> + serde::Serialize>(
|
||||
&self,
|
||||
texts: &[S],
|
||||
client: &reqwest::Client,
|
||||
) -> Result<Vec<Embeddings<f32>>, Retry> {
|
||||
for text in texts {
|
||||
tracing::trace!("Received prompt: {}", text.as_ref())
|
||||
}
|
||||
let request = OpenAiRequest { model: self.options.embedding_model.name(), input: texts };
|
||||
let response = self
|
||||
.client
|
||||
let request = OpenAiRequest {
|
||||
model: self.options.embedding_model.name(),
|
||||
input: texts,
|
||||
dimensions: self.overriden_dimensions(),
|
||||
};
|
||||
let response = client
|
||||
.post(OPENAI_EMBEDDINGS_URL)
|
||||
.json(&request)
|
||||
.send()
|
||||
@ -260,7 +292,11 @@ impl Embedder {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn try_embed_tokenized(&self, text: &[String]) -> Result<Vec<Embeddings<f32>>, Retry> {
|
||||
async fn try_embed_tokenized(
|
||||
&self,
|
||||
text: &[String],
|
||||
client: &reqwest::Client,
|
||||
) -> Result<Vec<Embeddings<f32>>, Retry> {
|
||||
pub const OVERLAP_SIZE: usize = 200;
|
||||
let mut all_embeddings = Vec::with_capacity(text.len());
|
||||
for text in text {
|
||||
@ -268,31 +304,34 @@ impl Embedder {
|
||||
let encoded = self.tokenizer.encode_ordinary(text.as_str());
|
||||
let len = encoded.len();
|
||||
if len < max_token_count {
|
||||
all_embeddings.append(&mut self.try_embed(&[text]).await?);
|
||||
all_embeddings.append(&mut self.try_embed(&[text], client).await?);
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut tokens = encoded.as_slice();
|
||||
let mut embeddings_for_prompt =
|
||||
Embeddings::new(self.options.embedding_model.dimensions());
|
||||
let mut embeddings_for_prompt = Embeddings::new(self.dimensions());
|
||||
while tokens.len() > max_token_count {
|
||||
let window = &tokens[..max_token_count];
|
||||
embeddings_for_prompt.push(self.embed_tokens(window).await?).unwrap();
|
||||
embeddings_for_prompt.push(self.embed_tokens(window, client).await?).unwrap();
|
||||
|
||||
tokens = &tokens[max_token_count - OVERLAP_SIZE..];
|
||||
}
|
||||
|
||||
// end of text
|
||||
embeddings_for_prompt.push(self.embed_tokens(tokens).await?).unwrap();
|
||||
embeddings_for_prompt.push(self.embed_tokens(tokens, client).await?).unwrap();
|
||||
|
||||
all_embeddings.push(embeddings_for_prompt);
|
||||
}
|
||||
Ok(all_embeddings)
|
||||
}
|
||||
|
||||
async fn embed_tokens(&self, tokens: &[usize]) -> Result<Embedding, Retry> {
|
||||
async fn embed_tokens(
|
||||
&self,
|
||||
tokens: &[usize],
|
||||
client: &reqwest::Client,
|
||||
) -> Result<Embedding, Retry> {
|
||||
for attempt in 0..9 {
|
||||
let duration = match self.try_embed_tokens(tokens).await {
|
||||
let duration = match self.try_embed_tokens(tokens, client).await {
|
||||
Ok(embedding) => return Ok(embedding),
|
||||
Err(retry) => retry.into_duration(attempt),
|
||||
}
|
||||
@ -301,14 +340,22 @@ impl Embedder {
|
||||
tokio::time::sleep(duration).await;
|
||||
}
|
||||
|
||||
self.try_embed_tokens(tokens).await.map_err(|retry| Retry::give_up(retry.into_error()))
|
||||
self.try_embed_tokens(tokens, client)
|
||||
.await
|
||||
.map_err(|retry| Retry::give_up(retry.into_error()))
|
||||
}
|
||||
|
||||
async fn try_embed_tokens(&self, tokens: &[usize]) -> Result<Embedding, Retry> {
|
||||
let request =
|
||||
OpenAiTokensRequest { model: self.options.embedding_model.name(), input: tokens };
|
||||
let response = self
|
||||
.client
|
||||
async fn try_embed_tokens(
|
||||
&self,
|
||||
tokens: &[usize],
|
||||
client: &reqwest::Client,
|
||||
) -> Result<Embedding, Retry> {
|
||||
let request = OpenAiTokensRequest {
|
||||
model: self.options.embedding_model.name(),
|
||||
input: tokens,
|
||||
dimensions: self.overriden_dimensions(),
|
||||
};
|
||||
let response = client
|
||||
.post(OPENAI_EMBEDDINGS_URL)
|
||||
.json(&request)
|
||||
.send()
|
||||
@ -326,12 +373,19 @@ impl Embedder {
|
||||
Ok(response.data.pop().map(|data| data.embedding).unwrap_or_default())
|
||||
}
|
||||
|
||||
pub async fn embed_chunks(
|
||||
pub fn embed_chunks(
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
|
||||
futures::future::try_join_all(text_chunks.into_iter().map(|prompts| self.embed(prompts)))
|
||||
.await
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_io()
|
||||
.enable_time()
|
||||
.build()
|
||||
.map_err(EmbedError::openai_runtime_init)?;
|
||||
let client = self.new_client()?;
|
||||
rt.block_on(futures::future::try_join_all(
|
||||
text_chunks.into_iter().map(|prompts| self.embed(prompts, &client)),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn chunk_count_hint(&self) -> usize {
|
||||
@ -343,12 +397,24 @@ impl Embedder {
|
||||
}
|
||||
|
||||
pub fn dimensions(&self) -> usize {
|
||||
self.options.embedding_model.dimensions()
|
||||
if self.options.embedding_model.supports_overriding_dimensions() {
|
||||
self.options.dimensions.unwrap_or(self.options.embedding_model.default_dimensions())
|
||||
} else {
|
||||
self.options.embedding_model.default_dimensions()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn distribution(&self) -> Option<DistributionShift> {
|
||||
self.options.embedding_model.distribution()
|
||||
}
|
||||
|
||||
fn overriden_dimensions(&self) -> Option<usize> {
|
||||
if self.options.embedding_model.supports_overriding_dimensions() {
|
||||
self.options.dimensions
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// retrying in case of failure
|
||||
@ -408,12 +474,16 @@ impl Retry {
|
||||
struct OpenAiRequest<'a, S: AsRef<str> + serde::Serialize> {
|
||||
model: &'a str,
|
||||
input: &'a [S],
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
dimensions: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct OpenAiTokensRequest<'a> {
|
||||
model: &'a str,
|
||||
input: &'a [usize],
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
dimensions: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
|
@ -1,6 +1,7 @@
|
||||
use deserr::Deserr;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::openai;
|
||||
use crate::prompt::PromptData;
|
||||
use crate::update::Setting;
|
||||
use crate::vector::EmbeddingConfig;
|
||||
@ -82,7 +83,7 @@ impl EmbeddingSettings {
|
||||
Self::MODEL => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi],
|
||||
Self::REVISION => &[EmbedderSource::HuggingFace],
|
||||
Self::API_KEY => &[EmbedderSource::OpenAi],
|
||||
Self::DIMENSIONS => &[EmbedderSource::UserProvided],
|
||||
Self::DIMENSIONS => &[EmbedderSource::OpenAi, EmbedderSource::UserProvided],
|
||||
Self::DOCUMENT_TEMPLATE => &[EmbedderSource::HuggingFace, EmbedderSource::OpenAi],
|
||||
_other => unreachable!("unknown field"),
|
||||
}
|
||||
@ -90,9 +91,13 @@ impl EmbeddingSettings {
|
||||
|
||||
pub fn allowed_fields_for_source(source: EmbedderSource) -> &'static [&'static str] {
|
||||
match source {
|
||||
EmbedderSource::OpenAi => {
|
||||
&[Self::SOURCE, Self::MODEL, Self::API_KEY, Self::DOCUMENT_TEMPLATE]
|
||||
}
|
||||
EmbedderSource::OpenAi => &[
|
||||
Self::SOURCE,
|
||||
Self::MODEL,
|
||||
Self::API_KEY,
|
||||
Self::DOCUMENT_TEMPLATE,
|
||||
Self::DIMENSIONS,
|
||||
],
|
||||
EmbedderSource::HuggingFace => {
|
||||
&[Self::SOURCE, Self::MODEL, Self::REVISION, Self::DOCUMENT_TEMPLATE]
|
||||
}
|
||||
@ -109,6 +114,17 @@ impl EmbeddingSettings {
|
||||
*source = Setting::Set(EmbedderSource::default())
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn apply_default_openai_model(setting: &mut Setting<EmbeddingSettings>) {
|
||||
if let Setting::Set(EmbeddingSettings {
|
||||
source: Setting::Set(EmbedderSource::OpenAi),
|
||||
model: model @ (Setting::NotSet | Setting::Reset),
|
||||
..
|
||||
}) = setting
|
||||
{
|
||||
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
|
||||
@ -176,7 +192,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
|
||||
model: Setting::Set(options.embedding_model.name().to_owned()),
|
||||
revision: Setting::NotSet,
|
||||
api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
|
||||
dimensions: Setting::NotSet,
|
||||
dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(),
|
||||
document_template: Setting::Set(prompt.template),
|
||||
},
|
||||
super::EmbedderOptions::UserProvided(options) => Self {
|
||||
@ -208,6 +224,9 @@ impl From<EmbeddingSettings> for EmbeddingConfig {
|
||||
if let Some(api_key) = api_key.set() {
|
||||
options.api_key = Some(api_key);
|
||||
}
|
||||
if let Some(dimensions) = dimensions.set() {
|
||||
options.dimensions = Some(dimensions);
|
||||
}
|
||||
this.embedder_options = super::EmbedderOptions::OpenAi(options);
|
||||
}
|
||||
EmbedderSource::HuggingFace => {
|
||||
|
@ -13,10 +13,11 @@ serde_json = "1.0.111"
|
||||
tracing = "0.1.40"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-subscriber = "0.3.18"
|
||||
byte-unit = { version = "4.0.19", default-features = false, features = [
|
||||
"std",
|
||||
"serde",
|
||||
] }
|
||||
tokio = { version = "1.35.1", features = ["sync"] }
|
||||
clap = { version = "4.4.18", features = ["derive"] }
|
||||
anyhow = "1.0.79"
|
||||
byte-unit = { version = "5.1.4", features = ["byte"] }
|
||||
|
||||
[target.'cfg(any(target_os = "linux", target_os = "macos"))'.dependencies]
|
||||
libproc = "0.14.2"
|
||||
|
@ -1,103 +0,0 @@
|
||||
use std::collections::vec_deque::Drain;
|
||||
use std::collections::VecDeque;
|
||||
use std::io::{self, BufReader, BufWriter, Stdout, Write};
|
||||
use std::mem;
|
||||
|
||||
use anyhow::Context;
|
||||
use byte_unit::Byte;
|
||||
use clap::Parser;
|
||||
use tracing_trace::entry::{Entry, NewSpan};
|
||||
|
||||
/// A program that filters trace logs to only keeps
|
||||
/// the logs related to memory usage above the given threshold.
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
/// The threshold that a log must have to be returned by this program.
|
||||
#[arg(short, long)]
|
||||
memory_threshold: Byte,
|
||||
|
||||
/// Number of context lines to keep around high memory log lines.
|
||||
#[arg(long, default_value_t = 10)]
|
||||
context: usize,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let Args { memory_threshold, context } = Args::parse();
|
||||
|
||||
let mut context = EntryContext::new(context);
|
||||
let mut currently_in_threshold = false;
|
||||
|
||||
let input = BufReader::new(io::stdin());
|
||||
let mut output = io::BufWriter::new(io::stdout());
|
||||
for result in tracing_trace::TraceReader::new(input) {
|
||||
let entry = result?;
|
||||
|
||||
match entry {
|
||||
Entry::NewCallsite(_) | Entry::NewThread(_) => {
|
||||
write_to_output(&mut output, &entry)?;
|
||||
}
|
||||
Entry::NewSpan(NewSpan { id, call_id, parent_id, thread_id }) => todo!(),
|
||||
Entry::SpanEnter(_) => todo!(),
|
||||
Entry::SpanExit(_) => todo!(),
|
||||
Entry::SpanClose(_) => todo!(),
|
||||
Entry::Event(_) => todo!(),
|
||||
}
|
||||
|
||||
// if matches!(entry, Entry::NewCallsite(_) | Entry::NewThread(_)) {
|
||||
// write_to_output(&mut output, &entry)?;
|
||||
// } else if entry.memory().map_or(true, |m| m.resident < memory_threshold.as_u64()) {
|
||||
// if mem::replace(&mut currently_in_threshold, false) {
|
||||
// for entry in context.drain() {
|
||||
// write_to_output(&mut output, &entry)?;
|
||||
// }
|
||||
// }
|
||||
// context.push(entry);
|
||||
// } else {
|
||||
// currently_in_threshold = true;
|
||||
// for entry in context.drain() {
|
||||
// write_to_output(&mut output, &entry)?;
|
||||
// }
|
||||
// write_to_output(&mut output, &entry)?;
|
||||
// }
|
||||
}
|
||||
|
||||
for entry in context.drain() {
|
||||
write_to_output(&mut output, &entry)?;
|
||||
}
|
||||
|
||||
output.flush().context("flushing stdout")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_to_output(writer: &mut BufWriter<Stdout>, entry: &Entry) -> anyhow::Result<()> {
|
||||
serde_json::to_writer(writer, &entry).context("while serializing and writing to stdout")
|
||||
}
|
||||
|
||||
/// Keeps only the last `size` element in memory.
|
||||
/// It's basically a sliding window.
|
||||
pub struct EntryContext {
|
||||
size: usize,
|
||||
queue: VecDeque<Entry>,
|
||||
}
|
||||
|
||||
impl EntryContext {
|
||||
pub fn new(size: usize) -> EntryContext {
|
||||
EntryContext { size, queue: VecDeque::with_capacity(size) }
|
||||
}
|
||||
|
||||
pub fn is_full(&self) -> bool {
|
||||
self.size >= self.queue.len()
|
||||
}
|
||||
|
||||
pub fn push(&mut self, entry: Entry) {
|
||||
if self.queue.len() == self.size {
|
||||
self.queue.pop_front();
|
||||
}
|
||||
self.queue.push_back(entry);
|
||||
}
|
||||
|
||||
pub fn drain(&mut self) -> Drain<Entry> {
|
||||
self.queue.drain(..)
|
||||
}
|
||||
}
|
@ -38,20 +38,6 @@ pub enum Entry {
|
||||
Event(Event),
|
||||
}
|
||||
|
||||
impl Entry {
|
||||
pub fn memory(&self) -> Option<MemoryStats> {
|
||||
match self {
|
||||
Entry::NewCallsite(_)
|
||||
| Entry::NewThread(_)
|
||||
| Entry::NewSpan(_)
|
||||
| Entry::SpanClose(_) => None,
|
||||
Entry::SpanEnter(event) => event.memory,
|
||||
Entry::SpanExit(event) => event.memory,
|
||||
Entry::Event(event) => event.memory,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct SpanId(u64);
|
||||
|
||||
|
@ -189,7 +189,7 @@ fn print_duration(duration: std::time::Duration) -> String {
|
||||
|
||||
/// Format only the allocated bytes, deallocated bytes and reallocated bytes in GiB, MiB, KiB, Bytes.
|
||||
fn print_memory(MemoryStats { resident }: MemoryStats) -> String {
|
||||
use byte_unit::{Byte, UnitType};
|
||||
let rss_bytes = Byte::from_u64(resident).get_appropriate_unit(UnitType::Binary);
|
||||
use byte_unit::Byte;
|
||||
let rss_bytes = Byte::from_bytes(resident).get_appropriate_unit(true);
|
||||
format!("RSS {rss_bytes:.2}")
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::ops::Range;
|
||||
use std::time::Duration;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
@ -16,6 +17,51 @@ enum SpanStatus {
|
||||
pub struct CallStats {
|
||||
pub call_count: usize,
|
||||
pub time: u64,
|
||||
pub self_time: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct SelfTime {
|
||||
child_ranges: Vec<Range<Duration>>,
|
||||
}
|
||||
|
||||
impl SelfTime {
|
||||
pub fn new() -> Self {
|
||||
Default::default()
|
||||
}
|
||||
|
||||
pub fn add_child_range(&mut self, child_range: Range<Duration>) {
|
||||
self.child_ranges.push(child_range)
|
||||
}
|
||||
|
||||
pub fn self_duration(&mut self, self_range: Range<Duration>) -> Duration {
|
||||
if self.child_ranges.is_empty() {
|
||||
return self_range.end - self_range.start;
|
||||
}
|
||||
|
||||
// by sorting child ranges by their start time,
|
||||
// we make sure that no child will start before the last one we visited.
|
||||
self.child_ranges
|
||||
.sort_by(|left, right| left.start.cmp(&right.start).then(left.end.cmp(&right.end)));
|
||||
// self duration computed by adding all the segments where the span is not executing a child
|
||||
let mut self_duration = Duration::from_nanos(0);
|
||||
|
||||
// last point in time where we are certain that this span was not executing a child.
|
||||
let mut committed_point = self_range.start;
|
||||
|
||||
for child_range in &self.child_ranges {
|
||||
if child_range.start > committed_point {
|
||||
// we add to the self duration the point between the end of the latest span and the beginning of the next span
|
||||
self_duration += child_range.start - committed_point;
|
||||
}
|
||||
if committed_point < child_range.end {
|
||||
// then we set ourselves to the end of the latest span
|
||||
committed_point = child_range.end;
|
||||
}
|
||||
}
|
||||
|
||||
self_duration
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_call_stats<R: std::io::Read>(
|
||||
@ -23,6 +69,9 @@ pub fn to_call_stats<R: std::io::Read>(
|
||||
) -> Result<BTreeMap<String, CallStats>, Error> {
|
||||
let mut calls = HashMap::new();
|
||||
let mut spans = HashMap::new();
|
||||
let mut last_point = Duration::from_nanos(0);
|
||||
let mut first_point = None;
|
||||
let mut total_self_time = SelfTime::new();
|
||||
for entry in trace {
|
||||
let entry = entry?;
|
||||
match entry {
|
||||
@ -31,10 +80,11 @@ pub fn to_call_stats<R: std::io::Read>(
|
||||
}
|
||||
Entry::NewThread(_) => {}
|
||||
Entry::NewSpan(span) => {
|
||||
spans.insert(span.id, (span, SpanStatus::Outside));
|
||||
spans.insert(span.id, (span, SpanStatus::Outside, SelfTime::new()));
|
||||
}
|
||||
Entry::SpanEnter(SpanEnter { id, time, memory: _ }) => {
|
||||
let (_, status) = spans.get_mut(&id).unwrap();
|
||||
first_point.get_or_insert(time);
|
||||
let (_, status, _) = spans.get_mut(&id).unwrap();
|
||||
|
||||
let SpanStatus::Outside = status else {
|
||||
continue;
|
||||
@ -43,18 +93,32 @@ pub fn to_call_stats<R: std::io::Read>(
|
||||
*status = SpanStatus::Inside(time);
|
||||
}
|
||||
Entry::SpanExit(SpanExit { id, time: end, memory: _ }) => {
|
||||
let (span, status) = spans.get_mut(&id).unwrap();
|
||||
let (span, status, self_time) = spans.get_mut(&id).unwrap();
|
||||
|
||||
let SpanStatus::Inside(begin) = status else {
|
||||
continue;
|
||||
};
|
||||
let begin = *begin;
|
||||
|
||||
if last_point < end {
|
||||
last_point = end;
|
||||
}
|
||||
|
||||
*status = SpanStatus::Outside;
|
||||
|
||||
let self_range = begin..end;
|
||||
|
||||
let self_duration = self_time.self_duration(self_range.clone());
|
||||
*self_time = SelfTime::new();
|
||||
|
||||
let span = *span;
|
||||
if let Some(parent_id) = span.parent_id {
|
||||
let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap();
|
||||
parent_self_time.add_child_range(self_range.clone())
|
||||
}
|
||||
total_self_time.add_child_range(self_range);
|
||||
let (_, call_list) = calls.get_mut(&span.call_id).unwrap();
|
||||
call_list.push(end - begin);
|
||||
call_list.push((end - begin, self_duration));
|
||||
}
|
||||
Entry::SpanClose(SpanClose { id, time: _ }) => {
|
||||
spans.remove(&id);
|
||||
@ -63,17 +127,31 @@ pub fn to_call_stats<R: std::io::Read>(
|
||||
}
|
||||
}
|
||||
|
||||
let total_self_time = first_point
|
||||
.map(|first_point| (first_point, total_self_time.self_duration(first_point..last_point)));
|
||||
|
||||
Ok(calls
|
||||
.into_iter()
|
||||
.map(|(_, (call_site, calls))| (site_to_string(call_site), calls_to_stats(calls)))
|
||||
.chain(total_self_time.map(|(first_point, total_self_time)| {
|
||||
(
|
||||
"::meta::total".to_string(),
|
||||
CallStats {
|
||||
call_count: 1,
|
||||
time: (last_point - first_point).as_nanos() as u64,
|
||||
self_time: total_self_time.as_nanos() as u64,
|
||||
},
|
||||
)
|
||||
}))
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn site_to_string(call_site: NewCallsite) -> String {
|
||||
format!("{}::{}", call_site.target, call_site.name)
|
||||
}
|
||||
fn calls_to_stats(calls: Vec<Duration>) -> CallStats {
|
||||
fn calls_to_stats(calls: Vec<(Duration, Duration)>) -> CallStats {
|
||||
let nb = calls.len();
|
||||
let sum: Duration = calls.iter().sum();
|
||||
CallStats { call_count: nb, time: sum.as_nanos() as u64 }
|
||||
let sum: Duration = calls.iter().map(|(total, _)| total).sum();
|
||||
let self_sum: Duration = calls.iter().map(|(_, self_duration)| self_duration).sum();
|
||||
CallStats { call_count: nb, time: sum.as_nanos() as u64, self_time: self_sum.as_nanos() as u64 }
|
||||
}
|
||||
|
164
workloads/hackernews.json
Normal file
164
workloads/hackernews.json
Normal file
@ -0,0 +1,164 @@
|
||||
{
|
||||
"name": "hackernews.ndjson_1M",
|
||||
"run_count": 3,
|
||||
"extra_cli_args": [],
|
||||
"assets": {
|
||||
"hackernews-100_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-100_000.ndjson",
|
||||
"sha256": "60ecd23485d560edbd90d9ca31f0e6dba1455422f2a44e402600fbb5f7f1b213"
|
||||
},
|
||||
"hackernews-200_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-200_000.ndjson",
|
||||
"sha256": "785b0271fdb47cba574fab617d5d332276b835c05dd86e4a95251cf7892a1685"
|
||||
},
|
||||
"hackernews-300_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-300_000.ndjson",
|
||||
"sha256": "de73c7154652eddfaf69cdc3b2f824d5c452f095f40a20a1c97bb1b5c4d80ab2"
|
||||
},
|
||||
"hackernews-400_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-400_000.ndjson",
|
||||
"sha256": "c1b00a24689110f366447e434c201c086d6f456d54ed1c4995894102794d8fe7"
|
||||
},
|
||||
"hackernews-500_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-500_000.ndjson",
|
||||
"sha256": "ae98f9dbef8193d750e3e2dbb6a91648941a1edca5f6e82c143e7996f4840083"
|
||||
},
|
||||
"hackernews-600_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-600_000.ndjson",
|
||||
"sha256": "b495fdc72c4a944801f786400f22076ab99186bee9699f67cbab2f21f5b74dbe"
|
||||
},
|
||||
"hackernews-700_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-700_000.ndjson",
|
||||
"sha256": "4b2c63974f3dabaa4954e3d4598b48324d03c522321ac05b0d583f36cb78a28b"
|
||||
},
|
||||
"hackernews-800_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-800_000.ndjson",
|
||||
"sha256": "cb7b6afe0e6caa1be111be256821bc63b0771b2a0e1fad95af7aaeeffd7ba546"
|
||||
},
|
||||
"hackernews-900_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-900_000.ndjson",
|
||||
"sha256": "e1154ddcd398f1c867758a93db5bcb21a07b9e55530c188a2917fdef332d3ba9"
|
||||
},
|
||||
"hackernews-1_000_000.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-1_000_000.ndjson",
|
||||
"sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe"
|
||||
}
|
||||
},
|
||||
"commands": [
|
||||
{
|
||||
"route": "indexes/movies/settings",
|
||||
"method": "PATCH",
|
||||
"body": {
|
||||
"inline": {
|
||||
"displayedAttributes": [
|
||||
"title",
|
||||
"by",
|
||||
"score",
|
||||
"time"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"title"
|
||||
],
|
||||
"filterableAttributes": [
|
||||
"by"
|
||||
],
|
||||
"sortableAttributes": [
|
||||
"score",
|
||||
"time"
|
||||
]
|
||||
}
|
||||
},
|
||||
"synchronous": "DontWait"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-100_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-200_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-300_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-400_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-500_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-600_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-700_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-800_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-900_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForResponse"
|
||||
},
|
||||
{
|
||||
"route": "indexes/movies/documents",
|
||||
"method": "POST",
|
||||
"body": {
|
||||
"asset": "hackernews-1_000_000.ndjson"
|
||||
},
|
||||
"synchronous": "WaitForTask"
|
||||
}
|
||||
]
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user