Compare commits


1 Commit

Author SHA1 Message Date
c02d585f5b Upgrade rustls to 0.21.10 and ring to 0.17 2024-02-12 14:32:29 +08:00
96 changed files with 2589 additions and 5796 deletions

View File

@ -1,2 +1,2 @@
[alias]
xtask = "run --release --package xtask --"
xtask = "run --package xtask --"

View File

@ -1,30 +0,0 @@
name: Bench (manual)
on:
workflow_dispatch:
inputs:
workload:
description: 'The path to the workloads to execute (workloads/...)'
required: true
default: 'workloads/movies.json'
env:
WORKLOAD_NAME: ${{ github.event.inputs.workload }}
jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 180 # 3h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Run benchmarks - workload ${WORKLOAD_NAME} - branch ${{ github.ref }} - commit ${{ github.sha }}
run: |
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Manual [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- ${WORKLOAD_NAME}

View File

@ -1,46 +0,0 @@
name: Bench (PR)
on:
issue_comment:
types: [created]
permissions:
issues: write
env:
GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
jobs:
run-benchmarks-on-comment:
if: startsWith(github.event.comment.body, '/bench')
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 180 # 3h
steps:
- name: Check for Command
id: command
uses: xt0rted/slash-command-action@v2
with:
command: bench
reaction-type: "rocket"
repo-token: ${{ env.GH_TOKEN }}
- uses: xt0rted/pull-request-comment-branch@v2
id: comment-branch
with:
repo_token: ${{ env.GH_TOKEN }}
- uses: actions/checkout@v3
if: success()
with:
fetch-depth: 0 # fetch full history to be able to get main commit sha
ref: ${{ steps.comment-branch.outputs.head_ref }}
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Run benchmarks on PR ${{ github.event.issue.id }}
run: |
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "[Comment](${{ github.event.comment.url }}) on [#${{github.event.issue.id}}](${{ github.event.issue.url }})" -- ${{ steps.command.outputs.command-arguments }}

View File

@ -1,25 +0,0 @@
name: Indexing bench (push)
on:
push:
branches:
- main
jobs:
benchmarks:
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 180 # 3h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
# Run benchmarks
- name: Run benchmarks - Dataset ${BENCH_NAME} - Branch main - Commit ${{ github.sha }}
run: |
cargo xtask bench --api-key "${{ secrets.BENCHMARK_API_KEY }}" --dashboard-url "${{ vars.BENCHMARK_DASHBOARD_URL }}" --reason "Push on `main` [Run #${{ github.run_id }}](https://github.com/meilisearch/meilisearch/actions/runs/${{ github.run_id }})" -- workloads/*.json

View File

@ -31,10 +31,17 @@ jobs:
apt-get update && apt-get install -y curl
apt-get install build-essential -y
- name: Setup test with Rust stable
if: github.event_name != 'schedule'
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Setup test with Rust nightly
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
uses: actions-rs/toolchain@v1
with:
toolchain: nightly
override: true
- name: Cache dependencies
uses: Swatinem/rust-cache@v2.7.1
- name: Run cargo check without any default features

.gitignore (vendored): 2 changed lines
View File

@ -9,8 +9,6 @@
/data.ms
/snapshots
/dumps
/bench
/_xtask_benchmark.ms
# Snapshots
## ... large

Cargo.lock (generated): 1206 changed lines

File diff suppressed because it is too large

View File

@ -17,11 +17,11 @@ members = [
"benchmarks",
"fuzzers",
"tracing-trace",
"xtask", "build-info",
"xtask",
]
[workspace.package]
version = "1.7.1"
version = "1.6.1"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",

View File

@ -8,7 +8,7 @@ WORKDIR /
ARG COMMIT_SHA
ARG COMMIT_DATE
ARG GIT_TAG
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_DESCRIBE=${GIT_TAG}
ENV VERGEN_GIT_SHA=${COMMIT_SHA} VERGEN_GIT_COMMIT_TIMESTAMP=${COMMIT_DATE} VERGEN_GIT_SEMVER_LIGHTWEIGHT=${GIT_TAG}
ENV RUSTFLAGS="-C target-feature=-crt-static"
COPY . .

View File

@ -1,18 +0,0 @@
[package]
name = "build-info"
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
edition.workspace = true
license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
time = { version = "0.3.34", features = ["parsing"] }
[build-dependencies]
anyhow = "1.0.80"
vergen-git2 = "1.0.0-beta.2"

View File

@ -1,22 +0,0 @@
fn main() {
if let Err(err) = emit_git_variables() {
println!("cargo:warning=vergen: {}", err);
}
}
fn emit_git_variables() -> anyhow::Result<()> {
// Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them
// in the corresponding GitHub workflow (publish_docker.yml).
// This is due to the Dockerfile building the binary outside of the git directory.
let mut builder = vergen_git2::Git2Builder::default();
builder.branch(true);
builder.commit_timestamp(true);
builder.commit_message(true);
builder.describe(true, true, None);
builder.sha(false);
let git2 = builder.build()?;
vergen_git2::Emitter::default().fail_on_error().add_instructions(&git2)?.emit()
}
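Aside (not part of the diff): the note at the top of this build script matters because option_env! is resolved at compile time. A minimal sketch of the consuming side, mirroring how lib.rs and main.rs read these variables:

fn main() {
    // Resolved when the binary is compiled: if VERGEN_GIT_SHA was not exported
    // during the build (e.g. the Docker build runs outside the git checkout),
    // this is None, hence the explicit ENV lines in the Dockerfile above.
    let sha: Option<&'static str> = option_env!("VERGEN_GIT_SHA");
    println!("commit sha: {}", sha.unwrap_or("unknown"));
}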

View File

@ -1,203 +0,0 @@
use time::format_description::well_known::Iso8601;
#[derive(Debug, Clone)]
pub struct BuildInfo {
pub branch: Option<&'static str>,
pub describe: Option<DescribeResult>,
pub commit_sha1: Option<&'static str>,
pub commit_msg: Option<&'static str>,
pub commit_timestamp: Option<time::OffsetDateTime>,
}
impl BuildInfo {
pub fn from_build() -> Self {
let branch: Option<&'static str> = option_env!("VERGEN_GIT_BRANCH");
let describe = DescribeResult::from_build();
let commit_sha1 = option_env!("VERGEN_GIT_SHA");
let commit_msg = option_env!("VERGEN_GIT_COMMIT_MESSAGE");
let commit_timestamp = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP");
let commit_timestamp = commit_timestamp.and_then(|commit_timestamp| {
time::OffsetDateTime::parse(commit_timestamp, &Iso8601::DEFAULT).ok()
});
Self { branch, describe, commit_sha1, commit_msg, commit_timestamp }
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DescribeResult {
Prototype { name: &'static str },
Release { version: &'static str, major: u64, minor: u64, patch: u64 },
Prerelease { version: &'static str, major: u64, minor: u64, patch: u64, rc: u64 },
NotATag { describe: &'static str },
}
impl DescribeResult {
pub fn new(describe: &'static str) -> Self {
if let Some(name) = prototype_name(describe) {
Self::Prototype { name }
} else if let Some(release) = release_version(describe) {
release
} else if let Some(prerelease) = prerelease_version(describe) {
prerelease
} else {
Self::NotATag { describe }
}
}
pub fn from_build() -> Option<Self> {
let describe: &'static str = option_env!("VERGEN_GIT_DESCRIBE")?;
Some(Self::new(describe))
}
pub fn as_tag(&self) -> Option<&'static str> {
match self {
DescribeResult::Prototype { name } => Some(name),
DescribeResult::Release { version, .. } => Some(version),
DescribeResult::Prerelease { version, .. } => Some(version),
DescribeResult::NotATag { describe: _ } => None,
}
}
pub fn as_prototype(&self) -> Option<&'static str> {
match self {
DescribeResult::Prototype { name } => Some(name),
DescribeResult::Release { .. }
| DescribeResult::Prerelease { .. }
| DescribeResult::NotATag { .. } => None,
}
}
}
/// Parses the input as a prototype name.
///
/// Returns `Some(prototype_name)` if the following conditions are met on this value:
///
/// 1. starts with `prototype-`,
/// 2. ends with `-<some_number>`,
/// 3. does not end with `<some_number>-<some_number>`.
///
/// Otherwise, returns `None`.
fn prototype_name(describe: &'static str) -> Option<&'static str> {
if !describe.starts_with("prototype-") {
return None;
}
let mut rsplit_prototype = describe.rsplit('-');
// last component MUST be a number
rsplit_prototype.next()?.parse::<u64>().ok()?;
// the component before the last SHALL NOT be a number
rsplit_prototype.next()?.parse::<u64>().err()?;
Some(describe)
}
fn release_version(describe: &'static str) -> Option<DescribeResult> {
if !describe.starts_with('v') {
return None;
}
// a full release version doesn't contain a `-`
if describe.contains('-') {
return None;
}
// a full release version parses as vX.Y.Z, with X, Y, Z numbers.
let mut dots = describe[1..].split('.');
let major: u64 = dots.next()?.parse().ok()?;
let minor: u64 = dots.next()?.parse().ok()?;
let patch: u64 = dots.next()?.parse().ok()?;
if dots.next().is_some() {
return None;
}
Some(DescribeResult::Release { version: describe, major, minor, patch })
}
fn prerelease_version(describe: &'static str) -> Option<DescribeResult> {
// prerelease version is in the shape vM.N.P-rc.C
let mut hyphen = describe.rsplit('-');
let prerelease = hyphen.next()?;
if !prerelease.starts_with("rc.") {
return None;
}
let rc: u64 = prerelease[3..].parse().ok()?;
let release = hyphen.next()?;
let DescribeResult::Release { version: _, major, minor, patch } = release_version(release)?
else {
return None;
};
Some(DescribeResult::Prerelease { version: describe, major, minor, patch, rc })
}
#[cfg(test)]
mod test {
use super::DescribeResult;
fn assert_not_a_tag(describe: &'static str) {
assert_eq!(DescribeResult::NotATag { describe }, DescribeResult::new(describe))
}
fn assert_proto(describe: &'static str) {
assert_eq!(DescribeResult::Prototype { name: describe }, DescribeResult::new(describe))
}
fn assert_release(describe: &'static str, major: u64, minor: u64, patch: u64) {
assert_eq!(
DescribeResult::Release { version: describe, major, minor, patch },
DescribeResult::new(describe)
)
}
fn assert_prerelease(describe: &'static str, major: u64, minor: u64, patch: u64, rc: u64) {
assert_eq!(
DescribeResult::Prerelease { version: describe, major, minor, patch, rc },
DescribeResult::new(describe)
)
}
#[test]
fn not_a_tag() {
assert_not_a_tag("whatever-fuzzy");
assert_not_a_tag("whatever-fuzzy-5-ggg-dirty");
assert_not_a_tag("whatever-fuzzy-120-ggg-dirty");
// technically a tag, but not a proto nor a version, so not parsed as a tag
assert_not_a_tag("whatever");
// dirty version
assert_not_a_tag("v1.7.0-1-ggga-dirty");
assert_not_a_tag("v1.7.0-rc.1-1-ggga-dirty");
// after version
assert_not_a_tag("v1.7.0-1-ggga");
assert_not_a_tag("v1.7.0-rc.1-1-ggga");
// after proto
assert_not_a_tag("protoype-tag-0-1-ggga");
assert_not_a_tag("protoype-tag-0-1-ggga-dirty");
}
#[test]
fn prototype() {
assert_proto("prototype-tag-0");
assert_proto("prototype-tag-10");
assert_proto("prototype-long-name-tag-10");
}
#[test]
fn release() {
assert_release("v1.7.2", 1, 7, 2);
}
#[test]
fn prerelease() {
assert_prerelease("v1.7.2-rc.3", 1, 7, 2, 3);
}
}

View File

@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::str::FromStr;
use time::OffsetDateTime;

View File

@ -13,7 +13,6 @@ license.workspace = true
[dependencies]
tempfile = "3.9.0"
thiserror = "1.0.56"
tracing = "0.1.40"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]

View File

@ -1,5 +1,5 @@
use std::fs::File as StdFile;
use std::io::Write;
use std::ops::{Deref, DerefMut};
use std::path::{Path, PathBuf};
use std::str::FromStr;
@ -22,6 +22,20 @@ pub enum Error {
pub type Result<T> = std::result::Result<T, Error>;
impl Deref for File {
type Target = NamedTempFile;
fn deref(&self) -> &Self::Target {
&self.file
}
}
impl DerefMut for File {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.file
}
}
#[derive(Clone, Debug)]
pub struct FileStore {
path: PathBuf,
@ -42,7 +56,7 @@ impl FileStore {
let file = NamedTempFile::new_in(&self.path)?;
let uuid = Uuid::new_v4();
let path = self.path.join(uuid.to_string());
let update_file = File { file: Some(file), path };
let update_file = File { file, path };
Ok((uuid, update_file))
}
@ -53,7 +67,7 @@ impl FileStore {
let file = NamedTempFile::new_in(&self.path)?;
let uuid = Uuid::from_u128(uuid);
let path = self.path.join(uuid.to_string());
let update_file = File { file: Some(file), path };
let update_file = File { file, path };
Ok((uuid, update_file))
}
@ -61,13 +75,7 @@ impl FileStore {
/// Returns the file corresponding to the requested uuid.
pub fn get_update(&self, uuid: Uuid) -> Result<StdFile> {
let path = self.get_update_path(uuid);
let file = match StdFile::open(path) {
Ok(file) => file,
Err(e) => {
tracing::error!("Can't access update file {uuid}: {e}");
return Err(e.into());
}
};
let file = StdFile::open(path)?;
Ok(file)
}
@ -102,12 +110,8 @@ impl FileStore {
pub fn delete(&self, uuid: Uuid) -> Result<()> {
let path = self.path.join(uuid.to_string());
if let Err(e) = std::fs::remove_file(path) {
tracing::error!("Can't delete file {uuid}: {e}");
Err(e.into())
} else {
Ok(())
}
std::fs::remove_file(path)?;
Ok(())
}
/// List the Uuids of the files in the FileStore
@ -132,40 +136,16 @@ impl FileStore {
pub struct File {
path: PathBuf,
file: Option<NamedTempFile>,
file: NamedTempFile,
}
impl File {
pub fn dry_file() -> Result<Self> {
Ok(Self { path: PathBuf::new(), file: None })
}
pub fn persist(self) -> Result<()> {
if let Some(file) = self.file {
file.persist(&self.path)?;
}
self.file.persist(&self.path)?;
Ok(())
}
}
impl Write for File {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
if let Some(file) = self.file.as_mut() {
file.write(buf)
} else {
Ok(buf.len())
}
}
fn flush(&mut self) -> std::io::Result<()> {
if let Some(file) = self.file.as_mut() {
file.flush()
} else {
Ok(())
}
}
}
#[cfg(test)]
mod test {
use std::io::Write;
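Aside (not part of the diff): a minimal, self-contained sketch of what the Deref/DerefMut impls added above buy the callers; the struct here only mirrors the real File wrapper, and the destination path is made up.

use std::io::Write;
use std::ops::{Deref, DerefMut};
use std::path::PathBuf;
use tempfile::NamedTempFile;

// Illustrative mirror of the `File` wrapper defined in this hunk.
struct File {
    path: PathBuf,
    file: NamedTempFile,
}

impl Deref for File {
    type Target = NamedTempFile;
    fn deref(&self) -> &Self::Target {
        &self.file
    }
}

impl DerefMut for File {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.file
    }
}

fn main() -> std::io::Result<()> {
    let tmp = NamedTempFile::new()?;
    let destination = tmp.path().with_extension("final"); // made-up destination
    let mut update_file = File { path: destination, file: tmp };
    // DerefMut exposes NamedTempFile methods directly on the wrapper, e.g.
    // `as_file_mut`, which is what the documents route later in this diff
    // relies on when it passes `update_file.as_file_mut()` to read_json.
    update_file.as_file_mut().write_all(b"{}")?;
    println!("would persist to {:?}", update_file.path);
    Ok(())
}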

View File

@ -23,7 +23,6 @@ meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
page_size = "0.5.0"
puffin = { version = "0.16.0", features = ["serialization"] }
rayon = "1.8.1"
roaring = { version = "0.10.2", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }

View File

@ -142,28 +142,22 @@ pub(crate) enum IndexOperation {
impl Batch {
/// Return the task ids associated with this batch.
pub fn ids(&self) -> RoaringBitmap {
pub fn ids(&self) -> Vec<TaskId> {
match self {
Batch::TaskCancelation { task, .. }
| Batch::Dump(task)
| Batch::IndexCreation { task, .. }
| Batch::IndexUpdate { task, .. } => {
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
}
| Batch::IndexUpdate { task, .. } => vec![task.uid],
Batch::SnapshotCreation(tasks)
| Batch::TaskDeletions(tasks)
| Batch::IndexDeletion { tasks, .. } => {
RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid))
}
| Batch::IndexDeletion { tasks, .. } => tasks.iter().map(|task| task.uid).collect(),
Batch::IndexOperation { op, .. } => match op {
IndexOperation::DocumentOperation { tasks, .. }
| IndexOperation::Settings { tasks, .. }
| IndexOperation::DocumentClear { tasks, .. } => {
RoaringBitmap::from_iter(tasks.iter().map(|task| task.uid))
}
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => {
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
tasks.iter().map(|task| task.uid).collect()
}
IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
IndexOperation::SettingsAndDocumentOperation {
document_import_tasks: tasks,
settings_tasks: other,
@ -173,11 +167,9 @@ impl Batch {
cleared_tasks: tasks,
settings_tasks: other,
..
} => RoaringBitmap::from_iter(tasks.iter().chain(other).map(|task| task.uid)),
} => tasks.iter().chain(other).map(|task| task.uid).collect(),
},
Batch::IndexSwap { task } => {
RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap()
}
Batch::IndexSwap { task } => vec![task.uid],
}
}
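Aside (not part of the diff), for context on the two RoaringBitmap constructors used in the RoaringBitmap-returning version of ids() above: from_sorted_iter returns an error when its input is not in ascending order, so unwrapping it on a single-element iterator cannot fail, while from_iter accepts any order. A small standalone sketch:

use roaring::RoaringBitmap;

fn main() {
    // from_iter accepts values in any order.
    let a = RoaringBitmap::from_iter([3u32, 1, 2]);
    // from_sorted_iter errors out on unsorted input, which is why calling it
    // on std::iter::once(task.uid) above can be unwrapped safely.
    let b = RoaringBitmap::from_sorted_iter([1u32, 2, 3]).unwrap();
    assert_eq!(a, b);
    assert!(RoaringBitmap::from_sorted_iter([3u32, 1, 2]).is_err());
}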
@ -961,22 +953,7 @@ impl IndexScheduler {
.set_currently_updating_index(Some((index_uid.clone(), index.clone())));
let mut index_wtxn = index.write_txn()?;
let mut tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
if index.is_corrupted(&index_wtxn)? {
tracing::error!("Aborting task due to corrupted index");
index_wtxn.abort();
for task in tasks.iter_mut() {
task.status = Status::Failed;
task.error = Some(Error::CorruptedIndex.into());
}
return Ok(tasks);
}
index.check_document_facet_consistency(&index_wtxn)?.check();
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
index_wtxn.commit()?;
// if the update processed successfully, we're going to store the new
@ -1354,7 +1331,6 @@ impl IndexScheduler {
} else {
unreachable!()
};
let deleted_documents = delete_document_by_filter(
index_wtxn,
filter,

View File

@ -48,8 +48,6 @@ impl From<DateField> for Code {
pub enum Error {
#[error("{1}")]
WithCustomErrorCode(Code, Box<Self>),
#[error("Received bad task id: {received} should be >= to {expected}.")]
BadTaskId { received: TaskId, expected: TaskId },
#[error("Index `{0}` not found.")]
IndexNotFound(String),
#[error("Index `{0}` already exists.")]
@ -138,8 +136,6 @@ pub enum Error {
CreateBatch(Box<Self>),
#[error("Corrupted task queue.")]
CorruptedTaskQueue,
#[error("Corrupted index.")]
CorruptedIndex,
#[error(transparent)]
TaskDatabaseUpdate(Box<Self>),
#[error(transparent)]
@ -165,7 +161,6 @@ impl Error {
match self {
Error::IndexNotFound(_)
| Error::WithCustomErrorCode(_, _)
| Error::BadTaskId { .. }
| Error::IndexAlreadyExists(_)
| Error::SwapDuplicateIndexFound(_)
| Error::SwapDuplicateIndexesFound(_)
@ -194,7 +189,6 @@ impl Error {
| Error::Anyhow(_) => true,
Error::CreateBatch(_)
| Error::CorruptedTaskQueue
| Error::CorruptedIndex
| Error::TaskDatabaseUpdate(_)
| Error::HeedTransaction(_) => false,
#[cfg(test)]
@ -211,7 +205,6 @@ impl ErrorCode for Error {
fn error_code(&self) -> Code {
match self {
Error::WithCustomErrorCode(code, _) => *code,
Error::BadTaskId { .. } => Code::BadRequest,
Error::IndexNotFound(_) => Code::IndexNotFound,
Error::IndexAlreadyExists(_) => Code::IndexAlreadyExists,
Error::SwapDuplicateIndexesFound(_) => Code::InvalidSwapDuplicateIndexFound,
@ -245,7 +238,6 @@ impl ErrorCode for Error {
Error::CorruptedDump => Code::Internal,
Error::TaskDatabaseUpdate(_) => Code::Internal,
Error::CreateBatch(_) => Code::Internal,
Error::CorruptedIndex => Code::Internal,
// This one should never be seen by the end user
Error::AbortedTask => Code::Internal,

View File

@ -48,7 +48,7 @@ impl RoFeatures {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action: "Modifying logs through the `/logs/*` routes",
disabled_action: "getting logs through the `/logs/stream` route",
feature: "logs route",
issue_link: "https://github.com/orgs/meilisearch/discussions/721",
}

View File

@ -15,7 +15,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
let IndexScheduler {
autobatching_enabled,
cleanup_enabled: _,
must_stop_processing: _,
processing_tasks,
file_store,

File diff suppressed because it is too large

View File

@ -1,90 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"uid": 0,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "succeeded",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 1,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": {
"message": "Index `doggo` already exists.",
"code": "index_already_exists",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#index_already_exists"
},
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "failed",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 2,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "enqueued",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 3,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "enqueued",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
}
]

View File

@ -1,90 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"uid": 0,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "succeeded",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 1,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": {
"message": "Index `doggo` already exists.",
"code": "index_already_exists",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#index_already_exists"
},
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "failed",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 2,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "enqueued",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
},
{
"uid": 3,
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]",
"error": null,
"canceledBy": null,
"details": {
"IndexInfo": {
"primary_key": null
}
},
"status": "enqueued",
"kind": {
"indexCreation": {
"index_uid": "doggo",
"primary_key": null
}
}
}
]

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::convert::TryInto;
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
use uuid::Uuid;

View File

@ -1,6 +1,7 @@
use std::borrow::Cow;
use std::cmp::Reverse;
use std::collections::HashSet;
use std::convert::{TryFrom, TryInto};
use std::fs::create_dir_all;
use std::path::Path;
use std::result::Result as StdResult;

View File

@ -11,7 +11,7 @@ edition.workspace = true
license.workspace = true
[dependencies]
actix-web = { version = "4.4.1", default-features = false }
actix-web = { version = "4.5.1", default-features = false }
anyhow = "1.0.79"
convert_case = "0.6.0"
csv = "1.3.0"
@ -54,5 +54,3 @@ thai = ["milli/thai"]
greek = ["milli/greek"]
# allow khmer specialized tokenization
khmer = ["milli/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["milli/vietnamese"]

View File

@ -1,6 +1,6 @@
use std::fmt::{self, Debug, Display};
use std::fs::File;
use std::io::{self, BufWriter, Write};
use std::io::{self, Seek, Write};
use std::marker::PhantomData;
use memmap2::MmapOptions;
@ -104,8 +104,8 @@ impl ErrorCode for DocumentFormatError {
}
/// Reads CSV from the input and writes an obkv batch to the writer.
pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
pub fn read_csv(file: &File, writer: impl Write + Seek, delimiter: u8) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(writer);
let mmap = unsafe { MmapOptions::new().map(file)? };
let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref());
builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?;
@ -116,9 +116,9 @@ pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result<u64> {
Ok(count as u64)
}
/// Reads JSON from a temporary file and writes an obkv batch to the writer.
pub fn read_json(file: &File, writer: impl Write) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
/// Reads JSON from a temporary file and writes an obkv batch to the writer.
pub fn read_json(file: &File, writer: impl Write + Seek) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(writer);
let mmap = unsafe { MmapOptions::new().map(file)? };
let mut deserializer = serde_json::Deserializer::from_slice(&mmap);
@ -151,8 +151,8 @@ pub fn read_json(file: &File, writer: impl Write) -> Result<u64> {
}
/// Reads NDJSON from a temporary file and writes an obkv batch to the writer.
pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
pub fn read_ndjson(file: &File, writer: impl Write + Seek) -> Result<u64> {
let mut builder = DocumentsBatchBuilder::new(writer);
let mmap = unsafe { MmapOptions::new().map(file)? };
for result in serde_json::Deserializer::from_slice(&mmap).into_iter() {
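Aside (not part of the diff) on the signature change above: one variant of read_csv/read_json/read_ndjson is bounded on Write only and wraps the writer in a BufWriter itself, the other is bounded on Write + Seek and therefore needs a seekable target such as a std::fs::File or a Cursor. A trait-bound sketch with made-up function names:

use std::io::{Cursor, Seek, Write};

// Stand-ins for the two writer bounds used above.
fn accepts_write(_writer: impl Write) {}
fn accepts_write_and_seek(_writer: impl Write + Seek) {}

fn main() {
    let mut buffer: Vec<u8> = Vec::new();
    // A plain Vec<u8> is enough for `impl Write` ...
    accepts_write(&mut buffer);
    // ... but not for `impl Write + Seek`: that bound needs a file-like target,
    // e.g. the std::fs::File returned by `update_file.as_file_mut()` or a Cursor.
    accepts_write_and_seek(Cursor::new(Vec::new()));
}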

View File

@ -14,18 +14,18 @@ default-run = "meilisearch"
[dependencies]
actix-cors = "0.7.0"
actix-http = { version = "3.5.1", default-features = false, features = [
actix-http = { version = "3.6.0", default-features = false, features = [
"compress-brotli",
"compress-gzip",
"rustls",
"rustls-0_21",
] }
actix-utils = "3.0.1"
actix-web = { version = "4.4.1", default-features = false, features = [
actix-web = { version = "4.5.1", default-features = false, features = [
"macros",
"compress-brotli",
"compress-gzip",
"cookies",
"rustls",
"rustls-0_21",
] }
actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true }
anyhow = { version = "1.0.79", features = ["backtrace"] }
@ -52,7 +52,7 @@ index-scheduler = { path = "../index-scheduler" }
indexmap = { version = "2.1.0", features = ["serde"] }
is-terminal = "0.4.10"
itertools = "0.11.0"
jsonwebtoken = "8.3.0"
jsonwebtoken = "9.2.0"
lazy_static = "1.4.0"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
@ -75,7 +75,7 @@ reqwest = { version = "0.11.23", features = [
"rustls-tls",
"json",
], default-features = false }
rustls = "0.20.8"
rustls = "0.21.6"
rustls-pemfile = "1.0.2"
segment = { version = "0.2.3", optional = true }
serde = { version = "1.0.195", features = ["derive"] }
@ -104,10 +104,9 @@ serde_urlencoded = "0.7.1"
termcolor = "1.4.1"
url = { version = "2.5.0", features = ["serde"] }
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["json"] }
tracing-subscriber = "0.3.18"
tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
tracing-actix-web = "0.7.9"
build-info = { version = "1.7.0", path = "../build-info" }
[dev-dependencies]
actix-rt = "2.9.0"
@ -132,6 +131,7 @@ reqwest = { version = "0.11.23", features = [
sha-1 = { version = "0.10.1", optional = true }
static-files = { version = "0.2.3", optional = true }
tempfile = { version = "3.9.0", optional = true }
vergen = { version = "7.5.1", default-features = false, features = ["git"] }
zip = { version = "0.6.6", optional = true }
[features]
@ -154,7 +154,6 @@ japanese = ["meilisearch-types/japanese"]
thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
vietnamese = ["meilisearch-types/vietnamese"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"

View File

@ -1,4 +1,17 @@
use vergen::{vergen, Config, SemverKind};
fn main() {
// Note: any code that needs VERGEN_ environment variables should take care to define them manually in the Dockerfile and pass them
// in the corresponding GitHub workflow (publish_docker.yml).
// This is due to the Dockerfile building the binary outside of the git directory.
let mut config = Config::default();
// allow using non-annotated tags
*config.git_mut().semver_kind_mut() = SemverKind::Lightweight;
if let Err(e) = vergen(config) {
println!("cargo:warning=vergen: {}", e);
}
#[cfg(feature = "mini-dashboard")]
mini_dashboard::setup_mini_dashboard().expect("Could not load the mini-dashboard assets");
}

View File

@ -28,9 +28,7 @@ use super::{
config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH,
};
use crate::analytics::Analytics;
use crate::option::{
default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot,
};
use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, ScheduleSnapshot};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::indexes::facet_search::FacetSearchQuery;
use crate::routes::tasks::TasksFilterQuery;
@ -252,12 +250,9 @@ impl super::Analytics for SegmentAnalytics {
struct Infos {
env: String,
experimental_enable_metrics: bool,
experimental_logs_mode: LogMode,
experimental_replication_parameters: bool,
experimental_enable_logs_route: bool,
experimental_reduce_indexing_memory_usage: bool,
experimental_max_number_of_batched_tasks: usize,
gpu_enabled: bool,
db_path: bool,
import_dump: bool,
dump_dir: bool,
@ -293,8 +288,6 @@ impl From<Opt> for Infos {
let Opt {
db_path,
experimental_enable_metrics,
experimental_logs_mode,
experimental_replication_parameters,
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
experimental_max_number_of_batched_tasks,
@ -342,11 +335,8 @@ impl From<Opt> for Infos {
Self {
env,
experimental_enable_metrics,
experimental_logs_mode,
experimental_replication_parameters,
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(),
dump_dir: dump_dir != PathBuf::from("dumps/"),
@ -473,9 +463,7 @@ impl Segment {
create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default())
{
// Replace the version number with the prototype name if any.
let version = if let Some(prototype) = build_info::DescribeResult::from_build()
.and_then(|describe| describe.as_prototype())
{
let version = if let Some(prototype) = crate::prototype_name() {
prototype
} else {
env!("CARGO_PKG_VERSION")

View File

@ -131,7 +131,6 @@ gen_seq! { SeqFromRequestFut3; A B C }
gen_seq! { SeqFromRequestFut4; A B C D }
gen_seq! { SeqFromRequestFut5; A B C D E }
gen_seq! { SeqFromRequestFut6; A B C D E F }
gen_seq! { SeqFromRequestFut7; A B C D E F G }
pin_project! {
#[project = ExtractProj]

View File

@ -97,25 +97,11 @@ pub type LogRouteType = tracing_subscriber::filter::Filtered<
tracing_subscriber::Registry,
>;
pub type SubscriberForSecondLayer = tracing_subscriber::layer::Layered<
tracing_subscriber::reload::Layer<LogRouteType, tracing_subscriber::Registry>,
tracing_subscriber::Registry,
>;
pub type LogStderrHandle =
tracing_subscriber::reload::Handle<LogStderrType, SubscriberForSecondLayer>;
pub type LogStderrType = tracing_subscriber::filter::Filtered<
Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
Targets,
SubscriberForSecondLayer,
>;
pub fn create_app(
index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>,
opt: Opt,
logs: (LogRouteHandle, LogStderrHandle),
logs: LogRouteHandle,
analytics: Arc<dyn Analytics>,
enable_dashboard: bool,
) -> actix_web::App<
@ -265,9 +251,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc<IndexScheduler>, Arc<
.name(String::from("register-snapshot-tasks"))
.spawn(move || loop {
thread::sleep(snapshot_delay);
if let Err(e) =
index_scheduler.register(KindWithContent::SnapshotCreation, None, false)
{
if let Err(e) = index_scheduler.register(KindWithContent::SnapshotCreation) {
error!("Error while registering snapshot: {}", e);
}
})
@ -302,7 +286,6 @@ fn open_or_create_database_unchecked(
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
indexer_config: (&opt.indexer_options).try_into()?,
autobatching_enabled: true,
cleanup_enabled: !opt.experimental_replication_parameters,
max_number_of_tasks: 1_000_000,
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
@ -426,9 +409,6 @@ fn import_dump(
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;
let embedder_configs = index.embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
@ -441,8 +421,6 @@ fn import_dump(
|| false,
)?;
let builder = builder.with_embedders(embedders);
let (builder, user_result) = builder.add_documents(reader)?;
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
@ -466,7 +444,7 @@ pub fn configure_data(
index_scheduler: Data<IndexScheduler>,
auth: Data<AuthController>,
opt: &Opt,
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
logs: LogRouteHandle,
analytics: Arc<dyn Analytics>,
) {
let http_payload_size_limit = opt.http_payload_size_limit.get_bytes() as usize;
@ -474,9 +452,7 @@ pub fn configure_data(
.app_data(index_scheduler)
.app_data(auth)
.app_data(web::Data::from(analytics))
.app_data(web::Data::new(logs_route))
.app_data(web::Data::new(logs_stderr))
.app_data(web::Data::new(opt.clone()))
.app_data(web::Data::new(logs))
.app_data(
web::JsonConfig::default()
.limit(http_payload_size_limit)
@ -536,3 +512,30 @@ pub fn dashboard(config: &mut web::ServiceConfig, enable_frontend: bool) {
pub fn dashboard(config: &mut web::ServiceConfig, _enable_frontend: bool) {
config.service(web::resource("/").route(web::get().to(routes::running)));
}
/// Parses the output of
/// [`VERGEN_GIT_SEMVER_LIGHTWEIGHT`](https://docs.rs/vergen/latest/vergen/struct.Git.html#instructions)
/// as a prototype name.
///
/// Returns `Some(prototype_name)` if the following conditions are met on this value:
///
/// 1. starts with `prototype-`,
/// 2. ends with `-<some_number>`,
/// 3. does not end with `<some_number>-<some_number>`.
///
/// Otherwise, returns `None`.
pub fn prototype_name() -> Option<&'static str> {
let prototype: &'static str = option_env!("VERGEN_GIT_SEMVER_LIGHTWEIGHT")?;
if !prototype.starts_with("prototype-") {
return None;
}
let mut rsplit_prototype = prototype.rsplit('-');
// last component MUST be a number
rsplit_prototype.next()?.parse::<u64>().ok()?;
// the component before the last SHALL NOT be a number
rsplit_prototype.next()?.parse::<u64>().err()?;
Some(prototype)
}
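Aside (not part of the diff): the three rules from the doc comment above, re-applied to sample strings for illustration; the real prototype_name() takes no argument and reads VERGEN_GIT_SEMVER_LIGHTWEIGHT at compile time.

// Illustrative re-statement of the prototype_name() checks over an arbitrary &str.
fn looks_like_prototype(describe: &str) -> bool {
    // rule 1: starts with `prototype-`
    if !describe.starts_with("prototype-") {
        return false;
    }
    let mut components = describe.rsplit('-');
    // rule 2: the last component must be a number
    let last_is_number = components.next().map_or(false, |c| c.parse::<u64>().is_ok());
    // rule 3: the component before the last must not be a number
    let previous_is_number = components.next().map_or(false, |c| c.parse::<u64>().is_ok());
    last_is_number && !previous_is_number
}

fn main() {
    assert!(looks_like_prototype("prototype-my-feature-2"));
    assert!(!looks_like_prototype("my-feature-2")); // fails rule 1
    assert!(!looks_like_prototype("prototype-my-feature")); // fails rule 2
    assert!(!looks_like_prototype("prototype-my-feature-2-1")); // fails rule 3
}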

View File

@ -1,5 +1,5 @@
use std::env;
use std::io::{stderr, LineWriter, Write};
use std::io::{stderr, Write};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;
@ -10,10 +10,8 @@ use actix_web::HttpServer;
use index_scheduler::IndexScheduler;
use is_terminal::IsTerminal;
use meilisearch::analytics::Analytics;
use meilisearch::option::LogMode;
use meilisearch::{
analytics, create_app, setup_meilisearch, LogRouteHandle, LogRouteType, LogStderrHandle,
LogStderrType, Opt, SubscriberForSecondLayer,
analytics, create_app, prototype_name, setup_meilisearch, LogRouteHandle, LogRouteType, Opt,
};
use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE};
use mimalloc::MiMalloc;
@ -25,44 +23,28 @@ use tracing_subscriber::Layer;
#[global_allocator]
static ALLOC: MiMalloc = MiMalloc;
fn default_log_route_layer() -> LogRouteType {
fn default_layer() -> LogRouteType {
None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF))
}
fn default_log_stderr_layer(opt: &Opt) -> LogStderrType {
let layer = tracing_subscriber::fmt::layer()
.with_writer(|| LineWriter::new(std::io::stderr()))
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
let layer = match opt.experimental_logs_mode {
LogMode::Human => Box::new(layer)
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
LogMode::Json => Box::new(layer.json())
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>,
};
layer.with_filter(
tracing_subscriber::filter::Targets::new()
.with_target("", LevelFilter::from_str(&opt.log_level.to_string()).unwrap()),
)
}
/// does all the setup before meilisearch is launched
fn setup(opt: &Opt) -> anyhow::Result<(LogRouteHandle, LogStderrHandle)> {
let (route_layer, route_layer_handle) =
tracing_subscriber::reload::Layer::new(default_log_route_layer());
fn setup(opt: &Opt) -> anyhow::Result<LogRouteHandle> {
let (route_layer, route_layer_handle) = tracing_subscriber::reload::Layer::new(default_layer());
let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer;
let (stderr_layer, stderr_layer_handle) =
tracing_subscriber::reload::Layer::new(default_log_stderr_layer(opt));
let route_layer: tracing_subscriber::reload::Layer<_, _> = route_layer;
let subscriber = tracing_subscriber::registry().with(route_layer).with(stderr_layer);
let subscriber = tracing_subscriber::registry().with(route_layer).with(
tracing_subscriber::fmt::layer()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE)
.with_filter(
tracing_subscriber::filter::LevelFilter::from_str(&opt.log_level.to_string())
.unwrap(),
),
);
// set the subscriber as the default for the application
tracing::subscriber::set_global_default(subscriber).unwrap();
Ok((route_layer_handle, stderr_layer_handle))
Ok(route_layer_handle)
}
fn on_panic(info: &std::panic::PanicInfo) {
@ -74,9 +56,6 @@ fn on_panic(info: &std::panic::PanicInfo) {
async fn main() -> anyhow::Result<()> {
let (opt, config_read_from) = Opt::try_build()?;
std::env::var("MEILI_LOUIS_PUSHOVER_USER").expect("MEILI_LOUIS_PUSHOVER_USER not set");
std::env::var("MEILI_LOUIS_PUSHOVER_APP").expect("MEILI_LOUIS_PUSHOVER_APP not set");
std::panic::set_hook(Box::new(on_panic));
anyhow::ensure!(
@ -131,7 +110,7 @@ async fn run_http(
index_scheduler: Arc<IndexScheduler>,
auth_controller: Arc<AuthController>,
opt: Opt,
logs: (LogRouteHandle, LogStderrHandle),
logs: LogRouteHandle,
analytics: Arc<dyn Analytics>,
) -> anyhow::Result<()> {
let enable_dashboard = &opt.env == "development";
@ -154,7 +133,7 @@ async fn run_http(
.keep_alive(KeepAlive::Os);
if let Some(config) = opt_clone.get_ssl_config()? {
http_server.bind_rustls(opt_clone.http_addr, config)?.run().await?;
http_server.bind_rustls_021(opt_clone.http_addr, config)?.run().await?;
} else {
http_server.bind(&opt_clone.http_addr)?.run().await?;
}
@ -166,8 +145,8 @@ pub fn print_launch_resume(
analytics: Arc<dyn Analytics>,
config_read_from: Option<PathBuf>,
) {
let build_info = build_info::BuildInfo::from_build();
let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown");
let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown");
let protocol =
if opt.ssl_cert_path.is_some() && opt.ssl_key_path.is_some() { "https" } else { "http" };
let ascii_name = r#"
@ -192,18 +171,10 @@ pub fn print_launch_resume(
eprintln!("Database path:\t\t{:?}", opt.db_path);
eprintln!("Server listening on:\t\"{}://{}\"", protocol, opt.http_addr);
eprintln!("Environment:\t\t{:?}", opt.env);
eprintln!("Commit SHA:\t\t{:?}", build_info.commit_sha1.unwrap_or("unknown"));
eprintln!(
"Commit date:\t\t{:?}",
build_info
.commit_timestamp
.and_then(|commit_timestamp| commit_timestamp
.format(&time::format_description::well_known::Rfc3339)
.ok())
.unwrap_or("unknown".into())
);
eprintln!("Commit SHA:\t\t{:?}", commit_sha.to_string());
eprintln!("Commit date:\t\t{:?}", commit_date.to_string());
eprintln!("Package version:\t{:?}", env!("CARGO_PKG_VERSION").to_string());
if let Some(prototype) = build_info.describe.and_then(|describe| describe.as_prototype()) {
if let Some(prototype) = prototype_name() {
eprintln!("Prototype:\t\t{:?}", prototype);
}

View File

@ -1,3 +1,4 @@
use std::convert::TryFrom;
use std::env::VarError;
use std::ffi::OsStr;
use std::fmt::Display;
@ -50,8 +51,6 @@ const MEILI_IGNORE_MISSING_DUMP: &str = "MEILI_IGNORE_MISSING_DUMP";
const MEILI_IGNORE_DUMP_IF_DB_EXISTS: &str = "MEILI_IGNORE_DUMP_IF_DB_EXISTS";
const MEILI_DUMP_DIR: &str = "MEILI_DUMP_DIR";
const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
const MEILI_EXPERIMENTAL_LOGS_MODE: &str = "MEILI_EXPERIMENTAL_LOGS_MODE";
const MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS: &str = "MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS";
const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE";
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
@ -80,39 +79,6 @@ const DEFAULT_LOG_EVERY_N: usize = 100_000;
pub const INDEX_SIZE: u64 = 2 * 1024 * 1024 * 1024 * 1024; // 2 TiB
pub const TASK_DB_SIZE: u64 = 20 * 1024 * 1024 * 1024; // 20 GiB
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum LogMode {
#[default]
Human,
Json,
}
impl Display for LogMode {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
LogMode::Human => Display::fmt("HUMAN", f),
LogMode::Json => Display::fmt("JSON", f),
}
}
}
impl FromStr for LogMode {
type Err = LogModeError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.trim().to_lowercase().as_str() {
"human" => Ok(LogMode::Human),
"json" => Ok(LogMode::Json),
_ => Err(LogModeError(s.to_owned())),
}
}
}
#[derive(Debug, thiserror::Error)]
#[error("Unsupported log mode level `{0}`. Supported values are `HUMAN` and `JSON`.")]
pub struct LogModeError(String);
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
#[serde(rename_all = "UPPERCASE")]
pub enum LogLevel {
@ -344,30 +310,13 @@ pub struct Opt {
#[serde(default)]
pub experimental_enable_metrics: bool,
/// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
///
/// Change the mode of the logs on the console.
#[clap(long, env = MEILI_EXPERIMENTAL_LOGS_MODE, default_value_t)]
#[serde(default)]
pub experimental_logs_mode: LogMode,
/// Experimental logs route feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/721>
///
/// Enables the log routes on the `POST /logs/stream`, `POST /logs/stderr` endpoints, and the `DELETE /logs/stream` to stop receiving logs.
/// Enables the log route on the `POST /logs/stream` endpoint and the `DELETE /logs/stream` to stop receiving logs.
#[clap(long, env = MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE)]
#[serde(default)]
pub experimental_enable_logs_route: bool,
/// Enables multiple features that help you run Meilisearch in a replicated context.
/// For more information, see: <https://github.com/orgs/meilisearch/discussions/725>
///
/// - /!\ Disables the automatic clean-up of old processed tasks; you're in charge of that now
/// - Lets you specify a custom task ID upon registering a task
/// - Lets you dry-register a task (you get an answer from the route, but nothing is actually registered in Meilisearch and it won't be processed)
#[clap(long, env = MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS)]
#[serde(default)]
pub experimental_replication_parameters: bool,
/// Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
#[clap(long, env = MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE)]
#[serde(default)]
@ -473,9 +422,7 @@ impl Opt {
#[cfg(feature = "analytics")]
no_analytics,
experimental_enable_metrics,
experimental_logs_mode,
experimental_enable_logs_route,
experimental_replication_parameters,
experimental_reduce_indexing_memory_usage,
} = self;
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
@ -532,14 +479,6 @@ impl Opt {
MEILI_EXPERIMENTAL_ENABLE_METRICS,
experimental_enable_metrics.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_LOGS_MODE,
experimental_logs_mode.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_REPLICATION_PARAMETERS,
experimental_replication_parameters.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE,
experimental_enable_logs_route.to_string(),
@ -564,11 +503,11 @@ impl Opt {
}
if self.ssl_require_auth {
let verifier = AllowAnyAuthenticatedClient::new(client_auth_roots);
config.with_client_cert_verifier(verifier)
config.with_client_cert_verifier(Arc::from(verifier))
} else {
let verifier =
AllowAnyAnonymousOrAuthenticatedClient::new(client_auth_roots);
config.with_client_cert_verifier(verifier)
config.with_client_cert_verifier(Arc::from(verifier))
}
}
None => config.with_no_client_auth(),

View File

@ -10,7 +10,7 @@ use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::keys::{CreateApiKey, Key, PatchApiKey};
use meilisearch_types::keys::{Action, CreateApiKey, Key, PatchApiKey};
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use uuid::Uuid;

View File

@ -11,8 +11,7 @@ use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
use crate::Opt;
use crate::routes::SummarizedTaskView;
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
@ -22,7 +21,6 @@ pub async fn create_dump(
index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Dump Created".to_string(), json!({}), Some(&req));
@ -31,12 +29,8 @@ pub async fn create_dump(
keys: auth_controller.list_keys()?,
instance_uid: analytics.instance_uid().cloned(),
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Create dump");
Ok(HttpResponse::Accepted().json(task))

View File

@ -7,7 +7,7 @@ use bstr::ByteSlice as _;
use deserr::actix_web::{AwebJson, AwebQueryParameter};
use deserr::Deserr;
use futures::StreamExt;
use index_scheduler::{IndexScheduler, TaskId};
use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::document_formats::{read_csv, read_json, read_ndjson, PayloadType};
@ -36,11 +36,8 @@ use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::payload::Payload;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
};
use crate::routes::{PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
use crate::search::parse_filter;
use crate::Opt;
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
@ -122,7 +119,6 @@ pub async fn delete_document(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
path: web::Path<DocumentParam>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let DocumentParam { index_uid, document_id } = path.into_inner();
@ -134,13 +130,9 @@ pub async fn delete_document(
index_uid: index_uid.to_string(),
documents_ids: vec![document_id],
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
debug!("returns: {:?}", task);
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete document");
Ok(HttpResponse::Accepted().json(task))
}
@ -275,7 +267,6 @@ pub async fn replace_documents(
params: AwebQueryParameter<UpdateDocumentsQuery, DeserrQueryParamError>,
body: Payload,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -286,8 +277,6 @@ pub async fn replace_documents(
analytics.add_documents(&params, index_scheduler.index(&index_uid).is_err(), &req);
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task = document_addition(
extract_mime_type(&req)?,
index_scheduler,
@ -296,8 +285,6 @@ pub async fn replace_documents(
params.csv_delimiter,
body,
IndexDocumentsMethod::ReplaceDocuments,
uid,
dry_run,
allow_index_creation,
)
.await?;
@ -312,7 +299,6 @@ pub async fn update_documents(
params: AwebQueryParameter<UpdateDocumentsQuery, DeserrQueryParamError>,
body: Payload,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -323,8 +309,6 @@ pub async fn update_documents(
analytics.update_documents(&params, index_scheduler.index(&index_uid).is_err(), &req);
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task = document_addition(
extract_mime_type(&req)?,
index_scheduler,
@ -333,8 +317,6 @@ pub async fn update_documents(
params.csv_delimiter,
body,
IndexDocumentsMethod::UpdateDocuments,
uid,
dry_run,
allow_index_creation,
)
.await?;
@ -352,8 +334,6 @@ async fn document_addition(
csv_delimiter: Option<u8>,
mut body: Payload,
method: IndexDocumentsMethod,
task_id: Option<TaskId>,
dry_run: bool,
allow_index_creation: bool,
) -> Result<SummarizedTaskView, MeilisearchHttpError> {
let format = match (
@ -386,7 +366,7 @@ async fn document_addition(
}
};
let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?;
let (uuid, mut update_file) = index_scheduler.create_update_file()?;
let temp_file = match tempfile() {
Ok(file) => file,
@ -425,9 +405,11 @@ async fn document_addition(
let read_file = buffer.into_inner().into_std().await;
let documents_count = tokio::task::spawn_blocking(move || {
let documents_count = match format {
PayloadType::Json => read_json(&read_file, &mut update_file)?,
PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?,
PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?,
PayloadType::Json => read_json(&read_file, update_file.as_file_mut())?,
PayloadType::Csv { delimiter } => {
read_csv(&read_file, update_file.as_file_mut(), delimiter)?
}
PayloadType::Ndjson => read_ndjson(&read_file, update_file.as_file_mut())?,
};
// we NEED to persist the file here because we moved the `update_file` into another task.
update_file.persist()?;
@ -468,9 +450,7 @@ async fn document_addition(
};
let scheduler = index_scheduler.clone();
let task = match tokio::task::spawn_blocking(move || scheduler.register(task, task_id, dry_run))
.await?
{
let task = match tokio::task::spawn_blocking(move || scheduler.register(task)).await? {
Ok(task) => task,
Err(e) => {
index_scheduler.delete_update_file(uuid)?;
@ -486,7 +466,6 @@ pub async fn delete_documents_batch(
index_uid: web::Path<String>,
body: web::Json<Vec<Value>>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Delete documents by batch");
@ -501,12 +480,8 @@ pub async fn delete_documents_batch(
let task =
KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), documents_ids: ids };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete documents by batch");
Ok(HttpResponse::Accepted().json(task))
@ -524,7 +499,6 @@ pub async fn delete_documents_by_filter(
index_uid: web::Path<String>,
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Delete documents by filter");
@ -542,12 +516,8 @@ pub async fn delete_documents_by_filter(
.map_err(|err| ResponseError::from_msg(err.message, Code::InvalidDocumentFilter))?;
let task = KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr: filter };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete documents by filter");
Ok(HttpResponse::Accepted().json(task))
@ -557,19 +527,14 @@ pub async fn clear_all_documents(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.delete_documents(DocumentDeletionKind::ClearAll, &req);
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete all documents");
Ok(HttpResponse::Accepted().json(task))

View File

@ -17,13 +17,11 @@ use serde_json::json;
use time::OffsetDateTime;
use tracing::debug;
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
use super::{Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::is_dry_run;
use crate::Opt;
pub mod documents;
pub mod facet_search;
@ -125,7 +123,6 @@ pub async fn create_index(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
body: AwebJson<IndexCreateRequest, DeserrJsonError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Create index");
@ -140,12 +137,8 @@ pub async fn create_index(
);
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Create index");
Ok(HttpResponse::Accepted().json(task))
@ -197,7 +190,6 @@ pub async fn update_index(
index_uid: web::Path<String>,
body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?body, "Update index");
@ -214,12 +206,8 @@ pub async fn update_index(
primary_key: body.primary_key,
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Update index");
Ok(HttpResponse::Accepted().json(task))
@ -228,17 +216,11 @@ pub async fn update_index(
pub async fn delete_index(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_DELETE }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
req: HttpRequest,
opt: web::Data<Opt>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let task = KindWithContent::IndexDeletion { index_uid: index_uid.into_inner() };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete index");
Ok(HttpResponse::Accepted().json(task))

View File

@ -15,8 +15,7 @@ use tracing::debug;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
use crate::Opt;
use crate::routes::SummarizedTaskView;
#[macro_export]
macro_rules! make_setting_route {
@ -35,8 +34,7 @@ macro_rules! make_setting_route {
use $crate::extractors::authentication::policies::*;
use $crate::extractors::authentication::GuardedData;
use $crate::extractors::sequential_extractor::SeqHandler;
use $crate::Opt;
use $crate::routes::{is_dry_run, get_task_id, SummarizedTaskView};
use $crate::routes::SummarizedTaskView;
pub async fn delete(
index_scheduler: GuardedData<
@ -44,8 +42,6 @@ macro_rules! make_setting_route {
Data<IndexScheduler>,
>,
index_uid: web::Path<String>,
req: HttpRequest,
opt: web::Data<Opt>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -60,10 +56,8 @@ macro_rules! make_setting_route {
is_deletion: true,
allow_index_creation,
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
tokio::task::spawn_blocking(move || index_scheduler.register(task))
.await??
.into();
@ -79,7 +73,6 @@ macro_rules! make_setting_route {
index_uid: actix_web::web::Path<String>,
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
req: HttpRequest,
opt: web::Data<Opt>,
$analytics_var: web::Data<dyn Analytics>,
) -> std::result::Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -112,10 +105,8 @@ macro_rules! make_setting_route {
is_deletion: false,
allow_index_creation,
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
tokio::task::spawn_blocking(move || index_scheduler.register(task))
.await??
.into();
@ -661,7 +652,6 @@ pub async fn update_all(
index_uid: web::Path<String>,
body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -777,12 +767,8 @@ pub async fn update_all(
is_deletion: false,
allow_index_creation,
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Update all settings");
Ok(HttpResponse::Accepted().json(task))
@ -804,8 +790,6 @@ pub async fn get_all(
pub async fn delete_all(
index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_UPDATE }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,
req: HttpRequest,
opt: web::Data<Opt>,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
@ -819,12 +803,8 @@ pub async fn delete_all(
is_deletion: true,
allow_index_creation,
};
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Delete all settings");
Ok(HttpResponse::Accepted().json(task))

View File

@ -22,15 +22,14 @@ use crate::error::MeilisearchHttpError;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::{LogRouteHandle, LogStderrHandle};
use crate::LogRouteHandle;
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(
web::resource("stream")
.route(web::post().to(SeqHandler(get_logs)))
.route(web::delete().to(SeqHandler(cancel_logs))),
)
.service(web::resource("stderr").route(web::post().to(SeqHandler(update_stderr_target))));
);
}
#[derive(Debug, Default, Clone, Copy, Deserr, PartialEq, Eq)]
@ -38,7 +37,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub enum LogMode {
#[default]
Human,
Json,
Profile,
}
@ -167,18 +165,7 @@ fn make_layer<
let fmt_layer = tracing_subscriber::fmt::layer()
.with_writer(move || LogWriter { sender: sender.clone() })
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
let stream = byte_stream(receiver, guard);
(Box::new(fmt_layer) as Box<dyn Layer<S> + Send + Sync>, Box::pin(stream))
}
LogMode::Json => {
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel();
let fmt_layer = tracing_subscriber::fmt::layer()
.with_writer(move || LogWriter { sender: sender.clone() })
.json()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE);
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE);
let stream = byte_stream(receiver, guard);
(Box::new(fmt_layer) as Box<dyn Layer<S> + Send + Sync>, Box::pin(stream))
@ -292,27 +279,3 @@ pub async fn cancel_logs(
Ok(HttpResponse::NoContent().finish())
}
#[derive(Debug, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct UpdateStderrLogs {
#[deserr(default = "info".parse().unwrap(), try_from(&String) = MyTargets::from_str -> DeserrJsonError<BadRequest>)]
target: MyTargets,
}
pub async fn update_stderr_target(
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
logs: Data<LogStderrHandle>,
body: AwebJson<UpdateStderrLogs, DeserrJsonError>,
) -> Result<HttpResponse, ResponseError> {
index_scheduler.features().check_logs_route()?;
let opt = body.into_inner();
logs.modify(|layer| {
*layer.filter_mut() = opt.target.0.clone();
})
.unwrap();
Ok(HttpResponse::NoContent().finish())
}
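For context on what this downgrade removes: the `Json` mode rendered each event of the `/logs/stream` route as one JSON object instead of human-readable text. A minimal, standalone sketch of that output style, assuming tracing-subscriber with its `fmt` and `json` features rather than the project's layered setup:

use tracing_subscriber::fmt::format::FmtSpan;

fn main() {
    // Same building blocks as the removed LogMode::Json branch: a fmt layer,
    // switched to JSON serialization, emitting an event when each span closes.
    tracing_subscriber::fmt()
        .json()
        .with_span_events(FmtSpan::CLOSE)
        .init();

    tracing::info!(mode = "json", "each event becomes one JSON object per line");
}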

View File

@ -4,7 +4,7 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use index_scheduler::IndexScheduler;
use meilisearch_auth::AuthController;
use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::error::ResponseError;
use meilisearch_types::settings::{Settings, Unchecked};
use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
use serde::{Deserialize, Serialize};
@ -15,7 +15,6 @@ use tracing::debug;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::Opt;
const PAGINATION_DEFAULT_LIMIT: usize = 20;
@ -46,56 +45,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
.service(web::scope("/experimental-features").configure(features::configure));
}
pub fn get_task_id(req: &HttpRequest, opt: &Opt) -> Result<Option<TaskId>, ResponseError> {
if !opt.experimental_replication_parameters {
return Ok(None);
}
let task_id = req
.headers()
.get("TaskId")
.map(|header| {
header.to_str().map_err(|e| {
ResponseError::from_msg(
format!("TaskId is not a valid utf-8 string: {e}"),
Code::BadRequest,
)
})
})
.transpose()?
.map(|s| {
s.parse::<TaskId>().map_err(|e| {
ResponseError::from_msg(
format!(
"Could not parse the TaskId as a {}: {e}",
std::any::type_name::<TaskId>(),
),
Code::BadRequest,
)
})
})
.transpose()?;
Ok(task_id)
}
pub fn is_dry_run(req: &HttpRequest, opt: &Opt) -> Result<bool, ResponseError> {
if !opt.experimental_replication_parameters {
return Ok(false);
}
Ok(req
.headers()
.get("DryRun")
.map(|header| {
header.to_str().map_err(|e| {
ResponseError::from_msg(
format!("DryRun is not a valid utf-8 string: {e}"),
Code::BadRequest,
)
})
})
.transpose()?
.map_or(false, |s| s.to_lowercase() == "true"))
}
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct SummarizedTaskView {
@ -359,18 +308,12 @@ async fn get_version(
) -> HttpResponse {
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
let build_info = build_info::BuildInfo::from_build();
let commit_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown");
let commit_date = option_env!("VERGEN_GIT_COMMIT_TIMESTAMP").unwrap_or("unknown");
HttpResponse::Ok().json(VersionResponse {
commit_sha: build_info.commit_sha1.unwrap_or("unknown").to_string(),
commit_date: build_info
.commit_timestamp
.and_then(|commit_timestamp| {
commit_timestamp
.format(&time::format_description::well_known::Iso8601::DEFAULT)
.ok()
})
.unwrap_or("unknown".into()),
commit_sha: commit_sha.to_string(),
commit_date: commit_date.to_string(),
pkg_version: env!("CARGO_PKG_VERSION").to_string(),
})
}
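The removed `get_task_id` and `is_dry_run` helpers above only read headers when the `experimental_replication_parameters` option is enabled: `TaskId` pins the uid of the enqueued task (and must be at least the next uid, as the removed `send_task_id` test further down shows), while `DryRun: true` makes the route answer with a summary without registering anything. A hedged sketch of the caller side, mirroring the removed tests rather than adding any new behavior:

use actix_web::http::header::ContentType;
use actix_web::test::TestRequest;

// Builds the kind of request the removed feature accepted; the uid `25` and the
// JSON body are illustrative values taken from the removed tests below.
fn replicated_index_creation(body: String) -> TestRequest {
    TestRequest::post()
        .uri("/indexes")
        .insert_header(("TaskId", "25"))   // pin the uid the task will receive
        .insert_header(("DryRun", "true")) // validate and summarize, register nothing
        .insert_header(ContentType::json())
        .set_payload(body)
}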

View File

@ -10,8 +10,7 @@ use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView};
use crate::Opt;
use crate::routes::SummarizedTaskView;
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
@ -20,18 +19,13 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub async fn create_snapshot(
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
let task = KindWithContent::SnapshotCreation;
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
debug!(returns = ?task, "Create snapshot");
Ok(HttpResponse::Accepted().json(task))

View File

@ -10,13 +10,12 @@ use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::tasks::{IndexSwap, KindWithContent};
use serde_json::json;
use super::{get_task_id, is_dry_run, SummarizedTaskView};
use super::SummarizedTaskView;
use crate::analytics::Analytics;
use crate::error::MeilisearchHttpError;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
use crate::Opt;
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::post().to(SeqHandler(swap_indexes))));
@ -33,7 +32,6 @@ pub async fn swap_indexes(
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner();
@ -62,11 +60,7 @@ pub async fn swap_indexes(
}
let task = KindWithContent::IndexSwap { swaps };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task: SummarizedTaskView =
tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run))
.await??
.into();
tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
Ok(HttpResponse::Accepted().json(task))
}

View File

@ -18,12 +18,11 @@ use time::macros::format_description;
use time::{Date, Duration, OffsetDateTime, Time};
use tokio::task;
use super::{get_task_id, is_dry_run, SummarizedTaskView};
use super::SummarizedTaskView;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::Opt;
const DEFAULT_LIMIT: u32 = 20;
@ -162,7 +161,6 @@ async fn cancel_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner();
@ -199,11 +197,7 @@ async fn cancel_tasks(
let task_cancelation =
KindWithContent::TaskCancelation { query: format!("?{}", req.query_string()), tasks };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task =
task::spawn_blocking(move || index_scheduler.register(task_cancelation, uid, dry_run))
.await??;
let task = task::spawn_blocking(move || index_scheduler.register(task_cancelation)).await??;
let task: SummarizedTaskView = task.into();
Ok(HttpResponse::Ok().json(task))
@ -213,7 +207,6 @@ async fn delete_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_DELETE }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
req: HttpRequest,
opt: web::Data<Opt>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let params = params.into_inner();
@ -249,10 +242,7 @@ async fn delete_tasks(
let task_deletion =
KindWithContent::TaskDeletion { query: format!("?{}", req.query_string()), tasks };
let uid = get_task_id(&req, &opt)?;
let dry_run = is_dry_run(&req, &opt)?;
let task = task::spawn_blocking(move || index_scheduler.register(task_deletion, uid, dry_run))
.await??;
let task = task::spawn_blocking(move || index_scheduler.register(task_deletion)).await??;
let task: SummarizedTaskView = task.into();
Ok(HttpResponse::Ok().json(task))

View File

@ -100,11 +100,16 @@ impl Index<'_> {
pub async fn raw_add_documents(
&self,
payload: &str,
headers: Vec<(&str, &str)>,
content_type: Option<&str>,
query_parameter: &str,
) -> (Value, StatusCode) {
let url = format!("/indexes/{}/documents{}", urlencode(self.uid.as_ref()), query_parameter);
self.service.post_str(url, payload, headers).await
if let Some(content_type) = content_type {
self.service.post_str(url, payload, vec![("Content-Type", content_type)]).await
} else {
self.service.post_str(url, payload, Vec::new()).await
}
}
pub async fn update_documents(

View File

@ -9,7 +9,7 @@ use actix_web::http::StatusCode;
use byte_unit::{Byte, ByteUnit};
use clap::Parser;
use meilisearch::option::{IndexerOpts, MaxMemory, Opt};
use meilisearch::{analytics, create_app, setup_meilisearch, SubscriberForSecondLayer};
use meilisearch::{analytics, create_app, setup_meilisearch};
use once_cell::sync::Lazy;
use tempfile::TempDir;
use tokio::time::sleep;
@ -87,20 +87,12 @@ impl Server {
tracing_subscriber::reload::Layer::new(None.with_filter(
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
));
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
(Box::new(
tracing_subscriber::fmt::layer()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
)
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
.with_filter(tracing_subscriber::filter::Targets::new()),
);
actix_web::test::init_service(create_app(
self.service.index_scheduler.clone().into(),
self.service.auth.clone().into(),
self.service.options.clone(),
(route_layer_handle, stderr_layer_handle),
route_layer_handle,
analytics::MockAnalytics::new(&self.service.options),
true,
))

View File

@ -5,7 +5,7 @@ use actix_web::http::StatusCode;
use actix_web::test;
use actix_web::test::TestRequest;
use index_scheduler::IndexScheduler;
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
use meilisearch::{analytics, create_app, Opt};
use meilisearch_auth::AuthController;
use tracing::level_filters::LevelFilter;
use tracing_subscriber::Layer;
@ -111,20 +111,12 @@ impl Service {
tracing_subscriber::reload::Layer::new(None.with_filter(
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
));
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
(Box::new(
tracing_subscriber::fmt::layer()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
)
as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
.with_filter(tracing_subscriber::filter::Targets::new()),
);
let app = test::init_service(create_app(
self.index_scheduler.clone().into(),
self.auth.clone().into(),
self.options.clone(),
(route_layer_handle, stderr_layer_handle),
route_layer_handle,
analytics::MockAnalytics::new(&self.options),
true,
))

View File

@ -1,11 +1,10 @@
use actix_web::test;
use meili_snap::{json_string, snapshot};
use meilisearch::Opt;
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;
use crate::common::encoder::Encoder;
use crate::common::{default_settings, GetAllDocumentsOptions, Server, Value};
use crate::common::{GetAllDocumentsOptions, Server, Value};
use crate::json;
/// This is the basic usage of our API and every other test uses the content-type application/json
@ -2158,49 +2157,3 @@ async fn batch_several_documents_addition() {
assert_eq!(code, 200, "failed with `{}`", response);
assert_eq!(response["results"].as_array().unwrap().len(), 120);
}
#[actix_rt::test]
async fn dry_register_file() {
let temp = tempfile::tempdir().unwrap();
let options =
Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) };
let server = Server::new_with_options(options).await.unwrap();
let index = server.index("tamo");
let documents = r#"
{
"id": "12",
"doggo": "kefir"
}
"#;
let (response, code) = index
.raw_add_documents(
documents,
vec![("Content-Type", "application/json"), ("DryRun", "true")],
"",
)
.await;
snapshot!(response, @r###"
{
"taskUid": 0,
"indexUid": "tamo",
"status": "enqueued",
"type": "documentAdditionOrUpdate",
"enqueuedAt": "[date]"
}
"###);
snapshot!(code, @"202 Accepted");
let (response, code) = index.get_task(response.uid()).await;
snapshot!(response, @r###"
{
"message": "Task `0` not found.",
"code": "task_not_found",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#task_not_found"
}
"###);
snapshot!(code, @"404 Not Found");
}

View File

@ -209,8 +209,7 @@ async fn replace_documents_missing_payload() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) =
index.raw_add_documents("", vec![("Content-Type", "application/json")], "").await;
let (response, code) = index.raw_add_documents("", Some("application/json"), "").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
@ -221,8 +220,7 @@ async fn replace_documents_missing_payload() {
}
"###);
let (response, code) =
index.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "").await;
let (response, code) = index.raw_add_documents("", Some("application/x-ndjson"), "").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
@ -233,8 +231,7 @@ async fn replace_documents_missing_payload() {
}
"###);
let (response, code) =
index.raw_add_documents("", vec![("Content-Type", "text/csv")], "").await;
let (response, code) = index.raw_add_documents("", Some("text/csv"), "").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
@ -290,7 +287,7 @@ async fn replace_documents_missing_content_type() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.raw_add_documents("", Vec::new(), "").await;
let (response, code) = index.raw_add_documents("", None, "").await;
snapshot!(code, @"415 Unsupported Media Type");
snapshot!(json_string!(response), @r###"
{
@ -302,7 +299,7 @@ async fn replace_documents_missing_content_type() {
"###);
// even with a csv delimiter specified this error is triggered first
let (response, code) = index.raw_add_documents("", Vec::new(), "?csvDelimiter=;").await;
let (response, code) = index.raw_add_documents("", None, "?csvDelimiter=;").await;
snapshot!(code, @"415 Unsupported Media Type");
snapshot!(json_string!(response), @r###"
{
@ -348,7 +345,7 @@ async fn replace_documents_bad_content_type() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.raw_add_documents("", vec![("Content-Type", "doggo")], "").await;
let (response, code) = index.raw_add_documents("", Some("doggo"), "").await;
snapshot!(code, @"415 Unsupported Media Type");
snapshot!(json_string!(response), @r###"
{
@ -382,9 +379,8 @@ async fn replace_documents_bad_csv_delimiter() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter")
.await;
let (response, code) =
index.raw_add_documents("", Some("application/json"), "?csvDelimiter").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
@ -395,9 +391,8 @@ async fn replace_documents_bad_csv_delimiter() {
}
"###);
let (response, code) = index
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=doggo")
.await;
let (response, code) =
index.raw_add_documents("", Some("application/json"), "?csvDelimiter=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
@ -409,11 +404,7 @@ async fn replace_documents_bad_csv_delimiter() {
"###);
let (response, code) = index
.raw_add_documents(
"",
vec![("Content-Type", "application/json")],
&format!("?csvDelimiter={}", encode("🍰")),
)
.raw_add_documents("", Some("application/json"), &format!("?csvDelimiter={}", encode("🍰")))
.await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
@ -478,9 +469,8 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index
.raw_add_documents("", vec![("Content-Type", "application/json")], "?csvDelimiter=a")
.await;
let (response, code) =
index.raw_add_documents("", Some("application/json"), "?csvDelimiter=a").await;
snapshot!(code, @"415 Unsupported Media Type");
snapshot!(json_string!(response), @r###"
{
@ -491,9 +481,8 @@ async fn replace_documents_csv_delimiter_with_bad_content_type() {
}
"###);
let (response, code) = index
.raw_add_documents("", vec![("Content-Type", "application/x-ndjson")], "?csvDelimiter=a")
.await;
let (response, code) =
index.raw_add_documents("", Some("application/x-ndjson"), "?csvDelimiter=a").await;
snapshot!(code, @"415 Unsupported Media Type");
snapshot!(json_string!(response), @r###"
{

View File

@ -1,4 +1,4 @@
use meili_snap::{json_string, snapshot};
use meili_snap::snapshot;
use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, Server};
@ -209,93 +209,3 @@ async fn error_update_documents_missing_document_id() {
"https://docs.meilisearch.com/errors#missing_document_id"
);
}
#[actix_rt::test]
async fn update_faceted_document() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index
.update_settings(json!({
"rankingRules": ["facet:asc"],
}))
.await;
assert_eq!("202", code.as_str(), "{:?}", response);
index.wait_task(0).await;
let documents: Vec<_> = (0..1000)
.map(|id| {
json!({
"doc_id": id,
"facet": (id/3),
})
})
.collect();
let (_response, code) = index.add_documents(documents.into(), None).await;
assert_eq!(code, 202);
index.wait_task(1).await;
let documents = json!([
{
"doc_id": 9,
"facet": 1.5,
}
]);
let (response, code) = index.update_documents(documents, None).await;
assert_eq!(code, 202, "response: {}", response);
index.wait_task(2).await;
index
.search(json!({"limit": 10}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"doc_id": 0,
"facet": 0
},
{
"doc_id": 1,
"facet": 0
},
{
"doc_id": 2,
"facet": 0
},
{
"doc_id": 3,
"facet": 1
},
{
"doc_id": 4,
"facet": 1
},
{
"doc_id": 5,
"facet": 1
},
{
"doc_id": 9,
"facet": 1.5
},
{
"doc_id": 6,
"facet": 2
},
{
"doc_id": 7,
"facet": 2
},
{
"doc_id": 8,
"facet": 2
}
]
"###);
})
.await;
}

View File

@ -2,10 +2,9 @@ use actix_web::http::header::ContentType;
use actix_web::test;
use http::header::ACCEPT_ENCODING;
use meili_snap::{json_string, snapshot};
use meilisearch::Opt;
use crate::common::encoder::Encoder;
use crate::common::{default_settings, Server, Value};
use crate::common::{Server, Value};
use crate::json;
#[actix_rt::test]
@ -200,79 +199,3 @@ async fn error_create_with_invalid_index_uid() {
}
"###);
}
#[actix_rt::test]
async fn send_task_id() {
let temp = tempfile::tempdir().unwrap();
let options =
Opt { experimental_replication_parameters: true, ..default_settings(temp.path()) };
let server = Server::new_with_options(options).await.unwrap();
let app = server.init_web_app().await;
let index = server.index("catto");
let (response, code) = index.create(None).await;
snapshot!(code, @"202 Accepted");
snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
{
"taskUid": 0,
"indexUid": "catto",
"status": "enqueued",
"type": "indexCreation",
"enqueuedAt": "[date]"
}
"###);
let body = serde_json::to_string(&json!({
"uid": "doggo",
"primaryKey": None::<&str>,
}))
.unwrap();
let req = test::TestRequest::post()
.uri("/indexes")
.insert_header(("TaskId", "25"))
.insert_header(ContentType::json())
.set_payload(body)
.to_request();
let res = test::call_service(&app, req).await;
snapshot!(res.status(), @"202 Accepted");
let bytes = test::read_body(res).await;
let response = serde_json::from_slice::<Value>(&bytes).expect("Expecting valid json");
snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
{
"taskUid": 25,
"indexUid": "doggo",
"status": "enqueued",
"type": "indexCreation",
"enqueuedAt": "[date]"
}
"###);
let body = serde_json::to_string(&json!({
"uid": "girafo",
"primaryKey": None::<&str>,
}))
.unwrap();
let req = test::TestRequest::post()
.uri("/indexes")
.insert_header(("TaskId", "12"))
.insert_header(ContentType::json())
.set_payload(body)
.to_request();
let res = test::call_service(&app, req).await;
snapshot!(res.status(), @"400 Bad Request");
let bytes = test::read_body(res).await;
let response = serde_json::from_slice::<Value>(&bytes).expect("Expecting valid json");
snapshot!(json_string!(response), @r###"
{
"message": "Received bad task id: 12 should be >= to 26.",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"
}
"###);
}

View File

@ -89,7 +89,7 @@ async fn logs_stream_bad_mode() {
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Unknown value `tamo` at `.mode`: expected one of `human`, `json`, `profile`",
"message": "Unknown value `tamo` at `.mode`: expected one of `human`, `profile`",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"
@ -146,7 +146,7 @@ async fn logs_stream_bad_profile_memory() {
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Unknown value `fmt` at `.mode`: expected one of `human`, `json`, `profile`",
"message": "Unknown value `fmt` at `.mode`: expected one of `human`, `profile`",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"
@ -162,7 +162,7 @@ async fn logs_stream_without_enabling_the_route() {
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
"message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
@ -173,18 +173,7 @@ async fn logs_stream_without_enabling_the_route() {
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
let (response, code) = server.service.post("/logs/stderr", json!({})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Modifying logs through the `/logs/*` routes requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
"message": "getting logs through the `/logs/stream` route requires enabling the `logs route` experimental feature. See https://github.com/orgs/meilisearch/discussions/721",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"

View File

@ -5,7 +5,7 @@ use std::str::FromStr;
use actix_web::http::header::ContentType;
use meili_snap::snapshot;
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
use meilisearch::{analytics, create_app, Opt};
use tracing::level_filters::LevelFilter;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::Layer;
@ -27,25 +27,18 @@ async fn basic_test_log_stream_route() {
tracing_subscriber::reload::Layer::new(None.with_filter(
tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF),
));
let (_stderr_layer, stderr_layer_handle) = tracing_subscriber::reload::Layer::new(
(Box::new(
tracing_subscriber::fmt::layer()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE),
) as Box<dyn tracing_subscriber::Layer<SubscriberForSecondLayer> + Send + Sync>)
.with_filter(tracing_subscriber::filter::Targets::new()),
);
let subscriber = tracing_subscriber::registry().with(route_layer).with(
tracing_subscriber::fmt::layer()
.with_span_events(tracing_subscriber::fmt::format::FmtSpan::ACTIVE)
.with_filter(tracing_subscriber::filter::LevelFilter::from_str("OFF").unwrap()),
.with_filter(tracing_subscriber::filter::LevelFilter::from_str("INFO").unwrap()),
);
let app = actix_web::test::init_service(create_app(
server.service.index_scheduler.clone().into(),
server.service.auth.clone().into(),
server.service.options.clone(),
(route_layer_handle, stderr_layer_handle),
route_layer_handle,
analytics::MockAnalytics::new(&server.service.options),
true,
))

View File

@ -7,7 +7,7 @@ use std::sync::Arc;
use actix_http::body::MessageBody;
use actix_web::dev::{ServiceFactory, ServiceResponse};
use actix_web::web::{Bytes, Data};
use actix_web::{post, App, HttpRequest, HttpResponse, HttpServer};
use actix_web::{post, App, HttpResponse, HttpServer};
use meili_snap::{json_string, snapshot};
use meilisearch::Opt;
use tokio::sync::mpsc;
@ -17,17 +17,7 @@ use crate::common::{default_settings, Server};
use crate::json;
#[post("/")]
async fn forward_body(
req: HttpRequest,
sender: Data<mpsc::UnboundedSender<Vec<u8>>>,
body: Bytes,
) -> HttpResponse {
let headers = req.headers();
assert_eq!(headers.get("content-type").unwrap(), "application/x-ndjson");
assert_eq!(headers.get("transfer-encoding").unwrap(), "chunked");
assert_eq!(headers.get("accept-encoding").unwrap(), "gzip");
assert_eq!(headers.get("content-encoding").unwrap(), "gzip");
async fn forward_body(sender: Data<mpsc::UnboundedSender<Vec<u8>>>, body: Bytes) -> HttpResponse {
let body = body.to_vec();
sender.send(body).unwrap();
HttpResponse::Ok().into()

View File

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::convert::TryInto;
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
use uuid::Uuid;
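Presumably the re-added `TryInto` import lets the codec turn a byte slice back into the fixed-size array that `Uuid::from_bytes` expects; a small sketch of that conversion (an assumption about the surrounding decoder, which is not shown in this hunk):

use std::convert::TryInto;
use uuid::Uuid;

// Decoding half of a UUID codec: fail softly if the slice is not 16 bytes long.
fn uuid_from_slice(bytes: &[u8]) -> Option<Uuid> {
    let array: [u8; 16] = bytes.try_into().ok()?;
    Some(Uuid::from_bytes(array))
}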

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0"
charabia = { version = "0.8.7", default-features = false }
charabia = { version = "0.8.5", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11"
deserr = "0.6.1"
@ -26,7 +26,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { git = "https://github.com/meilisearch/grenad.git", branch = "keep-source-index-in-merger", version = "0.4.5", default-features = false, features = [
grenad = { version = "0.4.5", default-features = false, features = [
"rayon",
"tempfile",
] }
@ -70,13 +70,13 @@ itertools = "0.11.0"
# profiling
puffin = "0.16.0"
# logging
logging_timer = "1.1.0"
csv = "1.3.0"
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = [
"onig",
] }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
@ -102,16 +102,7 @@ meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
[features]
all-tokenizations = [
"charabia/chinese",
"charabia/hebrew",
"charabia/japanese",
"charabia/thai",
"charabia/korean",
"charabia/greek",
"charabia/khmer",
"charabia/vietnamese",
]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
@ -139,7 +130,5 @@ greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]
vietnamese = ["charabia/vietnamese"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]

View File

@ -67,8 +67,6 @@ pub mod main_key {
pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits";
pub const PROXIMITY_PRECISION: &str = "proximity-precision";
pub const EMBEDDING_CONFIGS: &str = "embedding_configs";
pub const CORRUPTED: &str = "corrupted";
}
pub mod db_name {
@ -1509,103 +1507,6 @@ impl Index {
_ => "default".to_owned(),
})
}
pub fn check_document_facet_consistency(
&self,
rtxn: &RoTxn<'_>,
) -> Result<DocumentFacetConsistency> {
let documents = self.documents_ids(rtxn)?;
let field_ids_map = self.fields_ids_map(rtxn)?;
let mut facets = Vec::new();
let mut facet_exists = Vec::new();
let faceted_fields = self.user_defined_faceted_fields(rtxn)?;
for fid in field_ids_map.ids() {
let facet_name = field_ids_map.name(fid).unwrap();
if !faceted_fields.contains(facet_name) {
continue;
};
let mut facet = RoaringBitmap::new();
// value doesn't matter here we'll truncate to the level
let key = crate::heed_codec::facet::FacetGroupKey {
field_id: fid,
level: 0,
left_bound: &[] as _,
};
for res in self
.facet_id_f64_docids
.remap_key_type::<FacetGroupKeyCodec<crate::heed_codec::BytesRefCodec>>()
.prefix_iter(rtxn, &key)?
{
let (_k, v) = res?;
facet |= v.bitmap;
}
for res in self
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<crate::heed_codec::BytesRefCodec>>()
.prefix_iter(rtxn, &key)?
{
let (_k, v) = res?;
facet |= v.bitmap;
}
facets.push((field_ids_map.name(fid).unwrap().to_owned(), facet));
facet_exists.push(self.exists_faceted_documents_ids(rtxn, fid)?);
}
Ok(DocumentFacetConsistency { documents, facets, facet_exists })
}
pub fn mark_as_corrupted(&self, wtxn: &mut RwTxn<'_>) -> Result<()> {
Ok(self.main.remap_types::<Str, Str>().put(wtxn, main_key::CORRUPTED, "corrupted")?)
}
pub fn is_corrupted(&self, txn: &RoTxn<'_>) -> Result<bool> {
Ok(self.main.remap_types::<Str, Str>().get(txn, main_key::CORRUPTED)?.is_some())
}
}
pub struct DocumentFacetConsistency {
documents: RoaringBitmap,
facets: Vec<(String, RoaringBitmap)>,
facet_exists: Vec<RoaringBitmap>,
}
impl DocumentFacetConsistency {
pub fn check(&self) {
let mut inconsistencies = 0;
for ((field_name, facet), _facet_exists) in self.facets.iter().zip(self.facet_exists.iter())
{
if field_name == "_geo" {
continue;
}
// only check the internal ids missing in documents as it is the grave condition
// let documents = self.documents.clone() & facet_exists;
let documents = self.documents.clone();
// let missing_in_facets = &documents - facet;
let missing_in_documents = facet - documents;
/*for id in missing_in_facets {
tracing::error!(id, field_name, "Missing in facets");
inconsistencies += 1;
}*/
for id in missing_in_documents {
tracing::error!(id, field_name, "Missing in documents");
inconsistencies += 1;
}
}
if inconsistencies > 0 {
panic!(
"Panicked due to the previous {} inconsistencies between documents and facets",
inconsistencies
)
}
}
}
#[cfg(test)]
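The removed consistency check boils down to one set relation: every document id referenced by a facet bitmap must also be present in the index's documents bitmap, so `facet - documents` has to be empty. A self-contained sketch of that invariant with roaring bitmaps, using illustrative values only:

use roaring::RoaringBitmap;

fn missing_in_documents(documents: &RoaringBitmap, facet: &RoaringBitmap) -> RoaringBitmap {
    // ids referenced by the facet but absent from the documents: each one is an inconsistency
    facet - documents
}

fn main() {
    let documents: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let facet: RoaringBitmap = [2u32, 3, 4].into_iter().collect();
    let missing = missing_in_documents(&documents, &facet);
    assert_eq!(missing.iter().collect::<Vec<_>>(), vec![4]);
}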

View File

@ -15,7 +15,7 @@ pub struct BucketSortOutput {
// TODO: would probably be good to regroup some of these inside of a struct?
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")]
#[logging_timer::time]
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,

View File

@ -191,7 +191,7 @@ fn resolve_maximally_reduced_query_graph(
Ok(docids)
}
#[tracing::instrument(level = "trace", skip_all, target = "search")]
#[logging_timer::time]
fn resolve_universe(
ctx: &mut SearchContext,
initial_universe: &RoaringBitmap,
@ -557,7 +557,7 @@ pub fn execute_vector_search(
}
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search")]
#[logging_timer::time]
pub fn execute_search(
ctx: &mut SearchContext,
query: Option<&str>,
@ -577,9 +577,6 @@ pub fn execute_search(
let mut located_query_terms = None;
let query_terms = if let Some(query) = query {
let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
let entered = span.enter();
// We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
let mut tokbuilder = TokenizerBuilder::new();
@ -608,12 +605,7 @@ pub fn execute_search(
}
let tokenizer = tokbuilder.build();
drop(entered);
let span = tracing::trace_span!(target: "search::tokens", "tokenize");
let entered = span.enter();
let tokens = tokenizer.tokenize(query);
drop(entered);
let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
if query_terms.is_empty() {
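The lines dropped in this hunk wrapped tokenization in its own trace span; the pattern is simply enter, do the work, drop, so the tokenizer shows up as a distinct unit in trace output. A hedged, simplified sketch in which a whitespace split stands in for the real charabia tokenizer:

fn tokenize_with_span(query: &str) -> Vec<&str> {
    let span = tracing::trace_span!(target: "search::tokens", "tokenize");
    let entered = span.enter();
    let tokens: Vec<&str> = query.split_whitespace().collect();
    drop(entered); // close the span before moving on to query-term resolution
    tokens
}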

View File

@ -6,10 +6,9 @@ use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore;
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
use super::*;
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Lazy, TwoTypoTerm};
use crate::search::new::query_term::TwoTypoTerm;
use crate::search::new::{limits, SearchContext};
use crate::search::{build_dfa, get_first};
use crate::{Result, MAX_WORD_LENGTH};

View File

@ -7,6 +7,7 @@ use std::collections::BTreeSet;
use std::iter::FromIterator;
use std::ops::RangeInclusive;
use compute_derivations::partially_initialized_term_from_word;
use either::Either;
pub use ntypo_subset::NTypoTermSubset;
pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};

View File

@ -1,15 +1,11 @@
use std::collections::BTreeSet;
use charabia::normalizer::NormalizedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use super::compute_derivations::partially_initialized_term_from_word;
use super::{LocatedQueryTerm, ZeroTypoTerm};
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
use super::*;
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
/// Convert the tokenised search query into a list of located query terms.
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
#[logging_timer::time]
pub fn located_query_terms_from_tokens(
ctx: &mut SearchContext,
query: NormalizedTokenIter,
@ -229,7 +225,7 @@ pub fn make_ngram(
}
struct PhraseBuilder {
words: Vec<Option<crate::search::new::Interned<String>>>,
words: Vec<Option<Interned<String>>>,
start: u16,
end: u16,
}

View File

@ -1,7 +1,7 @@
use std::fs::File;
use std::io::BufReader;
use grenad::{CompressionType, Merger};
use grenad::CompressionType;
use heed::types::Bytes;
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
use roaring::RoaringBitmap;
@ -14,7 +14,6 @@ use crate::heed_codec::facet::{
use crate::heed_codec::BytesRefCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
use crate::update::MergeFn;
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
@ -29,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> {
facet_type: FacetType,
field_ids: Vec<FieldId>,
// None if level 0 does not need to be updated
delta_data: Option<Merger<BufReader<File>, MergeFn>>,
delta_data: Option<grenad::Reader<BufReader<File>>>,
}
impl<'i> FacetsUpdateBulk<'i> {
@ -37,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> {
index: &'i Index,
field_ids: Vec<FieldId>,
facet_type: FacetType,
delta_data: Merger<BufReader<File>, MergeFn>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
min_level_size: u8,
) -> FacetsUpdateBulk<'i> {
@ -66,7 +65,7 @@ impl<'i> FacetsUpdateBulk<'i> {
}
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")]
#[logging_timer::time("FacetsUpdateBulk::{}")]
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
@ -90,7 +89,7 @@ impl<'i> FacetsUpdateBulk<'i> {
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
pub delta_data: Option<Merger<R, MergeFn>>,
pub delta_data: Option<grenad::Reader<R>>,
pub group_size: u8,
pub min_level_size: u8,
}
@ -130,8 +129,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
if self.db.is_empty(wtxn)? {
let mut buffer = Vec::new();
let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
let mut iter = delta_data.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
@ -155,8 +154,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
let mut buffer = Vec::new();
let database = self.db.remap_types::<Bytes, Bytes>();
let mut iter = delta_data.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}

View File

@ -1,7 +1,6 @@
use std::fs::File;
use std::io::BufReader;
use grenad::Merger;
use heed::types::{Bytes, DecodeIgnore};
use heed::{BytesDecode, Error, RoTxn, RwTxn};
use obkv::KvReader;
@ -15,56 +14,31 @@ use crate::heed_codec::BytesRefCodec;
use crate::search::facet::get_highest_level;
use crate::update::del_add::DelAdd;
use crate::update::index_documents::valid_lmdb_key;
use crate::update::MergeFn;
use crate::{CboRoaringBitmapCodec, Index, Result};
/// Enum used as a return value for the facet incremental indexing.
///
/// - `ModificationResult::InPlace` means that modifying the `facet_value` into the `level` did not have
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
/// of the parent node.
///
/// - `ModificationResult::Insert` means that modifying the `facet_value` into the `level` resulted
/// in the addition of a new key in that level, and that therefore the number of children
/// of the parent node should be incremented.
///
/// - `ModificationResult::Remove` means that modifying the `facet_value` into the `level` resulted in a change in the
/// number of keys in the level. For example, removing a document id from the facet value `3` could
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
/// entirely. In that case, `ModificationResult::Remove` is returned. The parent of the deleted key must
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
///
/// - `ModificationResult::Reduce/Expand` means that modifying the `facet_value` into the `level` resulted in a change in the
/// bounds of the keys of the level. For example, removing a document id from the facet value
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
/// In that case `ModificationResult::Reduce` is returned. The parent of the reduced key may need to adjust
/// its left bound as well.
///
/// - `ModificationResult::Nothing` means that modifying the `facet_value` didn't have any impact into the `level`.
/// This case is reachable when a document id is removed from a sub-level node but is still present in another one.
/// For example, removing `2` from a document containing `2` and `3`, the document id will be removed from the `level 0` but should remain in the group node [1..4] in `level 1`.
enum ModificationResult {
enum InsertionResult {
InPlace,
Expand,
Insert,
}
enum DeletionResult {
InPlace,
Reduce { next: Option<Vec<u8>> },
Remove { next: Option<Vec<u8>> },
Nothing,
}
/// Algorithm to incrementally insert and delete elements into the
/// `facet_id_(string/f64)_docids` databases.
pub struct FacetsUpdateIncremental {
inner: FacetsUpdateIncrementalInner,
delta_data: Merger<BufReader<File>, MergeFn>,
delta_data: grenad::Reader<BufReader<File>>,
}
impl FacetsUpdateIncremental {
pub fn new(
index: &Index,
facet_type: FacetType,
delta_data: Merger<BufReader<File>, MergeFn>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
min_level_size: u8,
max_group_size: u8,
@ -87,59 +61,34 @@ impl FacetsUpdateIncremental {
}
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::incremental")]
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
let mut current_field_id = None;
let mut facet_level_may_be_updated = false;
let mut iter = self.delta_data.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = self.delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if !valid_lmdb_key(key) {
continue;
}
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key)
.map_err(heed::Error::Encoding)?;
if facet_level_may_be_updated
&& current_field_id.map_or(false, |fid| fid != key.field_id)
{
// Only add or remove a level after making all the field modifications.
self.inner.add_or_delete_level(wtxn, current_field_id.unwrap())?;
facet_level_may_be_updated = false;
}
current_field_id = Some(key.field_id);
let value = KvReader::new(value);
let docids_to_delete = value
.get(DelAdd::Deletion)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(|o| o.map_err(heed::Error::Encoding))
.transpose()?;
.map(|o| o.map_err(heed::Error::Encoding));
let docids_to_add = value
.get(DelAdd::Addition)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(|o| o.map_err(heed::Error::Encoding))
.transpose()?;
.map(|o| o.map_err(heed::Error::Encoding));
let level_size_changed = self.inner.modify(
wtxn,
key.field_id,
key.left_bound,
docids_to_add.as_ref(),
docids_to_delete.as_ref(),
)?;
if level_size_changed {
// if a node has been added or removed from the highest level,
// we may have to update the facet level.
facet_level_may_be_updated = true;
if let Some(docids_to_delete) = docids_to_delete {
let docids_to_delete = docids_to_delete?;
self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
}
}
if let Some(field_id) = current_field_id {
if facet_level_may_be_updated {
self.inner.add_or_delete_level(wtxn, field_id)?;
if let Some(docids_to_add) = docids_to_add {
let docids_to_add = docids_to_add?;
self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
}
}
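The `InsertionResult` and `DeletionResult` enums reintroduced above drive how each level reacts to what happened one level below: a parent node only gains a child when the lower level reports `Insert`, which is exactly the `updated_value.size += 1` seen further down. A self-contained sketch of that propagation rule, mirroring the names in this diff:

enum InsertionResult {
    InPlace, // same number of keys below, parent size unchanged
    Expand,  // the left bound moved below, parent size unchanged
    Insert,  // a new key appeared below, parent gains one child
}

fn parent_size_after(child: InsertionResult, parent_size: u8) -> u8 {
    match child {
        InsertionResult::Insert => parent_size + 1,
        InsertionResult::InPlace | InsertionResult::Expand => parent_size,
    }
}

fn main() {
    assert_eq!(parent_size_after(InsertionResult::Insert, 3), 4);
    assert_eq!(parent_size_after(InsertionResult::Expand, 3), 3);
    assert_eq!(parent_size_after(InsertionResult::InPlace, 3), 3);
}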
@ -213,78 +162,138 @@ impl FacetsUpdateIncrementalInner {
///
/// ## Return
/// See documentation of `insert_in_level`
fn modify_in_level_0(
fn insert_in_level_0(
&self,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
add_docids: Option<&RoaringBitmap>,
del_docids: Option<&RoaringBitmap>,
) -> Result<ModificationResult> {
docids: &RoaringBitmap,
) -> Result<InsertionResult> {
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
let value = FacetGroupValue { bitmap: docids.clone(), size: 1 };
let old_value = self.db.get(txn, &key)?;
match (old_value, add_docids, del_docids) {
// Addition + deletion on an existing value
(Some(FacetGroupValue { bitmap, .. }), Some(add_docids), Some(del_docids)) => {
let value = FacetGroupValue { bitmap: (bitmap - del_docids) | add_docids, size: 1 };
self.db.put(txn, &key, &value)?;
Ok(ModificationResult::InPlace)
}
// Addition on an existing value
(Some(FacetGroupValue { bitmap, .. }), Some(add_docids), None) => {
let value = FacetGroupValue { bitmap: bitmap | add_docids, size: 1 };
self.db.put(txn, &key, &value)?;
Ok(ModificationResult::InPlace)
}
// Addition of a new value (ignore deletion)
(None, Some(add_docids), _) => {
let value = FacetGroupValue { bitmap: add_docids.clone(), size: 1 };
self.db.put(txn, &key, &value)?;
Ok(ModificationResult::Insert)
}
// Deletion on an existing value, fully delete the key if the resulted value is empty.
(Some(FacetGroupValue { mut bitmap, .. }), None, Some(del_docids)) => {
bitmap -= del_docids;
if bitmap.is_empty() {
// Full deletion
let mut next_key = None;
if let Some((next, _)) =
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(txn, &key)?
{
if next.field_id == field_id && next.level == 0 {
next_key = Some(next.left_bound.to_vec());
}
}
self.db.delete(txn, &key)?;
Ok(ModificationResult::Remove { next: next_key })
} else {
// Partial deletion
let value = FacetGroupValue { bitmap, size: 1 };
let mut level0_prefix = vec![];
level0_prefix.extend_from_slice(&field_id.to_be_bytes());
level0_prefix.push(0);
let mut iter =
self.db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, &level0_prefix)?;
if iter.next().is_none() {
drop(iter);
self.db.put(txn, &key, &value)?;
Ok(InsertionResult::Insert)
} else {
drop(iter);
let old_value = self.db.get(txn, &key)?;
match old_value {
Some(mut updated_value) => {
// now merge the two
updated_value.bitmap |= value.bitmap;
self.db.put(txn, &key, &updated_value)?;
Ok(InsertionResult::InPlace)
}
None => {
self.db.put(txn, &key, &value)?;
Ok(ModificationResult::InPlace)
Ok(InsertionResult::Insert)
}
}
// Otherwise do nothing (None + no addition + deletion == Some + no addition + no deletion == Nothing),
// may be unreachable at some point.
(None, None, _) | (Some(_), None, None) => Ok(ModificationResult::Nothing),
}
}
/// Split a level node into two balanced nodes.
/// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
/// This function works recursively.
///
/// # Return
/// Returns `ModificationResult::Insert` if the split is successful.
fn split_group(
/// ## Return
/// Returns the effect of adding the facet value to the database on the given `level`.
///
/// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have
/// an effect on the number of keys in that level. Therefore, it did not increase the number of children
/// of the parent node.
///
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
/// in the addition of a new key in that level, and that therefore the number of children
/// of the parent node should be incremented.
fn insert_in_level(
&self,
txn: &mut RwTxn,
field_id: u16,
level: u8,
insertion_key: FacetGroupKey<Vec<u8>>,
insertion_value: FacetGroupValue,
) -> Result<ModificationResult> {
let size_left = insertion_value.size / 2;
let size_right = insertion_value.size - size_left;
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<InsertionResult> {
if level == 0 {
return self.insert_in_level_0(txn, field_id, facet_value, docids);
}
let max_group_size = self.max_group_size;
let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?;
// level below inserted an element
let (insertion_key, insertion_value) =
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
match result {
// because we know that we inserted in place, the facet_value is not a new one
// thus it doesn't extend a group, and thus the insertion key computed above is
// still correct
InsertionResult::InPlace => {
let mut updated_value = insertion_value;
updated_value.bitmap |= docids;
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
return Ok(InsertionResult::InPlace);
}
InsertionResult::Expand => {}
InsertionResult::Insert => {}
}
// Here we know that inserting the facet value in the level below resulted in the creation
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
// could happen).
let (insertion_key, insertion_key_was_modified) = {
let mut new_insertion_key = insertion_key.clone();
let mut key_should_be_modified = false;
if facet_value < insertion_key.left_bound.as_slice() {
new_insertion_key.left_bound = facet_value.to_vec();
key_should_be_modified = true;
}
if key_should_be_modified {
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
assert!(is_deleted);
self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
}
(new_insertion_key, key_should_be_modified)
};
// Now we know that the insertion key contains the `facet_value`.
// We still need to update the insertion value by:
// 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`)
// 2. Merge the previous docids with the new one
let mut updated_value = insertion_value;
if matches!(result, InsertionResult::Insert) {
updated_value.size += 1;
}
if updated_value.size < max_group_size {
updated_value.bitmap |= docids;
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
if insertion_key_was_modified {
return Ok(InsertionResult::Expand);
} else {
return Ok(InsertionResult::InPlace);
}
}
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
// Therefore it must be split into two nodes.
let size_left = updated_value.size / 2;
let size_right = updated_value.size - size_left;
let level_below = level - 1;
@ -338,228 +347,34 @@ impl FacetsUpdateIncrementalInner {
self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;
Ok(ModificationResult::Insert)
Ok(InsertionResult::Insert)
}
/// Remove the docids still present in the related sub-level nodes from the del_docids.
///
/// This process is needed to avoid removing docids from a group node where the docid is present in several sub-nodes.
fn trim_del_docids<'a>(
&self,
txn: &mut RwTxn,
field_id: u16,
level: u8,
insertion_key: &FacetGroupKey<Vec<u8>>,
insertion_value_size: usize,
del_docids: &'a RoaringBitmap,
) -> Result<std::borrow::Cow<'a, RoaringBitmap>> {
let level_below = level - 1;
let start_key = FacetGroupKey {
field_id,
level: level_below,
left_bound: insertion_key.left_bound.as_slice(),
};
let mut del_docids = std::borrow::Cow::Borrowed(del_docids);
let iter = self.db.range(txn, &(start_key..))?.take(insertion_value_size);
for next in iter {
let (_, value) = next?;
// if a sublevel bitmap has common docids with del_docids,
// then these docids shouldn't be removed and so, remove them from the deletion list.
if !value.bitmap.is_disjoint(&del_docids) {
*del_docids.to_mut() -= value.bitmap;
}
}
Ok(del_docids)
}
/// Modify the given facet value and corresponding document ids in all the levels of the database up to the given `level`.
/// This function works recursively.
///
/// ## Return
/// Returns the effect of modifying the facet value to the database on the given `level`.
///
fn modify_in_level(
&self,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
add_docids: Option<&RoaringBitmap>,
del_docids: Option<&RoaringBitmap>,
) -> Result<ModificationResult> {
if level == 0 {
return self.modify_in_level_0(txn, field_id, facet_value, add_docids, del_docids);
}
let result =
self.modify_in_level(txn, field_id, level - 1, facet_value, add_docids, del_docids)?;
// level below inserted an element
if let ModificationResult::Nothing = result {
// if the previous level has not been modified,
// early return ModificationResult::Nothing.
return Ok(ModificationResult::Nothing);
}
let (insertion_key, insertion_value) =
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
let insertion_value_size = insertion_value.size as usize;
let mut insertion_value_was_modified = false;
let mut updated_value = insertion_value;
if let ModificationResult::Insert = result {
// if a key has been inserted in the sub-level, raise the value size.
updated_value.size += 1;
insertion_value_was_modified = true;
} else if let ModificationResult::Remove { .. } = result {
if updated_value.size <= 1 {
// if the only remaining node is the one to delete,
// delete the key instead and early return.
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
assert!(is_deleted);
return Ok(result);
} else {
// Reduce the value size
updated_value.size -= 1;
insertion_value_was_modified = true;
}
}
let (insertion_key, insertion_key_modification) =
if let ModificationResult::InPlace = result {
(insertion_key, ModificationResult::InPlace)
} else {
// Inserting or deleting the facet value in the level below resulted in the creation
// of a new key. Therefore, it may be the case that we need to modify the left bound of the
// insertion key (see documentation of `find_insertion_key_value` for an example of when that
// could happen).
let mut new_insertion_key = insertion_key.clone();
let mut key_modification = ModificationResult::InPlace;
if let ModificationResult::Remove { next } | ModificationResult::Reduce { next } =
result
{
// if the deleted facet_value is the left_bound of the current node,
// the left_bound should be updated reducing the current node.
let reduced_range = facet_value == insertion_key.left_bound;
if reduced_range {
new_insertion_key.left_bound = next.clone().unwrap();
key_modification = ModificationResult::Reduce { next };
}
} else if facet_value < insertion_key.left_bound.as_slice() {
// if the added facet_value is under the left_bound of the current node,
// the left_bound should be updated expanding the current node.
new_insertion_key.left_bound = facet_value.to_vec();
key_modification = ModificationResult::Expand;
}
if matches!(
key_modification,
ModificationResult::Expand | ModificationResult::Reduce { .. }
) {
// if the node should be updated, delete it; it will be recreated under a new key later.
let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
assert!(is_deleted);
}
(new_insertion_key, key_modification)
};
if updated_value.size < self.max_group_size {
// If there are docids to delete, trim them first to avoid removing docids that are still present in other sub-nodes.
if let Some(del_docids) = del_docids
.map(|ids| {
self.trim_del_docids(
txn,
field_id,
level,
&insertion_key,
insertion_value_size,
ids,
)
})
.transpose()?
.filter(|ids| !ids.is_empty())
{
updated_value.bitmap -= &*del_docids;
insertion_value_was_modified = true;
}
if let Some(add_docids) = add_docids {
updated_value.bitmap |= add_docids;
insertion_value_was_modified = true;
}
if insertion_value_was_modified
|| matches!(
insertion_key_modification,
ModificationResult::Expand | ModificationResult::Reduce { .. }
)
{
// if any modification occurred, insert it into the database.
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
Ok(insertion_key_modification)
} else {
// this case is reachable when a docid is removed from a sub-level node but is still present in another one.
// For instance, for a document with the facet values 2 and 3: if 2 is removed, the docid should remain in the group node [1..4].
Ok(ModificationResult::Nothing)
}
} else {
// We've increased the group size of the value and realised it has become greater than or equal to `max_group_size`
// Therefore it must be split into two nodes.
self.split_group(txn, field_id, level, insertion_key, updated_value)
}
}
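// A minimal standalone sketch of the left-bound adjustment above, with a local enum
// standing in for `ModificationResult` (all names here are illustrative only):
enum ChildResult {
    InPlace,
    Expand,
    Insert,
    Remove { next: Option<Vec<u8>> },
    Reduce { next: Option<Vec<u8>> },
}
fn new_left_bound(current: Vec<u8>, facet_value: &[u8], child: &ChildResult) -> Vec<u8> {
    match child {
        // the child removed or reduced the key holding `facet_value`: if that value was this
        // node's left bound, the bound moves forward to the next remaining facet value.
        ChildResult::Remove { next } | ChildResult::Reduce { next } => {
            if facet_value == current.as_slice() {
                next.clone().unwrap_or(current)
            } else {
                current
            }
        }
        // the child inserted a value smaller than this node's left bound: the node expands.
        _ => {
            if facet_value < current.as_slice() {
                facet_value.to_vec()
            } else {
                current
            }
        }
    }
}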
/// Modify the given facet value and corresponding document ids in the database.
/// If no more document ids correspond to the facet value, delete it completely.
///
/// ## Return
/// Returns `true` if some tree-nodes of the highest level have been removed or added, implying a potential
/// addition or deletion of a facet level.
/// Otherwise returns `false` if the tree-nodes have been modified in place.
pub fn modify(
/// Insert the given facet value and corresponding document ids in the database.
pub fn insert(
&self,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
add_docids: Option<&RoaringBitmap>,
del_docids: Option<&RoaringBitmap>,
) -> Result<bool> {
if add_docids.map_or(true, RoaringBitmap::is_empty)
&& del_docids.map_or(true, RoaringBitmap::is_empty)
{
return Ok(false);
docids: &RoaringBitmap,
) -> Result<()> {
if docids.is_empty() {
return Ok(());
}
let group_size = self.group_size;
let highest_level = get_highest_level(txn, self.db, field_id)?;
let result = self.modify_in_level(
txn,
field_id,
highest_level,
facet_value,
add_docids,
del_docids,
)?;
let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?;
match result {
ModificationResult::InPlace
| ModificationResult::Expand
| ModificationResult::Nothing
| ModificationResult::Reduce { .. } => Ok(false),
ModificationResult::Insert | ModificationResult::Remove { .. } => Ok(true),
InsertionResult::InPlace => return Ok(()),
InsertionResult::Expand => return Ok(()),
InsertionResult::Insert => {}
}
}
/// Check whether the highest level has exceeded `min_level_size` * `self.group_size`.
/// If it has, we must build an additional level above it.
/// Then check whether the highest level is under `min_level_size`.
/// If it is, we must remove the whole level.
pub(crate) fn add_or_delete_level(&self, txn: &mut RwTxn, field_id: u16) -> Result<()> {
let highest_level = get_highest_level(txn, self.db, field_id)?;
// Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`.
// If it has, we must build an additional level above it.
let mut highest_level_prefix = vec![];
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
highest_level_prefix.push(highest_level);
@ -567,48 +382,14 @@ impl FacetsUpdateIncrementalInner {
let size_highest_level =
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?.count();
if size_highest_level >= self.group_size as usize * self.min_level_size as usize {
self.add_level(txn, field_id, highest_level, &highest_level_prefix, size_highest_level)
} else if size_highest_level < self.min_level_size as usize && highest_level != 0 {
self.delete_level(txn, &highest_level_prefix)
} else {
Ok(())
if size_highest_level < self.group_size as usize * self.min_level_size as usize {
return Ok(());
}
}
/// Delete a level.
fn delete_level(&self, txn: &mut RwTxn, highest_level_prefix: &[u8]) -> Result<()> {
let mut to_delete = vec![];
let mut iter =
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, highest_level_prefix)?;
for el in iter.by_ref() {
let (k, _) = el?;
to_delete.push(
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k)
.map_err(Error::Encoding)?
.into_owned(),
);
}
drop(iter);
for k in to_delete {
self.db.delete(txn, &k.as_ref())?;
}
Ok(())
}
/// Build an additional level for the field id.
fn add_level(
&self,
txn: &mut RwTxn,
field_id: u16,
highest_level: u8,
highest_level_prefix: &[u8],
size_highest_level: usize,
) -> Result<()> {
let mut groups_iter = self
.db
.remap_types::<Bytes, FacetGroupValueCodec>()
.prefix_iter(txn, highest_level_prefix)?;
.prefix_iter(txn, &highest_level_prefix)?;
let nbr_new_groups = size_highest_level / self.group_size as usize;
let nbr_leftover_elements = size_highest_level % self.group_size as usize;
@ -617,7 +398,7 @@ impl FacetsUpdateIncrementalInner {
for _ in 0..nbr_new_groups {
let mut first_key = None;
let mut values = RoaringBitmap::new();
for _ in 0..self.group_size {
for _ in 0..group_size {
let (key_bytes, value_i) = groups_iter.next().unwrap()?;
let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes)
.map_err(Error::Encoding)?;
@ -632,7 +413,7 @@ impl FacetsUpdateIncrementalInner {
level: highest_level + 1,
left_bound: first_key.unwrap().left_bound,
};
let value = FacetGroupValue { size: self.group_size, bitmap: values };
let value = FacetGroupValue { size: group_size, bitmap: values };
to_add.push((key.into_owned(), value));
}
// now we add the rest of the level, in case its size is > group_size * min_level_size
@ -667,6 +448,173 @@ impl FacetsUpdateIncrementalInner {
}
Ok(())
}
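// The loop above collapses runs of `group_size` keys from the highest level into parent
// keys one level up: each parent's left bound is its first child's, and its bitmap is the
// union of the children's. A standalone sketch of that grouping using only `roaring`
// (the handling of the final, smaller leftover group is omitted; `build_parent_groups`
// is an illustrative name, not part of this crate):
use roaring::RoaringBitmap;
fn build_parent_groups(
    children: &[(Vec<u8>, RoaringBitmap)],
    group_size: usize,
) -> Vec<(Vec<u8>, RoaringBitmap)> {
    children
        .chunks(group_size)
        .map(|group| {
            // the parent inherits the first child's left bound
            let left_bound = group[0].0.clone();
            // and owns the union of all its children's docids
            let mut bitmap = RoaringBitmap::new();
            for (_, child_bitmap) in group {
                bitmap |= child_bitmap;
            }
            (left_bound, bitmap)
        })
        .collect()
}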
/// Delete the given document id from the given facet value in the database, from level 0 to
/// the given level.
///
/// ## Return
/// Returns the effect of removing the document id from the database on the given `level`.
///
/// - `DeletionResult::InPlace` means that deleting the document id did not have
/// an effect on the keys in that level.
///
/// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the
/// number of keys in the level. For example, removing a document id from the facet value `3` could
/// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted
/// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must
/// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well.
///
/// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the
/// bounds of the keys of the level. For example, removing a document id from the facet value
/// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore,
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
/// its left bound as well.
fn delete_in_level(
&self,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<DeletionResult> {
if level == 0 {
return self.delete_in_level_0(txn, field_id, facet_value, docids);
}
let (deletion_key, mut bitmap) =
self.find_insertion_key_value(field_id, level, facet_value, txn)?;
let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?;
let mut decrease_size = false;
let next_key = match result {
DeletionResult::InPlace => {
bitmap.bitmap -= docids;
self.db.put(txn, &deletion_key.as_ref(), &bitmap)?;
return Ok(DeletionResult::InPlace);
}
DeletionResult::Reduce { next } => next,
DeletionResult::Remove { next } => {
decrease_size = true;
next
}
};
// If either DeletionResult::Reduce or DeletionResult::Remove was returned,
// then we may need to adjust the left_bound of the deletion key.
// If DeletionResult::Remove was returned, then we need to decrease the group
// size of the deletion key.
let mut updated_value = bitmap;
if decrease_size {
updated_value.size -= 1;
}
if updated_value.size == 0 {
self.db.delete(txn, &deletion_key.as_ref())?;
Ok(DeletionResult::Remove { next: next_key })
} else {
let mut updated_deletion_key = deletion_key.clone();
let reduced_range = facet_value == deletion_key.left_bound;
if reduced_range {
updated_deletion_key.left_bound = next_key.clone().unwrap();
}
updated_value.bitmap -= docids;
let _ = self.db.delete(txn, &deletion_key.as_ref())?;
self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?;
if reduced_range {
Ok(DeletionResult::Reduce { next: next_key })
} else {
Ok(DeletionResult::InPlace)
}
}
}
fn delete_in_level_0(
&self,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<DeletionResult> {
let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value };
let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap;
bitmap -= docids;
if bitmap.is_empty() {
let mut next_key = None;
if let Some((next, _)) =
self.db.remap_data_type::<DecodeIgnore>().get_greater_than(txn, &key)?
{
if next.field_id == field_id && next.level == 0 {
next_key = Some(next.left_bound.to_vec());
}
}
self.db.delete(txn, &key)?;
Ok(DeletionResult::Remove { next: next_key })
} else {
self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
Ok(DeletionResult::InPlace)
}
}
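// A minimal sketch of the level-0 rule above, with a `BTreeMap` standing in for the LMDB
// database (an assumption made purely for illustration): subtract the docids, and when the
// bitmap becomes empty, drop the key and report the next facet value so the parent can
// adjust its left bound.
use std::collections::BTreeMap;
use roaring::RoaringBitmap;
fn delete_at_level_0(
    level0: &mut BTreeMap<Vec<u8>, RoaringBitmap>,
    facet_value: &[u8],
    docids: &RoaringBitmap,
) -> Option<Vec<u8>> {
    let bitmap = level0.get_mut(facet_value).expect("facet value must exist at level 0");
    *bitmap -= docids;
    if bitmap.is_empty() {
        level0.remove(facet_value);
        // the next remaining facet value, if any, becomes the caller's new bound
        level0.range(facet_value.to_vec()..).next().map(|(k, _)| k.clone())
    } else {
        None
    }
}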
pub fn delete(
&self,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
) -> Result<()> {
if self
.db
.remap_data_type::<DecodeIgnore>()
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })?
.is_none()
{
return Ok(());
}
let highest_level = get_highest_level(txn, self.db, field_id)?;
let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?;
match result {
DeletionResult::InPlace => return Ok(()),
DeletionResult::Reduce { .. } => return Ok(()),
DeletionResult::Remove { .. } => {}
}
// if we removed a key from the highest level, its size may have fallen
// below `min_level_size`, in which case we need to remove the entire level
let mut highest_level_prefix = vec![];
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
highest_level_prefix.push(highest_level);
if highest_level == 0
|| self
.db
.remap_types::<Bytes, Bytes>()
.prefix_iter(txn, &highest_level_prefix)?
.count()
>= self.min_level_size as usize
{
return Ok(());
}
let mut to_delete = vec![];
let mut iter =
self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?;
for el in iter.by_ref() {
let (k, _) = el?;
to_delete.push(
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k)
.map_err(Error::Encoding)?
.into_owned(),
);
}
drop(iter);
for k in to_delete {
self.db.delete(txn, &k.as_ref())?;
}
Ok(())
}
}
impl<'a> FacetGroupKey<&'a [u8]> {

View File

@ -79,9 +79,12 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;
use grenad::Merger;
use heed::types::{Bytes, DecodeIgnore};
use charabia::normalizer::{Normalize, NormalizerOption};
use grenad::{CompressionType, SortAlgorithm};
use heed::types::{Bytes, DecodeIgnore, SerdeJson};
use heed::BytesEncode;
use time::OffsetDateTime;
use tracing::debug;
@ -90,9 +93,9 @@ use super::FacetsUpdateBulk;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::MergeFn;
use crate::{try_split_array_at, FieldId, Index, Result};
use crate::update::index_documents::create_sorter;
use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH};
pub mod bulk;
pub mod incremental;
@ -105,20 +108,16 @@ pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
facet_type: FacetType,
delta_data: Merger<BufReader<File>, MergeFn>,
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
delta_data: grenad::Reader<BufReader<File>>,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
data_size: u64,
}
impl<'i> FacetsUpdate<'i> {
pub fn new(
index: &'i Index,
facet_type: FacetType,
delta_data: Merger<BufReader<File>, MergeFn>,
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
data_size: u64,
delta_data: grenad::Reader<BufReader<File>>,
) -> Self {
let database = match facet_type {
FacetType::String => {
@ -136,20 +135,18 @@ impl<'i> FacetsUpdate<'i> {
min_level_size: FACET_MIN_LEVEL_SIZE,
facet_type,
delta_data,
normalized_delta_data,
data_size,
}
}
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
if self.data_size == 0 {
if self.delta_data.is_empty() {
return Ok(());
}
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// See self::comparison_bench::benchmark_facet_indexing
if self.data_size >= (self.database.len(wtxn)? / 500) {
if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let bulk_update = FacetsUpdateBulk::new(
@ -173,108 +170,94 @@ impl<'i> FacetsUpdate<'i> {
incremental_update.execute(wtxn)?;
}
match self.normalized_delta_data {
Some(data) => index_facet_search(wtxn, data, self.index),
None => Ok(()),
}
}
}
// We clear the list of normalized-for-search facets
// and the previous FSTs to compute everything from scratch
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
self.index.facet_id_string_fst.clear(wtxn)?;
fn index_facet_search(
wtxn: &mut heed::RwTxn,
normalized_delta_data: Merger<BufReader<File>, MergeFn>,
index: &Index,
) -> Result<()> {
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
while let Some((key_bytes, delta_bytes)) = iter.next()? {
let deladd_reader = KvReaderDelAdd::new(delta_bytes);
// As we can't use the same write transaction to read and write in two different databases
// we must create a temporary sorter that we will write into LMDB afterward.
// As multiple unnormalized facet values can become the same normalized facet value
// we must merge them together.
let mut sorter = create_sorter(
SortAlgorithm::Unstable,
merge_btreeset_string,
CompressionType::None,
None,
None,
None,
);
let database_set = index
.facet_id_normalized_string_strings
.remap_key_type::<Bytes>()
.get(wtxn, key_bytes)?
.unwrap_or_default();
let add_set = deladd_reader
.get(DelAdd::Addition)
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
.unwrap_or_default();
let del_set = match deladd_reader
.get(DelAdd::Deletion)
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
{
Some(del_set) => {
let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let mut set = BTreeSet::new();
for facet in del_set {
let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
// Check if the referenced value doesn't exist anymore before deleting it.
if index
.facet_id_string_docids
.remap_data_type::<DecodeIgnore>()
.get(wtxn, &key)?
.is_none()
{
set.insert(facet);
}
// We iterate over the list of original, semi-normalized facet values
// and normalize them for search, inserting them into LMDB in no particular order.
let options = NormalizerOption { lossy: true, ..Default::default() };
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, ()) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
let mut normalized_facet = left_bound.normalize(&options);
let normalized_truncated_facet: String;
if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
normalized_truncated_facet = normalized_facet
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalized_facet = normalized_truncated_facet.into();
}
set
let set = BTreeSet::from_iter(std::iter::once(left_bound));
let key = (field_id, normalized_facet.as_ref());
let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
sorter.insert(key, val)?;
}
None => BTreeSet::new(),
};
let set: BTreeSet<_> =
database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();
if set.is_empty() {
index
.facet_id_normalized_string_strings
.remap_key_type::<Bytes>()
.delete(wtxn, key_bytes)?;
} else {
index
.facet_id_normalized_string_strings
.remap_key_type::<Bytes>()
.put(wtxn, key_bytes, &set)?;
}
}
// We clear the FST of normalized-for-search to compute everything from scratch.
index.facet_id_string_fst.clear(wtxn)?;
// We compute one FST per string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let ((field_id, normalized_facet), _) = result?;
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
// In this loop we don't need to take care of merging bitmaps
// as the grenad sorter already merged them for us.
let mut merger_iter = sorter.into_stream_merger_iter()?;
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
wtxn,
key_bytes,
btreeset_bytes,
)?;
}
// We compute one FST per string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database =
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let ((field_id, normalized_facet), _) = result?;
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
}
}
if let Some((field_id, fst_builder)) = current_fst {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
}
if let Some((field_id, fst_builder)) = current_fst {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
}
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
}
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
}
Ok(())
Ok(())
}
}
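// The FST-building loop above walks (field_id, normalized facet) pairs in sorted order and
// starts a new `fst::Set` whenever the field id changes. A standalone sketch of that grouping
// using only the `fst` crate; the input layout is assumed and `build_facet_fsts` is an
// illustrative name, not an API of this crate.
fn build_facet_fsts(
    pairs: &[(u16, &str)], // assumed sorted by (field_id, normalized_facet)
) -> Result<Vec<(u16, fst::Set<Vec<u8>>)>, fst::Error> {
    let mut fsts = Vec::new();
    let mut current: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
    for (field_id, facet) in pairs {
        // close the previous builder whenever the field id changes
        current = match current.take() {
            Some((fid, builder)) if fid != *field_id => {
                fsts.push((fid, builder.into_set()));
                Some((*field_id, fst::SetBuilder::memory()))
            }
            Some(pair) => Some(pair),
            None => Some((*field_id, fst::SetBuilder::memory())),
        };
        if let Some((_, builder)) = current.as_mut() {
            builder.insert(facet)?;
        }
    }
    if let Some((fid, builder)) = current {
        fsts.push((fid, builder.into_set()));
    }
    Ok(fsts)
}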
#[cfg(test)]
@ -285,7 +268,6 @@ pub(crate) mod test_helpers {
use std::marker::PhantomData;
use std::rc::Rc;
use grenad::MergerBuilder;
use heed::types::Bytes;
use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
use roaring::RoaringBitmap;
@ -298,8 +280,7 @@ pub(crate) mod test_helpers {
use crate::search::facet::get_highest_level;
use crate::snapshot_tests::display_bitmap;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
use crate::update::FacetsUpdateIncrementalInner;
use crate::CboRoaringBitmapCodec;
/// Utility function to generate a string whose position in a lexicographically
@ -429,8 +410,7 @@ pub(crate) mod test_helpers {
max_group_size: self.max_group_size.get(),
};
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap();
update.add_or_delete_level(wtxn, field_id).unwrap();
update.insert(wtxn, field_id, &key_bytes, docids).unwrap();
}
pub fn delete_single_docid<'a>(
&self,
@ -456,8 +436,7 @@ pub(crate) mod test_helpers {
max_group_size: self.max_group_size.get(),
};
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap();
update.add_or_delete_level(wtxn, field_id).unwrap();
update.delete(wtxn, field_id, &key_bytes, docids).unwrap();
}
pub fn bulk_insert<'a, 'b>(
@ -484,13 +463,10 @@ pub(crate) mod test_helpers {
}
writer.finish().unwrap();
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
builder.push(reader.into_cursor().unwrap());
let merger = builder.build();
let update = FacetsUpdateBulkInner {
db: self.content,
delta_data: Some(merger),
delta_data: Some(reader),
group_size: self.group_size.get(),
min_level_size: self.min_level_size.get(),
};

View File

@ -26,7 +26,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
searchable_fields: &Option<HashSet<FieldId>>,
stop_words: Option<&fst::Set<Vec<u8>>>,
stop_words: Option<&fst::Set<&[u8]>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
@ -181,11 +181,11 @@ fn searchable_fields_changed(
/// Factorize tokenizer building.
fn tokenizer_builder<'a>(
stop_words: Option<&'a fst::Set<Vec<u8>>>,
stop_words: Option<&'a fst::Set<&[u8]>>,
allowed_separators: Option<&'a [&str]>,
dictionary: Option<&'a [&str]>,
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
) -> TokenizerBuilder<'a, Vec<u8>> {
) -> TokenizerBuilder<'a, &'a [u8]> {
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
@ -211,7 +211,7 @@ fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
stop_words: Option<&fst::Set<Vec<u8>>>,
stop_words: Option<&fst::Set<&[u8]>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: u32,

View File

@ -1,21 +1,15 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;
use std::{io, str};
use charabia::normalizer::{Normalize, NormalizerOption};
use heed::types::SerdeJson;
use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
};
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
use crate::heed_codec::StrRefCodec;
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
use crate::{FieldId, Result};
/// Extracts the facet string and the documents ids where this facet string appear.
///
@ -25,11 +19,10 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let options = NormalizerOption { lossy: true, ..Default::default() };
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
@ -37,30 +30,12 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut normalized_facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_deladd_btreeset_string,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
max_memory,
);
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
// nothing to do if we delete and re-add the value.
if deladd_reader.get(DelAdd::Deletion).is_some()
&& deladd_reader.get(DelAdd::Addition).is_some()
{
continue;
}
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
@ -69,46 +44,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let document_id = u32::from_be_bytes(document_id_bytes);
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
{
let mut hyper_normalized_value = normalized_value.normalize(&options);
let normalized_truncated_facet: String;
if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH {
normalized_truncated_facet = hyper_normalized_value
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
hyper_normalized_value = normalized_truncated_facet.into();
}
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
obkv.insert(deladd_key, val)?;
}
obkv.finish()?;
let key = (field_id, hyper_normalized_value.as_ref());
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in deladd_reader.iter() {
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
sorter_into_reader(facet_string_docids_sorter, indexer)
}
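// The truncation above must never split a UTF-8 character: `char_indices` keeps only the
// characters whose starting byte offset is below `MAX_FACET_VALUE_LENGTH`. A standalone
// sketch of just that step (the charabia normalization that precedes it in the code above
// is left out; the helper name is illustrative):
fn truncate_facet(normalized: &str, max_len: usize) -> String {
    normalized
        .char_indices()
        .take_while(|(idx, _)| *idx < max_len)
        .map(|(_, c)| c)
        .collect()
}
// e.g. truncate_facet("héllo", 3) keeps "hé" (2 chars, 3 bytes) without cutting the 'é'.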

View File

@ -257,7 +257,6 @@ fn push_vectors_diff(
key_buffer: &mut Vec<u8>,
delta: VectorStateDelta,
) -> Result<()> {
puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove {
key_buffer.truncate(TRUNCATE_SIZE);
@ -333,14 +332,13 @@ fn extract_vectors(
}
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
#[logging_timer::time]
pub fn extract_embeddings<R: io::Read + io::Seek>(
// docid, prompt
prompt_reader: grenad::Reader<R>,
indexer: GrenadParameters,
embedder: Arc<Embedder>,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk

View File

@ -15,6 +15,7 @@ use std::io::BufReader;
use crossbeam_channel::Sender;
use rayon::prelude::*;
use tracing::debug;
use self::extract_docid_word_positions::extract_docid_word_positions;
use self::extract_facet_number_docids::extract_facet_number_docids;
@ -28,7 +29,10 @@ use self::extract_vector_points::{
use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::helpers::{
as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
MergeFn, MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
use crate::vector::EmbeddingConfigs;
@ -48,7 +52,7 @@ pub(crate) fn data_from_obkv_documents(
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
field_id_map: FieldsIdsMap,
stop_words: Option<fst::Set<Vec<u8>>>,
stop_words: Option<fst::Set<&[u8]>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
@ -58,170 +62,227 @@ pub(crate) fn data_from_obkv_documents(
) -> Result<()> {
puffin::profile_function!();
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
original_obkv_chunks
.par_bridge()
.map(|original_documents_chunk| {
send_original_documents_data(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
field_id_map.clone(),
embedders.clone(),
)
})
.collect::<Result<()>>()
},
|| {
flattened_obkv_chunks
.par_bridge()
.map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id,
geo_fields_ids,
&stop_words,
&allowed_separators,
&dictionary,
max_positions_per_attributes,
)
})
.map(|result| {
if let Ok((
ref docid_word_positions_chunk,
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
)) = result
{
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
original_obkv_chunks
.par_bridge()
.map(|original_documents_chunk| {
send_original_documents_data(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
field_id_map.clone(),
embedders.clone(),
)
})
.collect::<Result<()>>()?;
let exact_attributes = exact_attributes.clone();
run_extraction_task::<
_,
_,
(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
),
>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| {
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
},
|(
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
)| {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
},
"word-docids",
);
#[allow(clippy::type_complexity)]
let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> =
flattened_obkv_chunks
.par_bridge()
.map(|flattened_obkv_chunks| {
send_and_extract_flattened_documents_data(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id,
geo_fields_ids,
&stop_words,
&allowed_separators,
&dictionary,
max_positions_per_attributes,
)
})
.collect();
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_position_docids,
TypedChunk::WordPositionDocids,
"word-position-docids",
);
let (
docid_word_positions_chunks,
(
fid_docid_facet_numbers_chunks,
(
fid_docid_facet_strings_chunks,
(
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
),
),
),
) = result?;
run_extraction_task::<
_,
_,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
>(
fid_docid_facet_strings_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);
// merge facet_exists_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!(database = "facet-id-exists-docids", "merge");
match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
fid_docid_facet_numbers_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
// merge facet_is_null_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!(database = "facet-id-is-null-docids", "merge");
match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
if proximity_precision == ProximityPrecision::ByWord {
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
}
// merge facet_is_empty_docids and send them as a typed chunk
{
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
debug!(database = "facet-id-is-empty-docids", "merge");
match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
Ok(reader) => {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
});
}
Ok(())
})
.collect::<Result<()>>()
},
if proximity_precision == ProximityPrecision::ByWord {
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
original_pipeline_result.and(flattened_pipeline_result)
spawn_extraction_task::<
_,
_,
Vec<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)>,
>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_deladd_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
},
"word-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_position_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::WordPositionDocids,
"word-position-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
fid_docid_facet_strings_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
fid_docid_facet_numbers_chunks,
indexer,
lmdb_writer_sx,
extract_facet_number_docids,
merge_deladd_cbo_roaring_bitmaps,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
Ok(())
}
/// Spawn a new task to extract data for a specific DB using extract_fn.
/// Generated grenad chunks are merged using the merge_fn.
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
/// and sent into lmdb_writer_sx.
fn run_extraction_task<FE, FS, M>(
chunk: grenad::Reader<CursorClonableMmap>,
fn spawn_extraction_task<FE, FS, M>(
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
extract_fn: FE,
merge_fn: MergeFn,
serialize_fn: FS,
name: &'static str,
) where
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
+ Sync
+ Send
+ 'static,
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
M: Send,
FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
M::Output: Send,
{
let current_span = tracing::Span::current();
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let child_span =
tracing::trace_span!(target: "", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);
match extract_fn(chunk, indexer) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
puffin::profile_scope!("extract_multiple_chunksdexing::details, ", name);
let chunks: Result<M> =
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect();
let current_span = tracing::Span::current();
rayon::spawn(move || match chunks {
Ok(chunks) => {
let child_span = tracing::trace_span!(target: "", parent: &current_span, "merge_multiple_chunks");
let _entered = child_span.enter();
debug!(database = name, "merge");
puffin::profile_scope!("merge_multiple_chunks", name);
let reader = chunks.merge(merge_fn, &indexer);
let _ = lmdb_writer_sx.send(reader.map(serialize_fn));
}
Err(e) => {
let _ = lmdb_writer_sx.send(Err(e));
}
}
})
})
});
}
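// A minimal standalone sketch of the spawn/extract/merge/send pattern used by
// `spawn_extraction_task`, with plain strings standing in for grenad chunks and typed
// chunks; only `rayon` and `crossbeam_channel`, which this file already uses, are needed.
use rayon::prelude::*;
fn spawn_pipeline_sketch() {
    let (tx, rx) = crossbeam_channel::unbounded::<Result<String, String>>();
    let chunks = vec!["a".to_string(), "b".to_string()];
    rayon::spawn(move || {
        // "extract" every chunk in parallel, then "merge" the pieces and send one result downstream
        let extracted: Result<Vec<String>, String> =
            chunks.into_par_iter().map(|c| Ok(c.to_uppercase())).collect();
        let _ = tx.send(extracted.map(|parts| parts.concat()));
    });
    // the writer side drains the channel until the sender is dropped
    for merged in rx {
        println!("{merged:?}");
    }
}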
/// Extract chunked data and send it into lmdb_writer_sx sender:
@ -279,7 +340,7 @@ fn send_original_documents_data(
});
// TODO: create a custom internal error
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
Ok(())
}
@ -299,13 +360,22 @@ fn send_and_extract_flattened_documents_data(
faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
stop_words: &Option<fst::Set<Vec<u8>>>,
stop_words: &Option<fst::Set<&[u8]>>,
allowed_separators: &Option<&[&str]>,
dictionary: &Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
grenad::Reader<CursorClonableMmap>,
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<BufReader<File>>,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
),
),
),
)> {
let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@ -376,17 +446,16 @@ fn send_and_extract_flattened_documents_data(
fid_docid_facet_strings_chunk.clone(),
)));
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
fid_facet_is_empty_docids_chunk,
)));
let _ = lmdb_writer_sx
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
Ok((
fid_docid_facet_numbers_chunk,
(
fid_docid_facet_strings_chunk,
(
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
),
),
))
},
);

View File

@ -90,6 +90,90 @@ pub unsafe fn as_cloneable_grenad(
Ok(reader)
}
pub trait MergeableReader
where
Self: Sized,
{
type Output;
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
}
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
type Output = grenad::Reader<BufReader<File>>;
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut merger = MergerBuilder::new(merge_fn);
self.into_iter().try_for_each(|r| merger.push(r))?;
merger.finish(params)
}
}
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
for (r1, r2) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
}
Ok((m1.finish(params)?, m2.finish(params)?))
}
}
impl MergeableReader
for Vec<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)>
{
type Output = (
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
let mut m3 = MergerBuilder::new(merge_fn);
for (r1, r2, r3) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
m3.push(r3)?;
}
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
}
}
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
impl<R: io::Read + io::Seek> MergerBuilder<R> {
fn new(merge_fn: MergeFn) -> Self {
Self(grenad::MergerBuilder::new(merge_fn))
}
fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
self.0.push(reader.into_cursor()?);
Ok(())
}
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
let merger = self.0.build();
let mut writer = create_writer(
params.chunk_compression_type,
params.chunk_compression_level,
tempfile::tempfile()?,
);
merger.write_into_stream_writer(&mut writer)?;
writer_into_reader(writer)
}
}
#[derive(Debug, Clone, Copy)]
pub struct GrenadParameters {
pub chunk_compression_type: CompressionType,

View File

@ -35,6 +35,27 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul
}
}
pub fn merge_btreeset_string<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// TODO improve the perf by using a `#[borrow] Cow<str>`.
let strings: BTreeSet<String> = values
.iter()
.map(AsRef::as_ref)
.map(serde_json::from_slice::<BTreeSet<String>>)
.map(StdResult::unwrap)
.reduce(|mut current, new| {
for x in new {
current.insert(x);
}
current
})
.unwrap();
Ok(Cow::Owned(serde_json::to_vec(&strings).unwrap()))
}
}
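// A small usage sketch of `merge_btreeset_string`: two serialized `BTreeSet<String>` values
// for the same key collapse into their union (the key and values below are made up).
#[test]
fn merge_btreeset_string_unions_values() {
    use std::borrow::Cow;
    use std::collections::BTreeSet;
    let a = serde_json::to_vec(&BTreeSet::from(["blue".to_string()])).unwrap();
    let b = serde_json::to_vec(&BTreeSet::from(["red".to_string(), "blue".to_string()])).unwrap();
    let values: [Cow<[u8]>; 2] = [Cow::Owned(a), Cow::Owned(b)];
    let merged = merge_btreeset_string(b"color", &values).unwrap();
    let set: BTreeSet<String> = serde_json::from_slice(&merged).unwrap();
    assert_eq!(set, BTreeSet::from(["blue".to_string(), "red".to_string()]));
}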
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
Ok(values[0].clone())
}
@ -222,40 +243,3 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
buffer,
)?)
}
/// Do a union of BtreeSet on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
pub fn merge_deladd_btreeset_string<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// Retrieve the bitmaps from both sides
let mut del_set = BTreeSet::new();
let mut add_set = BTreeSet::new();
for value in values {
let obkv = KvReaderDelAdd::new(value);
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
for value in set {
del_set.insert(value);
}
}
if let Some(bytes) = obkv.get(DelAdd::Addition) {
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
for value in set {
add_set.insert(value);
}
}
}
let mut output_deladd_obkv = KvWriterDelAdd::memory();
let del = serde_json::to_vec(&del_set).unwrap();
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
let add = serde_json::to_vec(&add_set).unwrap();
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
}
}

View File

@ -10,10 +10,10 @@ use fst::{IntoStreamer, Streamer};
pub use grenad_helpers::{
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
GrenadParameters,
GrenadParameters, MergeableReader,
};
pub use merge_functions::{
keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string,
keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
obkvs_merge_additions_and_deletions, MergeFn,

View File

@ -5,29 +5,29 @@ mod transform;
mod typed_chunk;
use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek};
use std::io::{Cursor, Read, Seek};
use std::iter::FromIterator;
use std::num::NonZeroU32;
use std::result::Result as StdResult;
use crossbeam_channel::{Receiver, Sender};
use grenad::{Merger, MergerBuilder};
use heed::types::Str;
use heed::Database;
use rand::SeedableRng;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use slice_group_by::GroupBy;
use tracing::debug;
use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
use tracing::debug_span;
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
use self::enrich::enrich_documents_batch;
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
ClonableMmap, MergeFn,
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
@ -95,8 +95,8 @@ pub struct IndexDocumentsConfig {
impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
where
FP: Fn(UpdateIndexingStep) + Sync + Send,
FA: Fn() -> bool + Sync + Send,
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
{
pub fn new(
wtxn: &'t mut heed::RwTxn<'i>,
@ -284,7 +284,7 @@ where
#[tracing::instrument(
level = "trace",
skip_all,
target = "indexing::details",
target = "profile::indexing::details",
name = "index_documents_raw"
)]
pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
@ -326,6 +326,9 @@ where
}
};
let original_documents = grenad::Reader::new(original_documents)?;
let flattened_documents = grenad::Reader::new(flattened_documents)?;
// create LMDB writer channel
let (lmdb_writer_sx, lmdb_writer_rx): (
Sender<Result<TypedChunk>>,
@ -364,7 +367,11 @@ where
let stop_words = self.index.stop_words(self.wtxn)?;
let separators = self.index.allowed_separators(self.wtxn)?;
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
let dictionary = self.index.dictionary(self.wtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
@ -374,204 +381,141 @@ where
max_memory: self.indexer_config.max_memory,
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
};
let documents_chunk_size = match self.indexer_config.documents_chunk_size {
Some(chunk_size) => chunk_size,
None => {
let default_chunk_size = 1024 * 1024 * 4; // 4MiB
let min_chunk_size = 1024 * 512; // 512KiB
// compute the chunk size from the number of available threads and the input data size.
let total_size = flattened_documents.metadata().map(|m| m.len());
let current_num_threads = pool.current_num_threads();
// if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
let chunk_count = if current_num_threads > 2 {
(current_num_threads * 3 / 4).max(2)
} else {
current_num_threads
};
total_size
.map_or(default_chunk_size, |size| (size as usize) / chunk_count)
.max(min_chunk_size)
}
};
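// A standalone sketch of the chunk-size heuristic above: with more than two threads the
// data is cut into `max(threads * 3 / 4, 2)` chunks, the resulting chunk size is clamped
// to at least 512KiB, and it falls back to 4MiB when the input size is unknown (names are
// illustrative only).
fn compute_chunk_size(total_size: Option<u64>, threads: usize) -> usize {
    let default_chunk_size = 1024 * 1024 * 4; // 4MiB
    let min_chunk_size = 1024 * 512; // 512KiB
    let chunk_count = if threads > 2 { (threads * 3 / 4).max(2) } else { threads };
    total_size
        .map_or(default_chunk_size, |size| size as usize / chunk_count)
        .max(min_chunk_size)
}
// e.g. 16 threads and a 96MiB input give 12 chunks of 8MiB each.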
let original_documents = grenad::Reader::new(original_documents)?;
let flattened_documents = grenad::Reader::new(flattened_documents)?;
let documents_chunk_size =
self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
let cloned_embedder = self.embedders.clone();
let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0;
let mut word_position_docids = None;
let mut word_fid_docids = None;
let mut word_docids = None;
let mut exact_word_docids = None;
let mut chunk_accumulator = ChunkAccumulator::default();
let mut dimension = HashMap::new();
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
pool.install(|| {
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
let _enter = child_span.enter();
puffin::profile_scope!("extract_and_send_grenad_chunks");
// split obkv file into several chunks
let original_chunk_iter =
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
// split obkv file into several chunks
let original_chunk_iter =
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size);
// split obkv file into several chunks
let flattened_chunk_iter =
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
// split obkv file into several chunks
let flattened_chunk_iter =
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let result = original_chunk_iter.and_then(|original_chunk| {
let flattened_chunk = flattened_chunk_iter?;
// extract all databases from the chunked obkv documents
extract::data_from_obkv_documents(
original_chunk,
flattened_chunk,
pool_params,
lmdb_writer_sx.clone(),
searchable_fields,
faceted_fields,
primary_key_id,
geo_fields_ids,
field_id_map,
stop_words,
separators.as_deref(),
dictionary.as_deref(),
max_positions_per_attributes,
exact_attributes,
proximity_precision,
cloned_embedder,
)
});
if let Err(e) = result {
let _ = lmdb_writer_sx.send(Err(e));
}
// needs to be dropped to avoid channel waiting lock.
drop(lmdb_writer_sx);
let result = original_chunk_iter.and_then(|original_chunk| {
let flattened_chunk = flattened_chunk_iter?;
// extract all databases from the chunked obkv documents
extract::data_from_obkv_documents(
original_chunk,
flattened_chunk,
pool_params,
lmdb_writer_sx.clone(),
searchable_fields,
faceted_fields,
primary_key_id,
geo_fields_ids,
field_id_map,
stop_words,
separators.as_deref(),
dictionary.as_deref(),
max_positions_per_attributes,
exact_attributes,
proximity_precision,
cloned_embedder,
)
});
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
loop {
if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(typed_chunks, self.index, self.wtxn)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
(self.progress)(UpdateIndexingStep::IndexDocuments {
documents_seen: documents_seen_count as usize,
total_documents: documents_count,
});
debug!(documents = documents_seen_count, total = documents_count, "Seen");
}
if is_merged_database {
databases_seen += 1;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
}
// If no more chunks remain in the chunk accumulator and the channel is disconnected, break.
} else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
break;
} else {
rayon::yield_now();
}
}
Ok(result) => {
let typed_chunk = match result? {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk =
unsafe { as_cloneable_grenad(&word_docids_reader)? };
let word_docids = word_docids.get_or_insert_with(|| {
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
});
word_docids.push(cloneable_chunk.into_cursor()?);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
let exact_word_docids =
exact_word_docids.get_or_insert_with(|| {
MergerBuilder::new(
merge_deladd_cbo_roaring_bitmaps as MergeFn,
)
});
exact_word_docids.push(cloneable_chunk.into_cursor()?);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
});
word_fid_docids.push(cloneable_chunk.into_cursor()?);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPositionDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
let word_position_docids =
word_position_docids.get_or_insert_with(|| {
MergerBuilder::new(
merge_deladd_cbo_roaring_bitmaps as MergeFn,
)
});
word_position_docids.push(cloneable_chunk.into_cursor()?);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::VectorPoints {
expected_dimension,
remove_vectors,
embeddings,
manual_vectors,
embedder_name,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension,
manual_vectors,
embedder_name,
}
}
otherwise => otherwise,
};
chunk_accumulator.insert(typed_chunk);
}
}
if let Err(e) = result {
let _ = lmdb_writer_sx.send(Err(e));
}
Ok(())
})?;
// The sender needs to be dropped to avoid the receiver waiting on the channel forever.
drop(lmdb_writer_sx);
});
let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
let mut word_position_docids = None;
let mut word_fid_docids = None;
let mut word_docids = None;
let mut exact_word_docids = None;
let mut dimension = HashMap::new();
for result in lmdb_writer_rx {
if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
let typed_chunk = match result? {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
exact_word_docids = Some(cloneable_chunk);
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPositionDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::VectorPoints {
expected_dimension,
remove_vectors,
embeddings,
manual_vectors,
embedder_name,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension,
manual_vectors,
embedder_name,
}
}
otherwise => otherwise,
};
let (docids, is_merged_database) =
write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
(self.progress)(UpdateIndexingStep::IndexDocuments {
documents_seen: documents_seen_count as usize,
total_documents: documents_count,
});
debug_span!("Seen", documents = documents_seen_count, total = documents_count);
}
if is_merged_database {
databases_seen += 1;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
}
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@ -604,10 +548,10 @@ where
}
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
word_docids,
exact_word_docids,
word_position_docids,
word_fid_docids,
)?;
Ok(number_of_documents)
@ -621,10 +565,10 @@ where
)]
pub fn execute_prefix_databases(
self,
word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
word_docids: Option<grenad::Reader<CursorClonableMmap>>,
exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
) -> Result<()>
where
FP: Fn(UpdateIndexingStep) + Sync,
@ -807,7 +751,7 @@ where
)]
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn,
merger: Merger<CursorClonableMmap, MergeFn>,
reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
indexer_config: &IndexerConfig,
@ -817,12 +761,13 @@ fn execute_word_prefix_docids(
) -> Result<()> {
puffin::profile_function!();
let cursor = reader.into_cursor()?;
let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db);
builder.chunk_compression_type = indexer_config.chunk_compression_type;
builder.chunk_compression_level = indexer_config.chunk_compression_level;
builder.max_nb_chunks = indexer_config.max_nb_chunks;
builder.max_memory = indexer_config.max_memory;
builder.execute(merger, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?;
Ok(())
}

View File

@ -5,64 +5,27 @@ use std::io::{self, BufReader};
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::{Merger, MergerBuilder};
use grenad::MergerBuilder;
use heed::types::Bytes;
use heed::RwTxn;
use heed::{PutFlags, RwTxn};
use obkv::{KvReader, KvWriter};
use roaring::RoaringBitmap;
use super::helpers::{
self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
CursorClonableMmap,
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
valid_lmdb_key, CursorClonableMmap,
};
use super::MergeFn;
use super::{ClonableMmap, MergeFn};
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
};
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
use crate::{
lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError,
};
/// This struct accumulates and groups the TypedChunks
/// and is able to give back the biggest accumulated group
/// so that they can all be indexed together with a merger.
#[derive(Default)]
pub(crate) struct ChunkAccumulator {
inner: Vec<Vec<TypedChunk>>,
}
impl ChunkAccumulator {
pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
match self.inner.iter().max_by_key(|v| v.len()) {
Some(left) => {
let position = self.inner.iter().position(|right| left.len() == right.len());
position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
}
None => None,
}
}
pub fn insert(&mut self, chunk: TypedChunk) {
match self
.inner
.iter()
.position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right)))
{
Some(position) => {
let v = self.inner.get_mut(position).unwrap();
v.push(chunk);
}
None => self.inner.push(vec![chunk]),
}
}
}
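The accumulate-then-drain behaviour above can be illustrated with a minimal sketch (not part of this diff; the integer `kind` below is a hypothetical stand-in for the `mergeable_with` grouping of `TypedChunk`):
#[derive(Default)]
struct Accumulator {
    // Each inner Vec holds chunks of the same kind, mirroring `inner: Vec<Vec<TypedChunk>>`.
    inner: Vec<Vec<(u8, &'static str)>>,
}
impl Accumulator {
    fn insert(&mut self, chunk: (u8, &'static str)) {
        match self
            .inner
            .iter_mut()
            .find(|group| group.first().map_or(false, |(kind, _)| *kind == chunk.0))
        {
            Some(group) => group.push(chunk),
            None => self.inner.push(vec![chunk]),
        }
    }
    fn pop_longest(&mut self) -> Option<Vec<(u8, &'static str)>> {
        let position = (0..self.inner.len()).max_by_key(|&i| self.inner[i].len())?;
        Some(self.inner.remove(position))
    }
}
fn main() {
    let mut acc = Accumulator::default();
    acc.insert((0, "word_docids chunk #1"));
    acc.insert((1, "geo_points chunk #1"));
    acc.insert((0, "word_docids chunk #2"));
    // The largest group of mergeable chunks is drained first, as in the recv_timeout branch above.
    assert_eq!(acc.pop_longest().unwrap().len(), 2);
}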
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
@ -75,7 +38,7 @@ pub(crate) enum TypedChunk {
},
WordPositionDocids(grenad::Reader<BufReader<File>>),
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
@ -91,33 +54,6 @@ pub(crate) enum TypedChunk {
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
impl TypedChunk {
fn mergeable_with(&self, other: &Self) -> bool {
use TypedChunk::*;
match (self, other) {
(FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
| (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
| (Documents(_), Documents(_))
| (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
| (WordDocids { .. }, WordDocids { .. })
| (WordPositionDocids(_), WordPositionDocids(_))
| (WordPairProximityDocids(_), WordPairProximityDocids(_))
| (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
| (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
| (GeoPoints(_), GeoPoints(_))
| (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
(
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
) => left == right && left_dim == right_dim,
_ => false,
}
}
}
impl TypedChunk {
pub fn to_debug_string(&self) -> String {
match self {
@ -149,7 +85,7 @@ impl TypedChunk {
TypedChunk::WordPairProximityDocids(grenad) => {
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::FieldIdFacetStringDocids((grenad, _)) => {
TypedChunk::FieldIdFacetStringDocids(grenad) => {
format!("FieldIdFacetStringDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::FieldIdFacetNumberDocids(grenad) => {
@ -181,32 +117,23 @@ impl TypedChunk {
/// Return new documents seen.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
pub(crate) fn write_typed_chunk_into_index(
typed_chunks: Vec<TypedChunk>,
typed_chunk: TypedChunk,
index: &Index,
wtxn: &mut RwTxn,
index_is_empty: bool,
) -> Result<(RoaringBitmap, bool)> {
puffin::profile_function!(typed_chunks[0].to_debug_string());
puffin::profile_function!(typed_chunk.to_debug_string());
let mut is_merged_database = false;
match typed_chunks[0] {
TypedChunk::Documents(_) => {
match typed_chunk {
TypedChunk::Documents(obkv_documents_iter) => {
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
let _entered = span.enter();
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::Documents(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
let mut operations: Vec<DocumentOperation> = Default::default();
let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, reader)) = iter.next()? {
let mut cursor = obkv_documents_iter.into_cursor()?;
while let Some((key, reader)) = cursor.move_on_next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
let reader: KvReader<FieldId> = KvReader::new(reader);
@ -247,91 +174,59 @@ pub(crate) fn write_typed_chunk_into_index(
external_documents_docids.apply(wtxn, operations)?;
index.put_documents_ids(wtxn, &docids)?;
}
TypedChunk::FieldIdWordCountDocids(_) => {
TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
fid_word_count_docids_iter,
&index.field_id_word_count_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::WordDocids { .. } => {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
let _entered = span.enter();
let mut word_docids_builder =
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
let mut exact_word_docids_builder =
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
let mut word_fid_docids_builder =
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} = typed_chunk
else {
unreachable!();
};
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
let clonable_exact_word_docids =
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
word_docids_builder.push(word_docids_reader.into_cursor()?);
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
}
let word_docids_merger = word_docids_builder.build();
write_entries_into_database(
word_docids_merger,
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
append_entries_into_database(
word_docids_iter.clone(),
&index.word_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
let exact_word_docids_merger = exact_word_docids_builder.build();
write_entries_into_database(
exact_word_docids_merger,
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
append_entries_into_database(
exact_word_docids_iter.clone(),
&index.exact_word_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
let word_fid_docids_merger = word_fid_docids_builder.build();
write_entries_into_database(
word_fid_docids_merger,
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
append_entries_into_database(
word_fid_docids_iter,
&index.word_fid_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
// create fst from word docids
let fst_merger = fst_merger_builder.build();
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
let db_fst = index.words_fst(wtxn)?;
// merge new fst with database fst
@ -342,202 +237,98 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_words_fst(wtxn, &fst)?;
is_merged_database = true;
}
TypedChunk::WordPositionDocids(_) => {
TypedChunk::WordPositionDocids(word_position_docids_iter) => {
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
word_position_docids_iter,
&index.word_position_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetNumberDocids(_) => {
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
let span =
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
let mut data_size = 0;
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
else {
unreachable!();
};
data_size += facet_id_number_docids.len();
builder.push(facet_id_number_docids.into_cursor()?);
}
let merger = builder.build();
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
indexer.execute(wtxn)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetStringDocids(_) => {
TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
let _entered = span.enter();
let mut facet_id_string_builder =
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
let mut normalized_facet_id_string_builder =
MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
let mut data_size = 0;
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdFacetStringDocids((
facet_id_string_docids,
normalized_facet_id_string_docids,
)) = typed_chunk
else {
unreachable!();
};
data_size += facet_id_string_docids.len();
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
normalized_facet_id_string_builder
.push(normalized_facet_id_string_docids.into_cursor()?);
}
let facet_id_string_merger = facet_id_string_builder.build();
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
let indexer = FacetsUpdate::new(
index,
FacetType::String,
facet_id_string_merger,
Some(normalized_facet_id_string_merger),
data_size,
);
let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
indexer.execute(wtxn)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetExistsDocids(_) => {
TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
facet_id_exists_docids,
&index.facet_id_exists_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetIsNullDocids(_) => {
TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
facet_id_is_null_docids,
&index.facet_id_is_null_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => {
let span = tracing::trace_span!(target: "profile::indexing::write_db", "field_id_facet_is_empty_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
facet_id_is_empty_docids,
&index.facet_id_is_empty_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::WordPairProximityDocids(_) => {
TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
let _entered = span.enter();
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
write_entries_into_database(
merger,
append_entries_into_database(
word_pair_proximity_docids_iter,
&index.word_pair_proximity_docids,
wtxn,
index_is_empty,
deladd_serialize_add_side,
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdDocidFacetNumbers(_) => {
TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
let _entered = span.enter();
let mut builder = MergerBuilder::new(keep_first as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
let index_fid_docid_facet_numbers =
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = fid_docid_facet_number.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let reader = KvReaderDelAdd::new(value);
if valid_lmdb_key(key) {
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
@ -553,25 +344,14 @@ pub(crate) fn write_typed_chunk_into_index(
}
}
}
TypedChunk::FieldIdDocidFacetStrings(_) => {
TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => {
let span =
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
let _entered = span.enter();
let mut builder = MergerBuilder::new(keep_first as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
let index_fid_docid_facet_strings =
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = fid_docid_facet_string.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let reader = KvReaderDelAdd::new(value);
if valid_lmdb_key(key) {
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
@ -587,25 +367,14 @@ pub(crate) fn write_typed_chunk_into_index(
}
}
}
TypedChunk::GeoPoints(_) => {
TypedChunk::GeoPoints(geo_points) => {
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
let _entered = span.enter();
let mut builder = MergerBuilder::new(keep_first as MergeFn);
for typed_chunk in typed_chunks {
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
unreachable!();
};
builder.push(chunk.into_cursor()?);
}
let merger = builder.build();
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = geo_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
@ -624,38 +393,15 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_rtree(wtxn, &rtree)?;
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints { .. } => {
TypedChunk::VectorPoints {
remove_vectors,
manual_vectors,
embeddings,
expected_dimension,
embedder_name,
} => {
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
let _entered = span.enter();
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut params = None;
for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints {
remove_vectors,
manual_vectors,
embeddings,
expected_dimension,
embedder_name,
} = typed_chunk
else {
unreachable!();
};
params = Some((expected_dimension, embedder_name));
remove_vectors_builder.push(remove_vectors.into_cursor()?);
manual_vectors_builder.push(manual_vectors.into_cursor()?);
if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?);
}
}
// typed_chunks always contains at least one chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;
@ -673,9 +419,8 @@ pub(crate) fn write_typed_chunk_into_index(
let writers = writers?;
// remove vectors for docids we want them removed
let merger = remove_vectors_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, _)) = iter.next()? {
let mut cursor = remove_vectors.into_cursor()?;
while let Some((key, _)) = cursor.move_on_next()? {
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
for writer in &writers {
@ -687,39 +432,40 @@ pub(crate) fn write_typed_chunk_into_index(
}
// add generated embeddings
let merger = embeddings_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
let data = pod_collect_to_vec(value);
// it is a code error to have embeddings and not expected_dimension
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
// code error if we somehow got the wrong dimension
.unwrap();
if let Some(embeddings) = embeddings {
let mut cursor = embeddings.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
let data = pod_collect_to_vec(value);
// it is a code error to have embeddings and not expected_dimension
let embeddings =
crate::vector::Embeddings::from_inner(data, expected_dimension)
// code error if we somehow got the wrong dimension
.unwrap();
if embeddings.embedding_count() > usize::from(u8::MAX) {
let external_docid = if let Ok(Some(Ok(index))) = index
.external_id_of(wtxn, std::iter::once(docid))
.map(|it| it.into_iter().next())
{
index
} else {
format!("internal docid={docid}")
};
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
external_docid,
embeddings.embedding_count(),
)));
}
for (embedding, writer) in embeddings.iter().zip(&writers) {
writer.add_item(wtxn, docid, embedding)?;
if embeddings.embedding_count() > usize::from(u8::MAX) {
let external_docid = if let Ok(Some(Ok(index))) = index
.external_id_of(wtxn, std::iter::once(docid))
.map(|it| it.into_iter().next())
{
index
} else {
format!("internal docid={docid}")
};
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
external_docid,
embeddings.embedding_count(),
)));
}
for (embedding, writer) in embeddings.iter().zip(&writers) {
writer.add_item(wtxn, docid, embedding)?;
}
}
}
// perform the manual diff
let merger = manual_vectors_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = manual_vectors.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let (left, _index) = try_split_array_at(key).unwrap();
let docid = DocumentId::from_be_bytes(left);
@ -773,30 +519,26 @@ pub(crate) fn write_typed_chunk_into_index(
tracing::debug!("Finished vector chunk for {}", embedder_name);
}
TypedChunk::ScriptLanguageDocids(_) => {
TypedChunk::ScriptLanguageDocids(sl_map) => {
let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
let _entered = span.enter();
for typed_chunk in typed_chunks {
let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
for (key, (deletion, addition)) in sl_map {
let mut db_key_exists = false;
let final_value = match index.script_language_docids.get(wtxn, &key)? {
Some(db_values) => {
db_key_exists = true;
(db_values - deletion) | addition
}
None => addition,
};
if final_value.is_empty() {
// If the database entry exists, delete it.
if db_key_exists {
index.script_language_docids.delete(wtxn, &key)?;
}
} else {
index.script_language_docids.put(wtxn, &key, &final_value)?;
for (key, (deletion, addition)) in sl_map {
let mut db_key_exists = false;
let final_value = match index.script_language_docids.get(wtxn, &key)? {
Some(db_values) => {
db_key_exists = true;
(db_values - deletion) | addition
}
None => addition,
};
if final_value.is_empty() {
// If the database entry exists, delete it.
if db_key_exists {
index.script_language_docids.delete(wtxn, &key)?;
}
} else {
index.script_language_docids.put(wtxn, &key, &final_value)?;
}
}
}
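A small illustration of the `(db_values - deletion) | addition` merge used in this arm (a sketch with made-up docids, using the `roaring` crate already imported in this file):
use roaring::RoaringBitmap;
fn main() {
    let db_values: RoaringBitmap = (1..=3).collect(); // docids already stored for this key
    let deletion: RoaringBitmap = [2].into_iter().collect();
    let addition: RoaringBitmap = [4].into_iter().collect();
    // Same merge as above: remove deleted docids, then add the new ones.
    let final_value = (db_values - deletion) | addition;
    assert_eq!(final_value.iter().collect::<Vec<_>>(), vec![1, 3, 4]);
}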
@ -815,9 +557,13 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
}
fn merge_word_docids_reader_into_fst(
merger: Merger<CursorClonableMmap, MergeFn>,
word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
) -> Result<fst::Set<Vec<u8>>> {
let mut iter = merger.into_stream_merger_iter()?;
let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
merger_builder.push(word_docids_iter.into_cursor()?);
merger_builder.push(exact_word_docids_iter.into_cursor()?);
let mut iter = merger_builder.build().into_stream_merger_iter()?;
let mut builder = fst::SetBuilder::memory();
while let Some((k, _)) = iter.next()? {
@ -831,9 +577,10 @@ fn merge_word_docids_reader_into_fst(
/// The merge_values function is used if an entry already exists in the database.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
fn write_entries_into_database<R, K, V, FS, FM>(
merger: Merger<R, MergeFn>,
data: grenad::Reader<R>,
database: &heed::Database<K, V>,
wtxn: &mut RwTxn,
index_is_empty: bool,
serialize_value: FS,
merge_values: FM,
) -> Result<()>
@ -842,17 +589,22 @@ where
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
{
puffin::profile_function!();
puffin::profile_function!(format!("number of entries: {}", data.len()));
let mut buffer = Vec::new();
let database = database.remap_types::<Bytes, Bytes>();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
let mut cursor = data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if valid_lmdb_key(key) {
buffer.clear();
let value = match database.get(wtxn, key)? {
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
None => Some(serialize_value(value, &mut buffer)?),
let value = if index_is_empty {
Some(serialize_value(value, &mut buffer)?)
} else {
match database.get(wtxn, key)? {
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
None => Some(serialize_value(value, &mut buffer)?),
}
};
match value {
Some(value) => database.put(wtxn, key, value)?,
@ -862,5 +614,62 @@ where
}
}
}
Ok(())
}
/// Write provided entries in database using serialize_value function.
/// The merge_values function is used if an entry already exists in the database.
/// All provided entries must be ordered.
/// If the index is not empty, write_entries_into_database is called instead.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
fn append_entries_into_database<R, K, V, FS, FM>(
data: grenad::Reader<R>,
database: &heed::Database<K, V>,
wtxn: &mut RwTxn,
index_is_empty: bool,
serialize_value: FS,
merge_values: FM,
) -> Result<()>
where
R: io::Read + io::Seek,
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
K: for<'a> heed::BytesDecode<'a>,
{
puffin::profile_function!(format!("number of entries: {}", data.len()));
if !index_is_empty {
return write_entries_into_database(
data,
database,
wtxn,
false,
serialize_value,
merge_values,
);
}
let mut buffer = Vec::new();
let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
let mut cursor = data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if valid_lmdb_key(key) {
debug_assert!(
K::bytes_decode(key).is_ok(),
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
key.len(),
&key
);
buffer.clear();
let value = serialize_value(value, &mut buffer)?;
unsafe {
// safety: We do not keep a reference to anything that lives inside the database
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)?
};
}
}
Ok(())
}
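As a sketch of the ordering precondition stated in the doc comment above (illustrative keys only; `is_append_safe` is a hypothetical helper, not part of the codebase): LMDB's APPEND path only accepts keys in strictly increasing order, which a key-sorted grenad reader provides on an empty database.
fn is_append_safe<'a>(keys: impl Iterator<Item = &'a [u8]>) -> bool {
    let mut prev: Option<&[u8]> = None;
    for key in keys {
        if let Some(prev) = prev {
            if prev >= key {
                // Out of order (or duplicate): LMDB would refuse the APPEND put.
                return false;
            }
        }
        prev = Some(key);
    }
    true
}
fn main() {
    assert!(is_append_safe([&b"alpha"[..], &b"beta"[..], &b"gamma"[..]].into_iter()));
    assert!(!is_append_safe([&b"beta"[..], &b"alpha"[..]].into_iter()));
}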

View File

@ -3,8 +3,9 @@ pub use self::clear_documents::ClearDocuments;
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::{
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
MergeFn,
};
pub use self::indexer_config::IndexerConfig;
pub use self::settings::{validate_embedding_settings, Setting, Settings};

View File

@ -1032,13 +1032,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
{
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
let existing_fields: HashSet<_> = self
.index
.field_distribution(self.wtxn)?
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
@ -1059,8 +1052,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// index new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute has been added or removed.
let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
let faceted_updated =
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields);
let faceted_updated = old_faceted_fields != new_faceted_fields;
let stop_words_updated = self.update_stop_words()?;
let non_separator_tokens_updated = self.update_non_separator_tokens()?;

View File

@ -47,7 +47,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
)]
pub fn execute(
self,
new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
@ -68,8 +68,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
if !common_prefix_fst_words.is_empty() {
let mut current_prefixes: Option<&&[String]> = None;
let mut prefixes_cache = HashMap::new();
let mut new_word_docids_iter = new_word_docids.into_stream_merger_iter()?;
while let Some((word, data)) = new_word_docids_iter.next()? {
while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
current_prefixes = match current_prefixes.take() {
Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
_otherwise => {

View File

@ -52,7 +52,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
)]
pub fn execute(
self,
new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
new_word_integer_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
@ -69,14 +69,14 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
self.max_memory,
);
let mut new_word_integer_docids_iter = new_word_integer_docids.into_cursor()?;
if !common_prefix_fst_words.is_empty() {
// We fetch all the new common prefixes between the previous and new prefix fst.
let mut buffer = Vec::new();
let mut current_prefixes: Option<&&[String]> = None;
let mut prefixes_cache = HashMap::new();
let mut new_word_integer_docids_iter =
new_word_integer_docids.into_stream_merger_iter()?;
while let Some((key, data)) = new_word_integer_docids_iter.next()? {
while let Some((key, data)) = new_word_integer_docids_iter.move_on_next()? {
let (word, pos) =
StrBEU16Codec::bytes_decode(key).map_err(heed::Error::Decoding)?;

View File

@ -59,8 +59,8 @@ pub enum EmbedErrorKind {
OpenAiAuth(OpenAiError),
#[error("sent too many requests to OpenAI: {0}")]
OpenAiTooManyRequests(OpenAiError),
#[error("received internal error from OpenAI: {0:?}")]
OpenAiInternalServerError(Option<OpenAiError>),
#[error("received internal error from OpenAI: {0}")]
OpenAiInternalServerError(OpenAiError),
#[error("sent too many tokens in a request to OpenAI: {0}")]
OpenAiTooManyTokens(OpenAiError),
#[error("received unhandled HTTP status code {0} from OpenAI")]
@ -106,7 +106,7 @@ impl EmbedError {
Self { kind: EmbedErrorKind::OpenAiTooManyRequests(inner), fault: FaultSource::Runtime }
}
pub(crate) fn openai_internal_server_error(inner: Option<OpenAiError>) -> EmbedError {
pub(crate) fn openai_internal_server_error(inner: OpenAiError) -> EmbedError {
Self { kind: EmbedErrorKind::OpenAiInternalServerError(inner), fault: FaultSource::Runtime }
}

View File

@ -261,7 +261,3 @@ impl DistributionShift {
score
}
}
pub const fn is_cuda_enabled() -> bool {
cfg!(feature = "cuda")
}

View File

@ -178,8 +178,6 @@ impl Embedder {
retry.into_duration(attempt)
}
}?;
let retry_duration = retry_duration.min(std::time::Duration::from_secs(60)); // don't wait more than a minute
tracing::warn!(
"Attempt #{}, retrying after {}ms.",
attempt,
@ -222,12 +220,24 @@ impl Embedder {
error_response.error,
)));
}
StatusCode::INTERNAL_SERVER_ERROR
| StatusCode::BAD_GATEWAY
| StatusCode::SERVICE_UNAVAILABLE => {
let error_response: Result<OpenAiErrorResponse, _> = response.json().await;
StatusCode::INTERNAL_SERVER_ERROR => {
let error_response: OpenAiErrorResponse = response
.json()
.await
.map_err(EmbedError::openai_unexpected)
.map_err(Retry::retry_later)?;
return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
error_response.ok().map(|error_response| error_response.error),
error_response.error,
)));
}
StatusCode::SERVICE_UNAVAILABLE => {
let error_response: OpenAiErrorResponse = response
.json()
.await
.map_err(EmbedError::openai_unexpected)
.map_err(Retry::retry_later)?;
return Err(Retry::retry_later(EmbedError::openai_internal_server_error(
error_response.error,
)));
}
StatusCode::BAD_REQUEST => {
@ -238,14 +248,14 @@ impl Embedder {
.map_err(EmbedError::openai_unexpected)
.map_err(Retry::retry_later)?;
tracing::warn!("OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. For best performance, limit the size of your prompt.");
tracing::warn!("OpenAI: input was too long, retrying on tokenized version. For best performance, limit the size of your prompt.");
return Err(Retry::retry_tokenized(EmbedError::openai_too_many_tokens(
error_response.error,
)));
}
code => {
return Err(Retry::retry_later(EmbedError::openai_unhandled_status_code(
return Err(Retry::give_up(EmbedError::openai_unhandled_status_code(
code.as_u16(),
)));
}

View File

@ -1,5 +1,4 @@
use std::collections::{BTreeMap, HashMap};
use std::ops::Range;
use std::time::Duration;
use serde::{Deserialize, Serialize};
@ -17,51 +16,6 @@ enum SpanStatus {
pub struct CallStats {
pub call_count: usize,
pub time: u64,
pub self_time: u64,
}
#[derive(Debug, Default)]
pub struct SelfTime {
child_ranges: Vec<Range<Duration>>,
}
impl SelfTime {
pub fn new() -> Self {
Default::default()
}
pub fn add_child_range(&mut self, child_range: Range<Duration>) {
self.child_ranges.push(child_range)
}
pub fn self_duration(&mut self, self_range: Range<Duration>) -> Duration {
if self.child_ranges.is_empty() {
return self_range.end - self_range.start;
}
// by sorting child ranges by their start time,
// we make sure that no child will start before the last one we visited.
self.child_ranges
.sort_by(|left, right| left.start.cmp(&right.start).then(left.end.cmp(&right.end)));
// self duration computed by adding all the segments where the span is not executing a child
let mut self_duration = Duration::from_nanos(0);
// last point in time where we are certain that this span was not executing a child.
let mut committed_point = self_range.start;
for child_range in &self.child_ranges {
if child_range.start > committed_point {
// we add to the self duration the time between the end of the latest child and the beginning of the next one
self_duration += child_range.start - committed_point;
}
if committed_point < child_range.end {
// then we set ourselves to the end of the latest span
committed_point = child_range.end;
}
}
self_duration
}
}
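A worked example of the self-time computation removed above (made-up durations, following the algorithm exactly as written):
use std::time::Duration;
fn main() {
    // Span active from 0ms to 10ms, with two children at 2..4ms and 6..7ms.
    let self_range = Duration::from_millis(0)..Duration::from_millis(10);
    let child_ranges = vec![
        Duration::from_millis(2)..Duration::from_millis(4),
        Duration::from_millis(6)..Duration::from_millis(7),
    ];
    // Walk the (sorted) children and sum the gaps before each child, as SelfTime::self_duration does.
    let mut self_duration = Duration::from_nanos(0);
    let mut committed_point = self_range.start;
    for child in &child_ranges {
        if child.start > committed_point {
            self_duration += child.start - committed_point;
        }
        if committed_point < child.end {
            committed_point = child.end;
        }
    }
    // 0..2ms before the first child plus 4..6ms between the two children.
    assert_eq!(self_duration, Duration::from_millis(4));
}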
pub fn to_call_stats<R: std::io::Read>(
@ -69,9 +23,6 @@ pub fn to_call_stats<R: std::io::Read>(
) -> Result<BTreeMap<String, CallStats>, Error> {
let mut calls = HashMap::new();
let mut spans = HashMap::new();
let mut last_point = Duration::from_nanos(0);
let mut first_point = None;
let mut total_self_time = SelfTime::new();
for entry in trace {
let entry = entry?;
match entry {
@ -80,11 +31,10 @@ pub fn to_call_stats<R: std::io::Read>(
}
Entry::NewThread(_) => {}
Entry::NewSpan(span) => {
spans.insert(span.id, (span, SpanStatus::Outside, SelfTime::new()));
spans.insert(span.id, (span, SpanStatus::Outside));
}
Entry::SpanEnter(SpanEnter { id, time, memory: _ }) => {
first_point.get_or_insert(time);
let (_, status, _) = spans.get_mut(&id).unwrap();
let (_, status) = spans.get_mut(&id).unwrap();
let SpanStatus::Outside = status else {
continue;
@ -93,32 +43,18 @@ pub fn to_call_stats<R: std::io::Read>(
*status = SpanStatus::Inside(time);
}
Entry::SpanExit(SpanExit { id, time: end, memory: _ }) => {
let (span, status, self_time) = spans.get_mut(&id).unwrap();
let (span, status) = spans.get_mut(&id).unwrap();
let SpanStatus::Inside(begin) = status else {
continue;
};
let begin = *begin;
if last_point < end {
last_point = end;
}
*status = SpanStatus::Outside;
let self_range = begin..end;
let self_duration = self_time.self_duration(self_range.clone());
*self_time = SelfTime::new();
let span = *span;
if let Some(parent_id) = span.parent_id {
let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap();
parent_self_time.add_child_range(self_range.clone())
}
total_self_time.add_child_range(self_range);
let (_, call_list) = calls.get_mut(&span.call_id).unwrap();
call_list.push((end - begin, self_duration));
call_list.push(end - begin);
}
Entry::SpanClose(SpanClose { id, time: _ }) => {
spans.remove(&id);
@ -127,31 +63,17 @@ pub fn to_call_stats<R: std::io::Read>(
}
}
let total_self_time = first_point
.map(|first_point| (first_point, total_self_time.self_duration(first_point..last_point)));
Ok(calls
.into_iter()
.map(|(_, (call_site, calls))| (site_to_string(call_site), calls_to_stats(calls)))
.chain(total_self_time.map(|(first_point, total_self_time)| {
(
"::meta::total".to_string(),
CallStats {
call_count: 1,
time: (last_point - first_point).as_nanos() as u64,
self_time: total_self_time.as_nanos() as u64,
},
)
}))
.collect())
}
fn site_to_string(call_site: NewCallsite) -> String {
format!("{}::{}", call_site.target, call_site.name)
}
fn calls_to_stats(calls: Vec<(Duration, Duration)>) -> CallStats {
fn calls_to_stats(calls: Vec<Duration>) -> CallStats {
let nb = calls.len();
let sum: Duration = calls.iter().map(|(total, _)| total).sum();
let self_sum: Duration = calls.iter().map(|(_, self_duration)| self_duration).sum();
CallStats { call_count: nb, time: sum.as_nanos() as u64, self_time: self_sum.as_nanos() as u64 }
let sum: Duration = calls.iter().sum();
CallStats { call_count: nb, time: sum.as_nanos() as u64 }
}
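For illustration, the new `calls_to_stats` aggregates as follows (a self-contained sketch with made-up durations):
use std::time::Duration;
struct CallStats {
    call_count: usize,
    time: u64, // nanoseconds
}
fn calls_to_stats(calls: Vec<Duration>) -> CallStats {
    let nb = calls.len();
    let sum: Duration = calls.iter().sum();
    CallStats { call_count: nb, time: sum.as_nanos() as u64 }
}
fn main() {
    // Two calls of 2ms and 3ms aggregate into a single entry of 5ms total.
    let stats = calls_to_stats(vec![Duration::from_millis(2), Duration::from_millis(3)]);
    assert_eq!(stats.call_count, 2);
    assert_eq!(stats.time, 5_000_000);
}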

View File

@ -1,164 +0,0 @@
{
"name": "hackernews.ndjson_1M",
"run_count": 3,
"extra_cli_args": [],
"assets": {
"hackernews-100_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-100_000.ndjson",
"sha256": "60ecd23485d560edbd90d9ca31f0e6dba1455422f2a44e402600fbb5f7f1b213"
},
"hackernews-200_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-200_000.ndjson",
"sha256": "785b0271fdb47cba574fab617d5d332276b835c05dd86e4a95251cf7892a1685"
},
"hackernews-300_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-300_000.ndjson",
"sha256": "de73c7154652eddfaf69cdc3b2f824d5c452f095f40a20a1c97bb1b5c4d80ab2"
},
"hackernews-400_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-400_000.ndjson",
"sha256": "c1b00a24689110f366447e434c201c086d6f456d54ed1c4995894102794d8fe7"
},
"hackernews-500_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-500_000.ndjson",
"sha256": "ae98f9dbef8193d750e3e2dbb6a91648941a1edca5f6e82c143e7996f4840083"
},
"hackernews-600_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-600_000.ndjson",
"sha256": "b495fdc72c4a944801f786400f22076ab99186bee9699f67cbab2f21f5b74dbe"
},
"hackernews-700_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-700_000.ndjson",
"sha256": "4b2c63974f3dabaa4954e3d4598b48324d03c522321ac05b0d583f36cb78a28b"
},
"hackernews-800_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-800_000.ndjson",
"sha256": "cb7b6afe0e6caa1be111be256821bc63b0771b2a0e1fad95af7aaeeffd7ba546"
},
"hackernews-900_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-900_000.ndjson",
"sha256": "e1154ddcd398f1c867758a93db5bcb21a07b9e55530c188a2917fdef332d3ba9"
},
"hackernews-1_000_000.ndjson": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/hackernews-1_000_000.ndjson",
"sha256": "27e25efd0b68b159b8b21350d9af76938710cb29ce0393fa71b41c4f3c630ffe"
}
},
"commands": [
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"displayedAttributes": [
"title",
"by",
"score",
"time"
],
"searchableAttributes": [
"title"
],
"filterableAttributes": [
"by"
],
"sortableAttributes": [
"score",
"time"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-100_000.ndjson"
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-200_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-300_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-400_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-500_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-600_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-700_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-800_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-900_000.ndjson"
},
"synchronous": "WaitForResponse"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "hackernews-1_000_000.ndjson"
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -1,44 +0,0 @@
{
"name": "movies.json,no-threads",
"run_count": 2,
"extra_cli_args": [
"--max-indexing-threads=1"
],
"assets": {
"movies.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json",
"sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1"
}
},
"commands": [
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies.json"
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -1,42 +0,0 @@
{
"name": "movies.json",
"run_count": 10,
"extra_cli_args": [],
"assets": {
"movies.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies.json",
"sha256": "5b6e4cb660bc20327776e8a33ea197b43d9ec84856710ead1cc87ab24df77de1"
}
},
"commands": [
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies.json"
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -11,34 +11,5 @@ license.workspace = true
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0.79"
build-info = { version = "1.7.0", path = "../build-info" }
cargo_metadata = "0.18.1"
clap = { version = "4.4.14", features = ["derive"] }
futures-core = "0.3.30"
futures-util = "0.3.30"
reqwest = { version = "0.11.23", features = [
"stream",
"json",
"rustls-tls",
], default_features = false }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111"
sha2 = "0.10.8"
sysinfo = "0.30.5"
time = { version = "0.3.32", features = [
"serde",
"serde-human-readable",
"macros",
] }
tokio = { version = "1.35.1", features = [
"rt",
"net",
"time",
"process",
"signal",
] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"
tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
uuid = { version = "1.7.0", features = ["v7", "serde"] }

View File

@ -1,250 +0,0 @@
use std::collections::BTreeMap;
use std::io::{Read as _, Seek as _, Write as _};
use anyhow::{bail, Context};
use futures_util::TryStreamExt as _;
use serde::Deserialize;
use sha2::Digest;
use super::client::Client;
#[derive(Deserialize, Clone)]
pub struct Asset {
pub local_location: Option<String>,
pub remote_location: Option<String>,
#[serde(default)]
pub format: AssetFormat,
pub sha256: Option<String>,
}
#[derive(Deserialize, Default, Copy, Clone)]
pub enum AssetFormat {
#[default]
Auto,
Json,
NdJson,
Raw,
}
impl AssetFormat {
pub fn to_content_type(self, filename: &str) -> &'static str {
match self {
AssetFormat::Auto => Self::auto_detect(filename).to_content_type(filename),
AssetFormat::Json => "application/json",
AssetFormat::NdJson => "application/x-ndjson",
AssetFormat::Raw => "application/octet-stream",
}
}
fn auto_detect(filename: &str) -> Self {
let path = std::path::Path::new(filename);
match path.extension().and_then(|extension| extension.to_str()) {
Some(extension) if extension.eq_ignore_ascii_case("json") => Self::Json,
Some(extension) if extension.eq_ignore_ascii_case("ndjson") => Self::NdJson,
extension => {
tracing::warn!(asset = filename, ?extension, "asset has format `Auto`, but extension was not recognized. Specify `Raw` format to suppress this warning.");
AssetFormat::Raw
}
}
}
}
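A quick illustration of the auto-detection above (assumes the `AssetFormat` impl just shown is in scope; the filenames mirror the workload assets removed earlier in this diff):
fn main() {
    // `Auto` resolves the content type from the file extension.
    assert_eq!(AssetFormat::Auto.to_content_type("movies.json"), "application/json");
    assert_eq!(AssetFormat::Auto.to_content_type("hackernews-100_000.ndjson"), "application/x-ndjson");
    // Unrecognized extensions fall back to Raw and emit a warning.
    assert_eq!(AssetFormat::Auto.to_content_type("dump.bin"), "application/octet-stream");
}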
pub fn fetch_asset(
name: &str,
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<(std::fs::File, AssetFormat)> {
let asset =
assets.get(name).with_context(|| format!("could not find asset with name '{name}'"))?;
let filename = if let Some(local_filename) = &asset.local_location {
local_filename.clone()
} else {
format!("{asset_folder}/{name}")
};
Ok((
std::fs::File::open(&filename)
.with_context(|| format!("could not open asset '{name}' at '{filename}'"))?,
asset.format,
))
}
#[tracing::instrument(skip(client, assets), fields(asset_count = assets.len()))]
pub async fn fetch_assets(
client: &Client,
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<()> {
let mut download_tasks = tokio::task::JoinSet::new();
for (name, asset) in assets {
// trying local
if let Some(local) = &asset.local_location {
match std::fs::File::open(local) {
Ok(file) => {
if check_sha256(name, asset, file)? {
continue;
} else {
tracing::warn!(asset = name, file = local, "found local resource for asset but hash differed, skipping to asset store");
}
}
Err(error) => match error.kind() {
std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */
}
_ => tracing::warn!(
error = &error as &dyn std::error::Error,
"error checking local resource, skipping to asset store"
),
},
}
}
// checking asset store
let store_filename = format!("{}/{}", asset_folder, name);
match std::fs::File::open(&store_filename) {
Ok(file) => {
if check_sha256(name, asset, file)? {
continue;
} else {
tracing::warn!(asset = name, file = store_filename, "found resource for asset in asset store, but hash differed, skipping to remote method");
}
}
Err(error) => match error.kind() {
std::io::ErrorKind::NotFound => { /* file does not exist, go to remote, no need for logs */
}
_ => tracing::warn!(
error = &error as &dyn std::error::Error,
"error checking resource in store, skipping to remote method"
),
},
}
// downloading remote
match &asset.remote_location {
Some(location) => {
std::fs::create_dir_all(asset_folder).with_context(|| format!("could not create asset folder at {asset_folder}"))?;
download_tasks.spawn({
let client = client.clone();
let name = name.to_string();
let location = location.to_string();
let store_filename = store_filename.clone();
let asset = asset.clone();
download_asset(client, name, asset, location, store_filename)});
},
None => bail!("asset {name} has no remote location, but was not found locally or in the asset store"),
}
}
while let Some(res) = download_tasks.join_next().await {
res.context("download task panicked")?.context("download task failed")?;
}
Ok(())
}
fn check_sha256(name: &str, asset: &Asset, mut file: std::fs::File) -> anyhow::Result<bool> {
let mut bytes = Vec::new();
file.read_to_end(&mut bytes).with_context(|| format!("hashing file for asset {name}"))?;
let mut file_hash = sha2::Sha256::new();
file_hash.update(&bytes);
let file_hash = file_hash.finalize();
let file_hash = format!("{:x}", file_hash);
tracing::debug!(hash = file_hash, "hashed local file");
Ok(match &asset.sha256 {
Some(hash) => {
tracing::debug!(hash, "hash from workload");
if hash.to_ascii_lowercase() == file_hash {
true
} else {
tracing::warn!(
file_hash,
asset_hash = hash.to_ascii_lowercase(),
"hashes don't match"
);
false
}
}
None => {
tracing::warn!(sha256 = file_hash, "Skipping hash for asset {name} that doesn't have one. Please add it to the workload file");
true
}
})
}
#[tracing::instrument(skip(client, asset, name), fields(asset = name))]
async fn download_asset(
client: Client,
name: String,
asset: Asset,
src: String,
dest_filename: String,
) -> anyhow::Result<()> {
let context = || format!("failure downloading asset {name} from {src}");
let response = client.get(&src).send().await.with_context(context)?;
let file = std::fs::File::options()
.create(true)
.truncate(true)
.write(true)
.read(true)
.open(&dest_filename)
.with_context(|| format!("creating destination file {dest_filename}"))
.with_context(context)?;
let mut dest = std::io::BufWriter::new(
file.try_clone().context("cloning I/O handle").with_context(context)?,
);
let total_len: Option<u64> = response
.headers()
.get(reqwest::header::CONTENT_LENGTH)
.and_then(|value| value.to_str().ok())
.and_then(|value| value.parse().ok());
let progress = tokio::spawn({
let name = name.clone();
async move {
loop {
match file.metadata().context("could not get file metadata") {
Ok(metadata) => {
let len = metadata.len();
tracing::info!(
asset = name,
downloaded_bytes = len,
total_bytes = total_len,
"asset download in progress"
);
}
Err(error) => {
tracing::warn!(%error, "could not get file metadata");
}
}
tokio::time::sleep(std::time::Duration::from_secs(60)).await;
}
}
});
let writing_context = || format!("while writing to destination file at {dest_filename}");
let mut response = response.bytes_stream();
while let Some(bytes) =
response.try_next().await.context("while downloading file").with_context(context)?
{
dest.write_all(&bytes).with_context(writing_context).with_context(context)?;
}
progress.abort();
let mut file = dest.into_inner().with_context(writing_context).with_context(context)?;
file.rewind().context("while rewinding asset file")?;
if !check_sha256(&name, &asset, file)? {
bail!("asset '{name}': sha256 mismatch for file {dest_filename} downloaded from {src}")
}
Ok(())
}

View File

@ -1,80 +0,0 @@
use anyhow::Context;
use serde::Deserialize;
#[derive(Debug, Clone)]
pub struct Client {
base_url: Option<String>,
client: reqwest::Client,
}
impl Client {
pub fn new(
base_url: Option<String>,
api_key: Option<&str>,
timeout: Option<std::time::Duration>,
) -> anyhow::Result<Self> {
let mut headers = reqwest::header::HeaderMap::new();
if let Some(api_key) = api_key {
headers.append(
reqwest::header::AUTHORIZATION,
reqwest::header::HeaderValue::from_str(&format!("Bearer {api_key}"))
.context("Invalid authorization header")?,
);
}
let client = reqwest::ClientBuilder::new().default_headers(headers);
let client = if let Some(timeout) = timeout { client.timeout(timeout) } else { client };
let client = client.build()?;
Ok(Self { base_url, client })
}
pub fn request(&self, method: reqwest::Method, route: &str) -> reqwest::RequestBuilder {
if let Some(base_url) = &self.base_url {
if route.is_empty() {
self.client.request(method, base_url)
} else {
self.client.request(method, format!("{}/{}", base_url, route))
}
} else {
self.client.request(method, route)
}
}
pub fn get(&self, route: &str) -> reqwest::RequestBuilder {
self.request(reqwest::Method::GET, route)
}
pub fn put(&self, route: &str) -> reqwest::RequestBuilder {
self.request(reqwest::Method::PUT, route)
}
pub fn post(&self, route: &str) -> reqwest::RequestBuilder {
self.request(reqwest::Method::POST, route)
}
pub fn delete(&self, route: &str) -> reqwest::RequestBuilder {
self.request(reqwest::Method::DELETE, route)
}
}
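// Rough usage sketch (URL and timeout are illustrative): build a client pointing at a local
// Meilisearch instance and issue a GET to a route relative to the base URL.
//
//     let client = Client::new(
//         Some("http://127.0.0.1:7700".into()),
//         None,
//         Some(std::time::Duration::from_secs(60)),
//     )?;
//     let response = client.get("health").send().await?;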
#[derive(Debug, Clone, Copy, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
pub enum Method {
Get,
Post,
Patch,
Delete,
Put,
}
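// With `rename_all = "SCREAMING_SNAKE_CASE"`, workload files spell these as "GET", "POST",
// "PATCH", "DELETE" and "PUT".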
impl From<Method> for reqwest::Method {
fn from(value: Method) -> Self {
match value {
Method::Get => Self::GET,
Method::Post => Self::POST,
Method::Patch => Self::PATCH,
Method::Delete => Self::DELETE,
Method::Put => Self::PUT,
}
}
}

View File

@ -1,194 +0,0 @@
use std::collections::BTreeMap;
use std::fmt::Display;
use std::io::Read as _;
use anyhow::{bail, Context as _};
use serde::Deserialize;
use super::assets::{fetch_asset, Asset};
use super::client::{Client, Method};
#[derive(Clone, Deserialize)]
pub struct Command {
pub route: String,
pub method: Method,
#[serde(default)]
pub body: Body,
#[serde(default)]
pub synchronous: SyncMode,
}
#[derive(Default, Clone, Deserialize)]
#[serde(untagged)]
pub enum Body {
Inline {
inline: serde_json::Value,
},
Asset {
asset: String,
},
#[default]
Empty,
}
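// Because `Body` is untagged, a command in a workload file can provide either
// `"body": {"inline": { ... }}` for a literal JSON payload or `"body": {"asset": "<asset name>"}`
// to send the contents of a fetched asset; omitting `body` yields `Empty`
// (sketch; asset names are workload-specific).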
impl Body {
pub fn get(
self,
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<Option<(Vec<u8>, &'static str)>> {
Ok(match self {
Body::Inline { inline: body } => Some((
serde_json::to_vec(&body)
.context("serializing to bytes")
.context("while getting inline body")?,
"application/json",
)),
Body::Asset { asset: name } => Some({
let context = || format!("while getting body from asset '{name}'");
let (mut file, format) =
fetch_asset(&name, assets, asset_folder).with_context(context)?;
let mut buf = Vec::new();
file.read_to_end(&mut buf).with_context(context)?;
(buf, format.to_content_type(&name))
}),
Body::Empty => None,
})
}
}
impl Display for Command {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?} {} ({:?})", self.method, self.route, self.synchronous)
}
}
#[derive(Default, Debug, Clone, Copy, Deserialize)]
pub enum SyncMode {
DontWait,
#[default]
WaitForResponse,
WaitForTask,
}
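// How the modes behave (as implemented below and in `workload::run_commands`): `DontWait` lets the
// batching logic group the command with the following ones so they run concurrently,
// `WaitForResponse` only waits for the HTTP response of each command in the batch, and
// `WaitForTask` additionally polls the tasks route until no task is enqueued or processing.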
pub async fn run_batch(
client: &Client,
batch: &[Command],
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<()> {
let [.., last] = batch else { return Ok(()) };
let sync = last.synchronous;
let mut tasks = tokio::task::JoinSet::new();
for command in batch {
// FIXME: you probably don't want to copy assets every time here
tasks.spawn({
let client = client.clone();
let command = command.clone();
let assets = assets.clone();
let asset_folder = asset_folder.to_owned();
async move { run(client, command, &assets, &asset_folder).await }
});
}
while let Some(result) = tasks.join_next().await {
result
.context("panicked while executing command")?
.context("error while executing command")?;
}
match sync {
SyncMode::DontWait => {}
SyncMode::WaitForResponse => {}
SyncMode::WaitForTask => wait_for_tasks(client).await?,
}
Ok(())
}
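/// Polls `tasks?statuses=enqueued,processing` once per second until the response's `total` field
/// reaches 0.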
async fn wait_for_tasks(client: &Client) -> anyhow::Result<()> {
loop {
let response = client
.get("tasks?statuses=enqueued,processing")
.send()
.await
.context("could not wait for tasks")?;
let response: serde_json::Value = response
.json()
.await
.context("could not deserialize response to JSON")
.context("could not wait for tasks")?;
match response.get("total") {
Some(serde_json::Value::Number(number)) => {
let number = number.as_u64().with_context(|| {
format!("waiting for tasks: could not parse 'total' as integer, got {}", number)
})?;
if number == 0 {
break;
} else {
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
continue;
}
}
Some(thing_else) => {
bail!(format!(
"waiting for tasks: could not parse 'total' as a number, got '{thing_else}'"
))
}
None => {
bail!(format!(
"waiting for tasks: expected response to contain 'total', got '{response}'"
))
}
}
}
Ok(())
}
#[tracing::instrument(skip(client, command, assets, asset_folder), fields(command = %command))]
pub async fn run(
client: Client,
mut command: Command,
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<()> {
// mem::take the body here to leave an empty body in its place, so that `command` is not partially moved out
let body = std::mem::take(&mut command.body)
.get(assets, asset_folder)
.with_context(|| format!("while getting body for command {command}"))?;
let request = client.request(command.method.into(), &command.route);
let request = if let Some((body, content_type)) = body {
request.body(body).header(reqwest::header::CONTENT_TYPE, content_type)
} else {
request
};
let response =
request.send().await.with_context(|| format!("error sending command: {}", command))?;
let code = response.status();
if code.is_client_error() {
tracing::error!(%command, %code, "error in workload file");
let response: serde_json::Value = response
.json()
.await
.context("could not deserialize response as JSON")
.context("parsing error in workload file when sending command")?;
bail!("error in workload file: server responded with error code {code} and '{response}'")
} else if code.is_server_error() {
tracing::error!(%command, %code, "server error");
let response: serde_json::Value = response
.json()
.await
.context("could not deserialize response as JSON")
.context("parsing server error when sending command")?;
bail!("server error: server responded with error code {code} and '{response}'")
}
Ok(())
}

View File

@ -1,167 +0,0 @@
use std::collections::BTreeMap;
use anyhow::{bail, Context};
use serde_json::json;
use tokio::signal::ctrl_c;
use tokio::task::AbortHandle;
use tracing_trace::processor::span_stats::CallStats;
use uuid::Uuid;
use super::client::Client;
use super::env_info;
use super::workload::Workload;
pub async fn cancel_on_ctrl_c(
invocation_uuid: Uuid,
dashboard_client: Client,
abort_handle: AbortHandle,
) {
tracing::info!("press Ctrl-C to cancel the invocation");
match ctrl_c().await {
Ok(()) => {
tracing::info!(%invocation_uuid, "received Ctrl-C, cancelling invocation");
mark_as_failed(dashboard_client, invocation_uuid, None).await;
abort_handle.abort();
}
Err(error) => tracing::warn!(
error = &error as &dyn std::error::Error,
"failed to listen to Ctrl-C signal, invocation won't be canceled on Ctrl-C"
),
}
}
pub async fn mark_as_failed(
dashboard_client: Client,
invocation_uuid: Uuid,
failure_reason: Option<String>,
) {
let response = dashboard_client
.post("cancel-invocation")
.json(&json!({
"invocation_uuid": invocation_uuid,
"failure_reason": failure_reason,
}))
.send()
.await;
let response = match response {
Ok(response) => response,
Err(response_error) => {
tracing::error!(error = &response_error as &dyn std::error::Error, %invocation_uuid, "could not mark invocation as failed");
return;
}
};
if !response.status().is_success() {
tracing::error!(
%invocation_uuid,
"could not mark invocation as failed: {}",
response.text().await.unwrap()
);
return;
}
tracing::warn!(%invocation_uuid, "marked invocation as failed or canceled");
}
pub async fn send_machine_info(
dashboard_client: &Client,
env: &env_info::Environment,
) -> anyhow::Result<()> {
let response = dashboard_client
.put("machine")
.json(&json!({"hostname": env.hostname}))
.send()
.await
.context("sending machine information")?;
if !response.status().is_success() {
bail!(
"could not send machine information: {} {}",
response.status(),
response.text().await.unwrap_or_else(|_| "unknown".into())
);
}
Ok(())
}
pub async fn create_invocation(
dashboard_client: &Client,
build_info: build_info::BuildInfo,
commit_message: &str,
env: env_info::Environment,
max_workloads: usize,
reason: Option<&str>,
) -> anyhow::Result<Uuid> {
let response = dashboard_client
.put("invocation")
.json(&json!({
"commit": {
"sha1": build_info.commit_sha1,
"message": commit_message,
"commit_date": build_info.commit_timestamp,
"branch": build_info.branch,
"tag": build_info.describe.and_then(|describe| describe.as_tag()),
},
"machine_hostname": env.hostname,
"max_workloads": max_workloads,
"reason": reason
}))
.send()
.await
.context("sending invocation")?;
if !response.status().is_success() {
bail!(
"could not send new invocation: {}",
response.text().await.unwrap_or_else(|_| "unknown".into())
);
}
let invocation_uuid: Uuid =
response.json().await.context("could not deserialize invocation response as JSON")?;
Ok(invocation_uuid)
}
pub async fn create_workload(
dashboard_client: &Client,
invocation_uuid: Uuid,
workload: &Workload,
) -> anyhow::Result<Uuid> {
let response = dashboard_client
.put("workload")
.json(&json!({
"invocation_uuid": invocation_uuid,
"name": &workload.name,
"max_runs": workload.run_count,
}))
.send()
.await
.context("could not create new workload")?;
if !response.status().is_success() {
bail!("creating new workload failed: {}", response.text().await.unwrap())
}
let workload_uuid: Uuid =
response.json().await.context("could not deserialize JSON as UUID")?;
Ok(workload_uuid)
}
pub async fn create_run(
dashboard_client: Client,
workload_uuid: Uuid,
report: &BTreeMap<String, CallStats>,
) -> anyhow::Result<()> {
let response = dashboard_client
.put("run")
.json(&json!({
"workload_uuid": workload_uuid,
"data": report
}))
.send()
.await
.context("sending new run")?;
if !response.status().is_success() {
bail!(
"sending new run failed: {}",
response.text().await.unwrap_or_else(|_| "unknown".into())
)
}
Ok(())
}

View File

@ -1,75 +0,0 @@
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Environment {
pub hostname: Option<String>,
pub cpu: String,
/// Advertised or nominal clock speed in Hertz.
pub clock_speed: u64,
/// Total number of bytes of memory provided by the system.
pub memory: u64,
pub os_type: String,
pub software: Vec<VersionInfo>,
pub user_name: String,
/// Set to `true` when the data was gathered by a manual run,
/// possibly on a developer machine, instead of the usual benchmark server.
pub manual_run: bool,
}
impl Environment {
pub fn generate_from_current_config() -> Self {
use sysinfo::System;
let unknown_string = String::from("Unknown");
let mut system = System::new();
system.refresh_cpu();
system.refresh_cpu_frequency();
system.refresh_memory();
let (cpu, frequency) = match system.cpus().first() {
Some(cpu) => (
format!("{} @ {:.2}GHz", cpu.brand(), cpu.frequency() as f64 / 1000.0),
cpu.frequency() * 1_000_000,
),
None => (unknown_string.clone(), 0),
};
let mut software = Vec::new();
if let Some(distribution) = System::name() {
software
.push(VersionInfo { name: distribution, version: String::from("distribution") });
}
if let Some(kernel) = System::kernel_version() {
software.push(VersionInfo { name: kernel, version: String::from("kernel") });
}
if let Some(os) = System::os_version() {
software.push(VersionInfo { name: os, version: String::from("kernel-release") });
}
if let Some(arch) = System::cpu_arch() {
software.push(VersionInfo { name: arch, version: String::from("arch") });
}
Self {
hostname: System::host_name(),
cpu,
clock_speed: frequency,
memory: system.total_memory(),
os_type: System::long_os_version().unwrap_or(unknown_string.clone()),
user_name: System::name().unwrap_or(unknown_string.clone()),
manual_run: false,
software,
}
}
}
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct VersionInfo {
pub name: String,
pub version: String,
}

View File

@ -1,112 +0,0 @@
use std::collections::BTreeMap;
use anyhow::{bail, Context as _};
use super::assets::Asset;
use super::client::Client;
use super::workload::Workload;
pub async fn kill(mut meilisearch: tokio::process::Child) {
if let Err(error) = meilisearch.kill().await {
tracing::warn!(
error = &error as &dyn std::error::Error,
"while terminating Meilisearch server"
)
}
}
#[tracing::instrument]
pub async fn build() -> anyhow::Result<()> {
let mut command = tokio::process::Command::new("cargo");
command.arg("build").arg("--release").arg("-p").arg("meilisearch");
command.kill_on_drop(true);
let mut builder = command.spawn().context("error building Meilisearch")?;
if !builder.wait().await.context("could not build Meilisearch")?.success() {
bail!("failed building Meilisearch")
}
Ok(())
}
#[tracing::instrument(skip(client, master_key, workload), fields(workload = workload.name))]
pub async fn start(
client: &Client,
master_key: Option<&str>,
workload: &Workload,
asset_folder: &str,
) -> anyhow::Result<tokio::process::Child> {
let mut command = tokio::process::Command::new("cargo");
command
.arg("run")
.arg("--release")
.arg("-p")
.arg("meilisearch")
.arg("--bin")
.arg("meilisearch")
.arg("--");
command.arg("--db-path").arg("./_xtask_benchmark.ms");
if let Some(master_key) = master_key {
command.arg("--master-key").arg(master_key);
}
command.arg("--experimental-enable-logs-route");
for extra_arg in workload.extra_cli_args.iter() {
command.arg(extra_arg);
}
command.kill_on_drop(true);
let mut meilisearch = command.spawn().context("Error starting Meilisearch")?;
wait_for_health(client, &mut meilisearch, &workload.assets, asset_folder).await?;
Ok(meilisearch)
}
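/// Polls the `/health` route up to 100 times, sleeping 500ms between attempts, and bails out early
/// if the Meilisearch process exits in the meantime.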
async fn wait_for_health(
client: &Client,
meilisearch: &mut tokio::process::Child,
assets: &BTreeMap<String, Asset>,
asset_folder: &str,
) -> anyhow::Result<()> {
for i in 0..100 {
let res = super::command::run(client.clone(), health_command(), assets, asset_folder).await;
if res.is_ok() {
// check that this is actually the current Meilisearch instance that answered us
if let Some(exit_code) =
meilisearch.try_wait().context("cannot check Meilisearch server process status")?
{
tracing::error!("Got an health response from a different process");
bail!("Meilisearch server exited early with code {exit_code}");
}
return Ok(());
}
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
// check whether the Meilisearch instance exited early (cut the wait)
if let Some(exit_code) =
meilisearch.try_wait().context("cannot check Meilisearch server process status")?
{
bail!("Meilisearch server exited early with code {exit_code}");
}
tracing::debug!(attempt = i, "Waiting for Meilisearch to go up");
}
bail!("meilisearch is not responding")
}
fn health_command() -> super::command::Command {
super::command::Command {
route: "/health".into(),
method: super::client::Method::Get,
body: Default::default(),
synchronous: super::command::SyncMode::WaitForResponse,
}
}
pub fn delete_db() {
let _ = std::fs::remove_dir_all("./_xtask_benchmark.ms");
}

View File

@ -1,203 +0,0 @@
mod assets;
mod client;
mod command;
mod dashboard;
mod env_info;
mod meili_process;
mod workload;
use std::path::PathBuf;
use anyhow::Context;
use clap::Parser;
use tracing_subscriber::fmt::format::FmtSpan;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::Layer;
use self::client::Client;
use self::workload::Workload;
pub fn default_http_addr() -> String {
"127.0.0.1:7700".to_string()
}
pub fn default_report_folder() -> String {
"./bench/reports/".into()
}
pub fn default_asset_folder() -> String {
"./bench/assets/".into()
}
pub fn default_log_filter() -> String {
"info".into()
}
pub fn default_dashboard_url() -> String {
"http://localhost:9001".into()
}
/// Run benchmarks from a workload
#[derive(Parser, Debug)]
pub struct BenchDeriveArgs {
/// Filename of the workload file; pass multiple filenames
/// to run multiple workloads in the specified order.
///
/// Each workload run will get its own report file.
#[arg(value_name = "WORKLOAD_FILE", last = false)]
workload_file: Vec<PathBuf>,
/// URL of the dashboard.
#[arg(long, default_value_t = default_dashboard_url())]
dashboard_url: String,
/// Directory to output reports.
#[arg(long, default_value_t = default_report_folder())]
report_folder: String,
/// Directory to store the remote assets.
#[arg(long, default_value_t = default_asset_folder())]
asset_folder: String,
/// Log directives
#[arg(short, long, default_value_t = default_log_filter())]
log_filter: String,
/// Benchmark dashboard API key
#[arg(long)]
api_key: Option<String>,
/// Meilisearch master key
#[arg(long)]
master_key: Option<String>,
/// Authentication bearer for fetching assets
#[arg(long)]
assets_key: Option<String>,
/// Reason for the benchmark invocation
#[arg(short, long)]
reason: Option<String>,
}
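// An illustrative invocation (workload path and reason are placeholders), relying on the defaults
// above for the dashboard URL and the report/asset folders:
//
//     cargo xtask bench --reason "local experiment" -- workloads/my-workload.json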
pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> {
// setup logs
let filter: tracing_subscriber::filter::Targets =
args.log_filter.parse().context("invalid --log-filter")?;
let subscriber = tracing_subscriber::registry().with(
tracing_subscriber::fmt::layer()
.with_span_events(FmtSpan::NEW | FmtSpan::CLOSE)
.with_filter(filter),
);
tracing::subscriber::set_global_default(subscriber).context("could not setup logging")?;
// fetch environment and build info
let env = env_info::Environment::generate_from_current_config();
let build_info = build_info::BuildInfo::from_build();
// tokio runtime
let rt = tokio::runtime::Builder::new_current_thread().enable_io().enable_time().build()?;
let _scope = rt.enter();
// setup clients
let assets_client =
Client::new(None, args.assets_key.as_deref(), Some(std::time::Duration::from_secs(3600)))?; // 1h
let dashboard_client = Client::new(
Some(format!("{}/api/v1", args.dashboard_url)),
args.api_key.as_deref(),
Some(std::time::Duration::from_secs(60)),
)?;
// Reporting uses its own client because keeping the stream open to wait for entries
// blocks any other requests.
// It also has no timeout, because we don't know how long recovering the full trace will take.
let logs_client = Client::new(
Some("http://127.0.0.1:7700/logs/stream".into()),
args.master_key.as_deref(),
None,
)?;
let meili_client = Client::new(
Some("http://127.0.0.1:7700".into()),
args.master_key.as_deref(),
Some(std::time::Duration::from_secs(60)),
)?;
// enter runtime
rt.block_on(async {
dashboard::send_machine_info(&dashboard_client, &env).await?;
let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap();
let max_workloads = args.workload_file.len();
let reason: Option<&str> = args.reason.as_deref();
let invocation_uuid = dashboard::create_invocation(&dashboard_client, build_info, commit_message, env, max_workloads, reason).await?;
tracing::info!(workload_count = args.workload_file.len(), "handling workload files");
// main task
let workload_runs = tokio::spawn(
{
let dashboard_client = dashboard_client.clone();
async move {
for workload_file in args.workload_file.iter() {
let workload: Workload = serde_json::from_reader(
std::fs::File::open(workload_file)
.with_context(|| format!("error opening {}", workload_file.display()))?,
)
.with_context(|| format!("error parsing {} as JSON", workload_file.display()))?;
workload::execute(
&assets_client,
&dashboard_client,
&logs_client,
&meili_client,
invocation_uuid,
args.master_key.as_deref(),
workload,
&args,
)
.await?;
}
Ok::<(), anyhow::Error>(())
}});
// handle ctrl-c
let abort_handle = workload_runs.abort_handle();
tokio::spawn({
let dashboard_client = dashboard_client.clone();
dashboard::cancel_on_ctrl_c(invocation_uuid, dashboard_client, abort_handle)
});
// wait for the end of the main task, handle result
match workload_runs.await {
Ok(Ok(_)) => {
tracing::info!("Success");
Ok::<(), anyhow::Error>(())
}
Ok(Err(error)) => {
tracing::error!(%invocation_uuid, error = %error, "invocation failed, attempting to report the failure to dashboard");
dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some(error.to_string())).await;
tracing::warn!(%invocation_uuid, "invocation marked as failed following error");
Err(error)
},
Err(join_error) => {
match join_error.try_into_panic() {
Ok(panic) => {
tracing::error!("invocation panicked, attempting to report the failure to dashboard");
dashboard::mark_as_failed(dashboard_client, invocation_uuid, Some("Panicked".into())).await;
std::panic::resume_unwind(panic)
}
Err(_) => {
tracing::warn!("task was canceled");
Ok(())
}
}
},
}
})?;
Ok(())
}

View File

@ -1,262 +0,0 @@
use std::collections::BTreeMap;
use std::fs::File;
use std::io::{Seek as _, Write as _};
use anyhow::{bail, Context as _};
use futures_util::TryStreamExt as _;
use serde::Deserialize;
use serde_json::json;
use tokio::task::JoinHandle;
use uuid::Uuid;
use super::assets::Asset;
use super::client::Client;
use super::command::SyncMode;
use super::BenchDeriveArgs;
use crate::bench::{assets, dashboard, meili_process};
#[derive(Deserialize)]
pub struct Workload {
pub name: String,
pub run_count: u16,
pub extra_cli_args: Vec<String>,
pub assets: BTreeMap<String, Asset>,
pub commands: Vec<super::command::Command>,
}
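// A minimal workload file deserializing into this struct might look like the following (sketch;
// the asset entry, route and hash are placeholders, and the asset fields are assumed to keep
// their Rust names in JSON):
//
//     {
//       "name": "movies-example",
//       "run_count": 2,
//       "extra_cli_args": [],
//       "assets": {
//         "movies.json": {
//           "remote_location": "https://example.org/movies.json",
//           "sha256": "<hex digest>"
//         }
//       },
//       "commands": [
//         {
//           "route": "indexes/movies/documents",
//           "method": "POST",
//           "body": { "asset": "movies.json" },
//           "synchronous": "WaitForTask"
//         }
//       ]
//     }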
async fn run_commands(
dashboard_client: &Client,
logs_client: &Client,
meili_client: &Client,
workload_uuid: Uuid,
workload: &Workload,
args: &BenchDeriveArgs,
run_number: u16,
) -> anyhow::Result<JoinHandle<anyhow::Result<File>>> {
let report_folder = &args.report_folder;
let workload_name = &workload.name;
std::fs::create_dir_all(report_folder)
.with_context(|| format!("could not create report directory at {report_folder}"))?;
let trace_filename = format!("{report_folder}/{workload_name}-{run_number}-trace.json");
let report_filename = format!("{report_folder}/{workload_name}-{run_number}-report.json");
let report_handle = start_report(logs_client, trace_filename).await?;
for batch in workload
.commands
.as_slice()
.split_inclusive(|command| !matches!(command.synchronous, SyncMode::DontWait))
{
super::command::run_batch(meili_client, batch, &workload.assets, &args.asset_folder)
.await?;
}
let processor =
stop_report(dashboard_client, logs_client, workload_uuid, report_filename, report_handle)
.await?;
Ok(processor)
}
#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner
#[tracing::instrument(skip(assets_client, dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = workload.name))]
pub async fn execute(
assets_client: &Client,
dashboard_client: &Client,
logs_client: &Client,
meili_client: &Client,
invocation_uuid: Uuid,
master_key: Option<&str>,
workload: Workload,
args: &BenchDeriveArgs,
) -> anyhow::Result<()> {
assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?;
let workload_uuid =
dashboard::create_workload(dashboard_client, invocation_uuid, &workload).await?;
let mut tasks = Vec::new();
for i in 0..workload.run_count {
tasks.push(
execute_run(
dashboard_client,
logs_client,
meili_client,
workload_uuid,
master_key,
&workload,
args,
i,
)
.await?,
);
}
let mut reports = Vec::with_capacity(workload.run_count as usize);
for task in tasks {
reports.push(
task.await
.context("task panicked while processing report")?
.context("task failed while processing report")?,
);
}
tracing::info!(workload = workload.name, "Successful workload");
Ok(())
}
#[allow(clippy::too_many_arguments)] // not best code quality, but this is a benchmark runner
#[tracing::instrument(skip(dashboard_client, logs_client, meili_client, workload, master_key, args), fields(workload = %workload.name))]
async fn execute_run(
dashboard_client: &Client,
logs_client: &Client,
meili_client: &Client,
workload_uuid: Uuid,
master_key: Option<&str>,
workload: &Workload,
args: &BenchDeriveArgs,
run_number: u16,
) -> anyhow::Result<tokio::task::JoinHandle<anyhow::Result<std::fs::File>>> {
meili_process::delete_db();
meili_process::build().await?;
let meilisearch =
meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?;
let processor = run_commands(
dashboard_client,
logs_client,
meili_client,
workload_uuid,
workload,
args,
run_number,
)
.await?;
meili_process::kill(meilisearch).await;
tracing::info!(run_number, "Successful run");
Ok(processor)
}
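/// Asks the logs route to start a "profile" stream targeting `indexing::=trace` and spawns a task
/// that copies the streamed trace into `filename`; the returned handle resolves to that file once
/// the stream ends.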
async fn start_report(
logs_client: &Client,
filename: String,
) -> anyhow::Result<tokio::task::JoinHandle<anyhow::Result<std::fs::File>>> {
let report_file = std::fs::File::options()
.create(true)
.truncate(true)
.write(true)
.read(true)
.open(&filename)
.with_context(|| format!("could not create file at {filename}"))?;
let mut report_file = std::io::BufWriter::new(report_file);
let response = logs_client
.post("")
.json(&json!({
"mode": "profile",
"target": "indexing::=trace"
}))
.send()
.await
.context("failed to start report")?;
let code = response.status();
if code.is_client_error() {
tracing::error!(%code, "request error when trying to start report");
let response: serde_json::Value = response
.json()
.await
.context("could not deserialize response as JSON")
.context("response error when trying to start report")?;
bail!(
"request error when trying to start report: server responded with error code {code} and '{response}'"
)
} else if code.is_server_error() {
tracing::error!(%code, "server error when trying to start report");
let response: serde_json::Value = response
.json()
.await
.context("could not deserialize response as JSON")
.context("response error trying to start report")?;
bail!("server error when trying to start report: server responded with error code {code} and '{response}'")
}
Ok(tokio::task::spawn(async move {
let mut stream = response.bytes_stream();
while let Some(bytes) = stream.try_next().await.context("while waiting for report")? {
report_file
.write_all(&bytes)
.with_context(|| format!("while writing report to {filename}"))?;
}
report_file.into_inner().with_context(|| format!("while writing report to {filename}"))
}))
}
async fn stop_report(
dashboard_client: &Client,
logs_client: &Client,
workload_uuid: Uuid,
filename: String,
report_handle: tokio::task::JoinHandle<anyhow::Result<std::fs::File>>,
) -> anyhow::Result<tokio::task::JoinHandle<anyhow::Result<std::fs::File>>> {
let response = logs_client.delete("").send().await.context("while stopping report")?;
if !response.status().is_success() {
bail!("received HTTP {} while stopping report", response.status())
}
let mut file = tokio::time::timeout(std::time::Duration::from_secs(1000), report_handle)
.await
.context("while waiting for the end of the report")?
.context("report writing task panicked")?
.context("while writing report")?;
file.rewind().context("while rewinding report file")?;
let process_handle = tokio::task::spawn({
let dashboard_client = dashboard_client.clone();
async move {
let span = tracing::info_span!("processing trace to report", filename);
let _guard = span.enter();
let report = tracing_trace::processor::span_stats::to_call_stats(
tracing_trace::TraceReader::new(std::io::BufReader::new(file)),
)
.context("could not convert trace to report")?;
let context = || format!("writing report to {filename}");
dashboard::create_run(dashboard_client, workload_uuid, &report).await?;
let mut output_file = std::io::BufWriter::new(
std::fs::File::options()
.create(true)
.truncate(true)
.write(true)
.read(true)
.open(&filename)
.with_context(context)?,
);
for (key, value) in report {
serde_json::to_writer(&mut output_file, &json!({key: value}))
.context("serializing span stat")?;
writeln!(&mut output_file).with_context(context)?;
}
output_file.flush().with_context(context)?;
let mut output_file = output_file.into_inner().with_context(context)?;
output_file.rewind().context("could not rewind output_file").with_context(context)?;
Ok(output_file)
}
});
Ok(process_handle)
}

View File

@ -1 +0,0 @@
pub mod bench;

View File

@ -1,7 +1,6 @@
use std::collections::HashSet;
use clap::Parser;
use xtask::bench::BenchDeriveArgs;
/// List features available in the workspace
#[derive(Parser, Debug)]
@ -18,16 +17,13 @@ struct ListFeaturesDeriveArgs {
#[command(bin_name = "cargo xtask")]
enum Command {
ListFeatures(ListFeaturesDeriveArgs),
Bench(BenchDeriveArgs),
}
fn main() -> anyhow::Result<()> {
fn main() {
let args = Command::parse();
match args {
Command::ListFeatures(args) => list_features(args),
Command::Bench(args) => xtask::bench::run(args)?,
}
Ok(())
}
fn list_features(args: ListFeaturesDeriveArgs) {