Compare commits

..

28 Commits

Author SHA1 Message Date
Clément Renault
02a40645e2 Add TODOs to the prefix functions 2023-10-19 17:58:52 +02:00
Many the fish
066221fd2b Merge pull request #4141 from meilisearch/diff-indexing-searchable
Diff indexing searchable
2023-10-19 15:00:16 +02:00
ManyTheFish
b8fed737ef update extract word pair proximity to support deladd obkvs 2023-10-19 14:18:14 +02:00
ManyTheFish
c63ff5298b update extract word position docids 2023-10-19 13:27:07 +02:00
ManyTheFish
d50408d670 update extract word docids 2023-10-19 11:59:12 +02:00
ManyTheFish
e0dc413521 Make script language docids map taking a tuple of roaring bitmaps expressing the deletions and the additions 2023-10-19 11:59:12 +02:00
Clément Renault
0f6a0b1ab8 Merge pull request #4136 from meilisearch/diff-indexing-facet-values
Diff Indexing on the facet values extractors
2023-10-19 11:22:34 +02:00
Clément Renault
061f490204 Update extract_facet_string_docids to support deladd obkvs 2023-10-19 10:08:15 +02:00
Clément Renault
5c43ff72c1 Update extract_facet_number_docids to support deladd obkvs 2023-10-18 17:40:13 +02:00
Clément Renault
c445e9daec Rename docid_fid into fid_docid 2023-10-18 13:53:58 +02:00
Clément Renault
178a9802fa Implement all the facet extraction paths and simplify them 2023-10-18 11:02:19 +02:00
Clément Renault
7d546b9c22 Generate the DelAdd for is_null, is_empty, and exists 2023-10-18 11:02:19 +02:00
Clément Renault
c829feb40b Work on fid docid facet values rewrite 2023-10-18 11:02:19 +02:00
ManyTheFish
b88fd7994c Support diff indexing on extract_docid_word_positions 2023-10-16 14:58:11 +02:00
ManyTheFish
096d7705c7 Make the transform struct return diff-based documents obkvs 2023-10-12 11:46:56 +02:00
ManyTheFish
e8f8730467 deactivate prefix dbs 2023-10-10 16:17:03 +02:00
ManyTheFish
26ef0b3a07 clean PR warnings 2023-10-10 14:04:56 +02:00
ManyTheFish
20394fda04 Split wpp in several sorters 2023-10-10 14:04:56 +02:00
ManyTheFish
27161bcd05 Fix word pair proximity 2023-10-10 14:04:56 +02:00
ManyTheFish
04fd44b5e2 Use a vecDeque in wpp database 2023-10-10 14:04:56 +02:00
ManyTheFish
9078e60024 Generalize usage of CboRoaringBitmap codec to ease the use 2023-10-10 14:04:56 +02:00
ManyTheFish
8fb96b8274 Add buffer to the obkv writter 2023-10-10 14:04:56 +02:00
ManyTheFish
50ba751244 Compute word_fid_docids before word_docids and exact_word_docids 2023-10-10 14:04:56 +02:00
ManyTheFish
f36c36e368 add puffin in sorter into reeder function 2023-10-10 14:04:56 +02:00
ManyTheFish
c2dcd66d32 Fix 2023-10-10 14:04:56 +02:00
ManyTheFish
d4594306d3 Fix fid_word_docids 2023-10-10 14:04:56 +02:00
ManyTheFish
93d0680903 Add usefull debug assert before key insertion in database 2023-10-10 14:04:56 +02:00
ManyTheFish
01101d55ac Wip 2023-10-10 14:04:56 +02:00
103 changed files with 2336 additions and 3128 deletions

View File

@@ -7,17 +7,19 @@ assignees: ''
---
Related product team resources: [PRD]() (_internal only_)
Related product team resources: [roadmap card]() (_internal only_) and [PRD]() (_internal only_)
Related product discussion:
Related spec: WIP
## Motivation
<!---Copy/paste the information in PRD or briefly detail the product motivation. Ask product team if any hesitation.-->
<!---Copy/paste the information in the roadmap resources or briefly detail the product motivation. Ask product team if any hesitation.-->
## Usage
<!---Link to the public part of the PRD, or to the related product discussion for experimental features-->
<!---Write a quick description of the usage if the usage has already been defined-->
Refer to the final spec to know the details and the final decisions about the usage.
## TODO

View File

@@ -8,11 +8,11 @@ env:
jobs:
run-benchmarks-on-comment:
if: startsWith(github.event.comment.body, '/benchmark')
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
@@ -27,25 +27,14 @@ jobs:
reaction-type: "eyes"
repo-token: ${{ env.GH_TOKEN }}
- uses: xt0rted/pull-request-comment-branch@v2
id: comment-branch
with:
repo_token: ${{ env.GH_TOKEN }}
- uses: actions/checkout@v3
if: success()
with:
fetch-depth: 0 # fetch full history to be able to get main commit sha
ref: ${{ steps.comment-branch.outputs.head_ref }}
# Set variables
- name: Set current branch name
shell: bash
run: echo "name=$(git rev-parse --abbrev-ref HEAD)" >> $GITHUB_OUTPUT
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "name=$(git rev-parse --abbrev-ref HEAD | tr '/' '_')" >> $GITHUB_OUTPUT
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
@@ -87,11 +76,9 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
run: |
set -x
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
export base=$(git log --pretty=%p -n 1)
echo 'Here are your benchmarks diff 👊' >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
gh pr comment ${{ steps.current_branch.outputs.name }} --body-file body.txt
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt

902
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,7 +2,6 @@
resolver = "2"
members = [
"meilisearch",
"meilitool",
"meilisearch-types",
"meilisearch-auth",
"meili-snap",
@@ -19,7 +18,7 @@ members = [
]
[workspace.package]
version = "1.5.0"
version = "1.4.0"
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
description = "Meilisearch HTTP server"
homepage = "https://meilisearch.com"

View File

@@ -3,7 +3,7 @@ FROM rust:alpine3.16 AS compiler
RUN apk add -q --update-cache --no-cache build-base openssl-dev
WORKDIR /
WORKDIR /meilisearch
ARG COMMIT_SHA
ARG COMMIT_DATE
@@ -17,7 +17,7 @@ RUN set -eux; \
if [ "$apkArch" = "aarch64" ]; then \
export JEMALLOC_SYS_WITH_LG_PAGE=16; \
fi && \
cargo build --release -p meilisearch -p meilitool
cargo build --release
# Run
FROM alpine:3.16
@@ -28,10 +28,9 @@ ENV MEILI_SERVER_PROVIDER docker
RUN apk update --quiet \
&& apk add -q --no-cache libgcc tini curl
# add meilisearch and meilitool to the `/bin` so you can run it from anywhere
# and it's easy to find.
COPY --from=compiler /target/release/meilisearch /bin/meilisearch
COPY --from=compiler /target/release/meilitool /bin/meilitool
# add meilisearch to the `/bin` so you can run it from anywhere and it's easy
# to find.
COPY --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
# To stay compatible with the older version of the container (pre v0.27.0) we're
# going to symlink the meilisearch binary in the path to `/meilisearch`
RUN ln -s /bin/meilisearch /meilisearch

View File

@@ -1,14 +1,14 @@
# Profiling Meilisearch
Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options [in Puffin Viewer](https://github.com/embarkstudios/puffin#ui).
Search engine technologies are complex pieces of software that require thorough profiling tools. We chose to use [Puffin](https://github.com/EmbarkStudios/puffin), which the Rust gaming industry uses extensively. You can export and import the profiling reports using the top bar's _File_ menu options.
![An example profiling with Puffin viewer](assets/profiling-example.png)
## Profiling the Indexing Process
When you enable [the `exportPuffinReports` experimental feature](https://www.meilisearch.com/docs/learn/experimental/overview) of Meilisearch, Puffin reports with the `.puffin` extension will be automatically exported to disk. When this option is enabled, the engine will automatically create a "frame" whenever it executes the `IndexScheduler::tick` method.
When you enable the `profile-with-puffin` feature of Meilisearch, a Puffin HTTP server will run on Meilisearch and listen on the default _0.0.0.0:8585_ address. This server will record a "frame" whenever it executes the `IndexScheduler::tick` method.
[Puffin Viewer](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) is used to analyze the reports. Those reports show areas where Meilisearch spent time during indexing.
Once your Meilisearch is running and awaits new indexation operations, you must [install and run the `puffin_viewer` tool](https://github.com/EmbarkStudios/puffin/tree/main/puffin_viewer) to see the profiling results. I advise you to run the viewer with the `RUST_LOG=puffin_http::client=debug` environment variable to see the client trying to connect to your server.
Another piece of advice on the Puffin viewer UI interface is to consider the _Merge children with same ID_ option. It can hide the exact actual timings at which events were sent. Please turn it off when you see strange gaps on the Flamegraph. It can help.

View File

@@ -25,12 +25,6 @@
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
---
### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
---
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
<p align="center" name="demo">

View File

@@ -19,7 +19,6 @@ one indexing operation.
use std::collections::{BTreeSet, HashSet};
use std::ffi::OsStr;
use std::fmt;
use std::fs::{self, File};
use std::io::BufWriter;
@@ -200,29 +199,6 @@ impl Batch {
}
}
impl fmt::Display for Batch {
/// A text used when we debug the profiling reports.
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let index_uid = self.index_uid();
let tasks = self.ids();
match self {
Batch::TaskCancelation { .. } => f.write_str("TaskCancelation")?,
Batch::TaskDeletion(_) => f.write_str("TaskDeletion")?,
Batch::SnapshotCreation(_) => f.write_str("SnapshotCreation")?,
Batch::Dump(_) => f.write_str("Dump")?,
Batch::IndexOperation { op, .. } => write!(f, "{op}")?,
Batch::IndexCreation { .. } => f.write_str("IndexCreation")?,
Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?,
Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?,
Batch::IndexSwap { .. } => f.write_str("IndexSwap")?,
};
match index_uid {
Some(name) => f.write_fmt(format_args!(" on {name:?} from tasks: {tasks:?}")),
None => f.write_fmt(format_args!(" from tasks: {tasks:?}")),
}
}
}
impl IndexOperation {
pub fn index_uid(&self) -> &str {
match self {
@@ -237,30 +213,6 @@ impl IndexOperation {
}
}
impl fmt::Display for IndexOperation {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
IndexOperation::DocumentOperation { .. } => {
f.write_str("IndexOperation::DocumentOperation")
}
IndexOperation::DocumentDeletion { .. } => {
f.write_str("IndexOperation::DocumentDeletion")
}
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
}
IndexOperation::DocumentClear { .. } => f.write_str("IndexOperation::DocumentClear"),
IndexOperation::Settings { .. } => f.write_str("IndexOperation::Settings"),
IndexOperation::DocumentClearAndSetting { .. } => {
f.write_str("IndexOperation::DocumentClearAndSetting")
}
IndexOperation::SettingsAndDocumentOperation { .. } => {
f.write_str("IndexOperation::SettingsAndDocumentOperation")
}
}
}
}
impl IndexScheduler {
/// Convert an [`BatchKind`](crate::autobatcher::BatchKind) into a [`Batch`].
///
@@ -629,7 +581,7 @@ impl IndexScheduler {
self.breakpoint(crate::Breakpoint::InsideProcessBatch);
}
puffin::profile_function!(batch.to_string());
puffin::profile_function!(format!("{:?}", batch));
match batch {
Batch::TaskCancelation { mut task, previous_started_at, previous_processing_tasks } => {
@@ -896,7 +848,7 @@ impl IndexScheduler {
})?;
// 4. Dump experimental feature settings
let features = self.features().runtime_features();
let features = self.features()?.runtime_features();
dump.create_experimental_features(features)?;
let dump_uid = started_at.format(format_description!(
@@ -923,10 +875,6 @@ impl IndexScheduler {
self.index_mapper.index(&rtxn, &index_uid)?
};
// the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick
*self.currently_updating_index.write().unwrap() =
Some((index_uid.clone(), index.clone()));
let mut index_wtxn = index.write_txn()?;
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
index_wtxn.commit()?;

View File

@@ -1,8 +1,6 @@
use std::sync::{Arc, RwLock};
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RwTxn};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
use crate::error::FeatureNotEnabledError;
use crate::Result;
@@ -11,19 +9,20 @@ const EXPERIMENTAL_FEATURES: &str = "experimental-features";
#[derive(Clone)]
pub(crate) struct FeatureData {
persisted: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
runtime: Arc<RwLock<RuntimeTogglableFeatures>>,
runtime: Database<Str, SerdeJson<RuntimeTogglableFeatures>>,
instance: InstanceTogglableFeatures,
}
#[derive(Debug, Clone, Copy)]
pub struct RoFeatures {
runtime: RuntimeTogglableFeatures,
instance: InstanceTogglableFeatures,
}
impl RoFeatures {
fn new(data: &FeatureData) -> Self {
let runtime = data.runtime_features();
Self { runtime }
fn new(txn: RoTxn<'_>, data: &FeatureData) -> Result<Self> {
let runtime = data.runtime_features(txn)?;
Ok(Self { runtime, instance: data.instance })
}
pub fn runtime_features(&self) -> RuntimeTogglableFeatures {
@@ -44,13 +43,13 @@ impl RoFeatures {
}
pub fn check_metrics(&self) -> Result<()> {
if self.runtime.metrics {
if self.instance.metrics {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action: "Getting metrics",
feature: "metrics",
issue_link: "https://github.com/meilisearch/product/discussions/625",
issue_link: "https://github.com/meilisearch/meilisearch/discussions/3518",
}
.into())
}
@@ -68,36 +67,15 @@ impl RoFeatures {
.into())
}
}
pub fn check_puffin(&self) -> Result<()> {
if self.runtime.export_puffin_reports {
Ok(())
} else {
Err(FeatureNotEnabledError {
disabled_action: "Outputting Puffin reports to disk",
feature: "export puffin reports",
issue_link: "https://github.com/meilisearch/product/discussions/693",
}
.into())
}
}
}
impl FeatureData {
pub fn new(env: &Env, instance_features: InstanceTogglableFeatures) -> Result<Self> {
let mut wtxn = env.write_txn()?;
let runtime_features_db = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
let runtime_features = env.create_database(&mut wtxn, Some(EXPERIMENTAL_FEATURES))?;
wtxn.commit()?;
let txn = env.read_txn()?;
let persisted_features: RuntimeTogglableFeatures =
runtime_features_db.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default();
let runtime = Arc::new(RwLock::new(RuntimeTogglableFeatures {
metrics: instance_features.metrics || persisted_features.metrics,
..persisted_features
}));
Ok(Self { persisted: runtime_features_db, runtime })
Ok(Self { runtime: runtime_features, instance: instance_features })
}
pub fn put_runtime_features(
@@ -105,25 +83,16 @@ impl FeatureData {
mut wtxn: RwTxn,
features: RuntimeTogglableFeatures,
) -> Result<()> {
self.persisted.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
self.runtime.put(&mut wtxn, EXPERIMENTAL_FEATURES, &features)?;
wtxn.commit()?;
// safe to unwrap, the lock will only fail if:
// 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
// 2. there's a panic while the thread is held -> it is only used for an assignment here.
let mut toggled_features = self.runtime.write().unwrap();
*toggled_features = features;
Ok(())
}
fn runtime_features(&self) -> RuntimeTogglableFeatures {
// sound to unwrap, the lock will only fail if:
// 1. requested by the same thread concurrently -> it is called and released in methods that don't call each other
// 2. there's a panic while the thread is held -> it is only used for copying the data here
*self.runtime.read().unwrap()
fn runtime_features(&self, txn: RoTxn) -> Result<RuntimeTogglableFeatures> {
Ok(self.runtime.get(&txn, EXPERIMENTAL_FEATURES)?.unwrap_or_default())
}
pub fn features(&self) -> RoFeatures {
RoFeatures::new(self)
pub fn features(&self, txn: RoTxn) -> Result<RoFeatures> {
RoFeatures::new(txn, self)
}
}

View File

@@ -30,7 +30,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
index_mapper,
features: _,
max_number_of_tasks: _,
puffin_frame: _,
wake_up: _,
dumps_path: _,
snapshots_path: _,
@@ -39,7 +38,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
test_breakpoint_sdr: _,
planned_failures: _,
run_loop_iteration: _,
currently_updating_index: _,
} = scheduler;
let rtxn = env.read_txn().unwrap();

View File

@@ -27,13 +27,12 @@ mod index_mapper;
mod insta_snapshot;
mod lru;
mod utils;
pub mod uuid_codec;
mod uuid_codec;
pub type Result<T> = std::result::Result<T, Error>;
pub type TaskId = u32;
use std::collections::{BTreeMap, HashMap};
use std::fs::File;
use std::ops::{Bound, RangeBounds};
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicBool;
@@ -53,7 +52,6 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
use puffin::FrameView;
use roaring::RoaringBitmap;
use synchronoise::SignalEvent;
use time::format_description::well_known::Rfc3339;
@@ -316,9 +314,6 @@ pub struct IndexScheduler {
/// the finished tasks automatically.
pub(crate) max_number_of_tasks: usize,
/// A frame to output the indexation profiling files to disk.
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
/// The path used to create the dumps.
pub(crate) dumps_path: PathBuf,
@@ -331,10 +326,6 @@ pub struct IndexScheduler {
/// The path to the version file of Meilisearch.
pub(crate) version_file_path: PathBuf,
/// A few types of long running batches of tasks that act on a single index set this field
/// so that a handle to the index is available from other threads (search) in an optimized manner.
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
// ================= test
// The next entry is dedicated to the tests.
/// Provide a way to set a breakpoint in multiple part of the scheduler.
@@ -373,12 +364,10 @@ impl IndexScheduler {
wake_up: self.wake_up.clone(),
autobatching_enabled: self.autobatching_enabled,
max_number_of_tasks: self.max_number_of_tasks,
puffin_frame: self.puffin_frame.clone(),
snapshots_path: self.snapshots_path.clone(),
dumps_path: self.dumps_path.clone(),
auth_path: self.auth_path.clone(),
version_file_path: self.version_file_path.clone(),
currently_updating_index: self.currently_updating_index.clone(),
#[cfg(test)]
test_breakpoint_sdr: self.test_breakpoint_sdr.clone(),
#[cfg(test)]
@@ -468,14 +457,12 @@ impl IndexScheduler {
env,
// we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
wake_up: Arc::new(SignalEvent::auto(true)),
puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
autobatching_enabled: options.autobatching_enabled,
max_number_of_tasks: options.max_number_of_tasks,
dumps_path: options.dumps_path,
snapshots_path: options.snapshots_path,
auth_path: options.auth_path,
version_file_path: options.version_file_path,
currently_updating_index: Arc::new(RwLock::new(None)),
#[cfg(test)]
test_breakpoint_sdr,
@@ -585,46 +572,17 @@ impl IndexScheduler {
run.wake_up.wait();
loop {
let puffin_enabled = run.features().check_puffin().is_ok();
puffin::set_scopes_on(puffin_enabled);
puffin::GlobalProfiler::lock().new_frame();
match run.tick() {
Ok(TickOutcome::TickAgain(_)) => (),
Ok(TickOutcome::WaitForSignal) => run.wake_up.wait(),
Err(e) => {
log::error!("{e}");
log::error!("{}", e);
// Wait one second when an irrecoverable error occurs.
if !e.is_recoverable() {
std::thread::sleep(Duration::from_secs(1));
}
}
}
// Let's write the previous frame to disk but only if
// the user wanted to profile with puffin.
if puffin_enabled {
let mut frame_view = run.puffin_frame.lock();
if !frame_view.is_empty() {
let now = OffsetDateTime::now_utc();
let mut file = match File::create(format!("{}.puffin", now)) {
Ok(file) => file,
Err(e) => {
log::error!("{e}");
continue;
}
};
if let Err(e) = frame_view.save_to_writer(&mut file) {
log::error!("{e}");
}
if let Err(e) = file.sync_all() {
log::error!("{e}");
}
// We erase this frame view as it is no more useful. We want to
// measure the new frames now that we exported the previous ones.
*frame_view = FrameView::default();
}
}
}
})
.unwrap();
@@ -658,13 +616,6 @@ impl IndexScheduler {
/// If you need to fetch information from or perform an action on all indexes,
/// see the `try_for_each_index` function.
pub fn index(&self, name: &str) -> Result<Index> {
if let Some((current_name, current_index)) =
self.currently_updating_index.read().unwrap().as_ref()
{
if current_name == name {
return Ok(current_index.clone());
}
}
let rtxn = self.env.read_txn()?;
self.index_mapper.index(&rtxn, name)
}
@@ -1111,6 +1062,8 @@ impl IndexScheduler {
self.breakpoint(Breakpoint::Start);
}
puffin::GlobalProfiler::lock().new_frame();
self.cleanup_task_queue()?;
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
@@ -1146,9 +1099,6 @@ impl IndexScheduler {
handle.join().unwrap_or(Err(Error::ProcessBatchPanicked))
};
// Reset the currently updating index to relinquish the index handle
*self.currently_updating_index.write().unwrap() = None;
#[cfg(test)]
self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?;
@@ -1309,8 +1259,9 @@ impl IndexScheduler {
Ok(IndexStats { is_indexing, inner_stats: index_stats })
}
pub fn features(&self) -> RoFeatures {
self.features.features()
pub fn features(&self) -> Result<RoFeatures> {
let rtxn = self.read_txn()?;
self.features.features(rtxn)
}
pub fn put_runtime_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {

View File

@@ -50,7 +50,6 @@ hebrew = ["milli/hebrew"]
japanese = ["milli/japanese"]
# thai specialized tokenization
thai = ["milli/thai"]
# allow greek specialized tokenization
greek = ["milli/greek"]
# allow khmer specialized tokenization
khmer = ["milli/khmer"]

View File

@@ -5,8 +5,6 @@ use serde::{Deserialize, Serialize};
pub struct RuntimeTogglableFeatures {
pub score_details: bool,
pub vector_store: bool,
pub metrics: bool,
pub export_puffin_reports: bool,
}
#[derive(Default, Debug, Clone, Copy)]

View File

@@ -69,7 +69,8 @@ permissive-json-pointer = { path = "../permissive-json-pointer" }
pin-project-lite = "0.2.9"
platform-dirs = "0.3.0"
prometheus = { version = "0.13.3", features = ["process"] }
puffin = { version = "0.16.0", features = ["serialization"] }
puffin = "0.16.0"
puffin_http = { version = "0.13.0", optional = true }
rand = "0.8.5"
rayon = "1.7.0"
regex = "1.7.3"
@@ -134,6 +135,7 @@ zip = { version = "0.6.4", optional = true }
[features]
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
analytics = ["segment"]
profile-with-puffin = ["dep:puffin_http"]
mini-dashboard = [
"actix-web-static-files",
"static-files",
@@ -150,7 +152,6 @@ hebrew = ["meilisearch-types/hebrew"]
japanese = ["meilisearch-types/japanese"]
thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"

View File

@@ -114,7 +114,10 @@ pub fn create_app(
.configure(routes::configure)
.configure(|s| dashboard(s, enable_dashboard));
let app = app.wrap(middleware::RouteMetrics);
let app = app.wrap(actix_web::middleware::Condition::new(
opt.experimental_enable_metrics,
middleware::RouteMetrics,
));
app.wrap(
Cors::default()
.send_wildcard()

View File

@@ -30,6 +30,10 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
async fn main() -> anyhow::Result<()> {
let (opt, config_read_from) = Opt::try_build()?;
#[cfg(feature = "profile-with-puffin")]
let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));
anyhow::ensure!(
!(cfg!(windows) && opt.experimental_reduce_indexing_memory_usage),
"The `experimental-reduce-indexing-memory-usage` flag is not supported on Windows"

View File

@@ -3,10 +3,8 @@
use std::future::{ready, Ready};
use actix_web::dev::{self, Service, ServiceRequest, ServiceResponse, Transform};
use actix_web::web::Data;
use actix_web::Error;
use futures_util::future::LocalBoxFuture;
use index_scheduler::IndexScheduler;
use prometheus::HistogramTimer;
pub struct RouteMetrics;
@@ -49,27 +47,19 @@ where
fn call(&self, req: ServiceRequest) -> Self::Future {
let mut histogram_timer: Option<HistogramTimer> = None;
// calling unwrap here is safe because index scheduler is added to app data while creating actix app.
// also, the tests will fail if this is not present.
let index_scheduler = req.app_data::<Data<IndexScheduler>>().unwrap();
let features = index_scheduler.features();
if features.check_metrics().is_ok() {
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.start_timer(),
);
crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.inc();
}
};
.start_timer(),
);
crate::metrics::MEILISEARCH_HTTP_REQUESTS_TOTAL
.with_label_values(&[&request_method, request_path])
.inc();
}
let fut = self.service.call(req);

View File

@@ -29,12 +29,12 @@ async fn get_features(
>,
req: HttpRequest,
analytics: Data<dyn Analytics>,
) -> HttpResponse {
let features = index_scheduler.features();
) -> Result<HttpResponse, ResponseError> {
let features = index_scheduler.features()?;
analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
debug!("returns: {:?}", features.runtime_features());
HttpResponse::Ok().json(features.runtime_features())
Ok(HttpResponse::Ok().json(features.runtime_features()))
}
#[derive(Debug, Deserr)]
@@ -44,10 +44,6 @@ pub struct RuntimeTogglableFeatures {
pub score_details: Option<bool>,
#[deserr(default)]
pub vector_store: Option<bool>,
#[deserr(default)]
pub metrics: Option<bool>,
#[deserr(default)]
pub export_puffin_reports: Option<bool>,
}
async fn patch_features(
@@ -59,36 +55,26 @@ async fn patch_features(
req: HttpRequest,
analytics: Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> {
let features = index_scheduler.features();
let features = index_scheduler.features()?;
let old_features = features.runtime_features();
let new_features = meilisearch_types::features::RuntimeTogglableFeatures {
score_details: new_features.0.score_details.unwrap_or(old_features.score_details),
vector_store: new_features.0.vector_store.unwrap_or(old_features.vector_store),
metrics: new_features.0.metrics.unwrap_or(old_features.metrics),
export_puffin_reports: new_features
.0
.export_puffin_reports
.unwrap_or(old_features.export_puffin_reports),
};
// explicitly destructure for analytics rather than using the `Serialize` implementation, because
// the it renames to camelCase, which we don't want for analytics.
// **Do not** ignore fields with `..` or `_` here, because we want to add them in the future.
let meilisearch_types::features::RuntimeTogglableFeatures {
score_details,
vector_store,
metrics,
export_puffin_reports,
} = new_features;
let meilisearch_types::features::RuntimeTogglableFeatures { score_details, vector_store } =
new_features;
analytics.publish(
"Experimental features Updated".to_string(),
json!({
"score_details": score_details,
"vector_store": vector_store,
"metrics": metrics,
"export_puffin_reports": export_puffin_reports,
}),
Some(&req),
);

View File

@@ -68,7 +68,7 @@ pub async fn search(
}
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let features = index_scheduler.features()?;
let search_result = tokio::task::spawn_blocking(move || {
perform_facet_search(&index, search_query, facet_query, facet_name, features)
})

View File

@@ -157,7 +157,7 @@ pub async fn search_with_url_query(
let mut aggregate = SearchAggregator::from_query(&query, &req);
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let features = index_scheduler.features()?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {
@@ -192,7 +192,7 @@ pub async fn search_with_post(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let features = index_scheduler.features()?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
if let Ok(ref search_result) = search_result {

View File

@@ -19,7 +19,7 @@ pub async fn get_metrics(
index_scheduler: GuardedData<ActionPolicy<{ actions::METRICS_GET }>, Data<IndexScheduler>>,
auth_controller: Data<AuthController>,
) -> Result<HttpResponse, ResponseError> {
index_scheduler.features().check_metrics()?;
index_scheduler.features()?.check_metrics()?;
let auth_filters = index_scheduler.filters();
if !auth_filters.all_indexes_authorized() {
let mut error = ResponseError::from(AuthenticationError::InvalidToken);

View File

@@ -41,7 +41,7 @@ pub async fn multi_search_with_post(
let queries = params.into_inner().queries;
let mut multi_aggregate = MultiSearchAggregator::from_queries(&queries, &req);
let features = index_scheduler.features();
let features = index_scheduler.features()?;
// Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
// so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code

View File

@@ -2,12 +2,10 @@ use std::collections::{HashMap, HashSet};
use ::time::format_description::well_known::Rfc3339;
use maplit::{hashmap, hashset};
use meilisearch::Opt;
use once_cell::sync::Lazy;
use tempfile::TempDir;
use time::{Duration, OffsetDateTime};
use crate::common::{default_settings, Server, Value};
use crate::common::{Server, Value};
use crate::json;
pub static AUTHORIZATIONS: Lazy<HashMap<(&'static str, &'static str), HashSet<&'static str>>> =
@@ -197,9 +195,7 @@ async fn access_authorized_master_key() {
#[actix_rt::test]
async fn access_authorized_restricted_index() {
let dir = TempDir::new().unwrap();
let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
let mut server = Server::new_auth_with_options(enable_metrics, dir).await;
let mut server = Server::new_auth().await;
for ((method, route), actions) in AUTHORIZATIONS.iter() {
for action in actions {
// create a new API key letting only the needed action.

View File

@@ -5,11 +5,9 @@ pub mod service;
use std::fmt::{self, Display};
#[allow(unused)]
pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
use meili_snap::json_string;
use serde::{Deserialize, Serialize};
#[allow(unused)]
pub use server::{default_settings, Server};
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]

View File

@@ -202,10 +202,6 @@ impl Server {
pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
self.service.patch("/experimental-features", value).await
}
pub async fn get_metrics(&self) -> (Value, StatusCode) {
self.service.get("/metrics").await
}
}
pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
@@ -225,7 +221,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
skip_index_budget: true,
..Parser::parse_from(None as Option<&str>)
},
experimental_enable_metrics: false,
experimental_enable_metrics: true,
..Parser::parse_from(None as Option<&str>)
}
}

View File

@@ -1,7 +1,4 @@
use meilisearch::Opt;
use tempfile::TempDir;
use crate::common::{default_settings, Server};
use crate::common::Server;
use crate::json;
/// Feature name to test against.
@@ -19,9 +16,7 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": false,
"metrics": false,
"exportPuffinReports": false
"vectorStore": false
}
"###);
@@ -31,9 +26,7 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
"vectorStore": true
}
"###);
@@ -43,9 +36,7 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
"vectorStore": true
}
"###);
@@ -56,9 +47,7 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
"vectorStore": true
}
"###);
@@ -69,73 +58,11 @@ async fn experimental_features() {
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": true,
"metrics": false,
"exportPuffinReports": false
"vectorStore": true
}
"###);
}
#[actix_rt::test]
async fn experimental_feature_metrics() {
// instance flag for metrics enables metrics at startup
let dir = TempDir::new().unwrap();
let enable_metrics = Opt { experimental_enable_metrics: true, ..default_settings(dir.path()) };
let server = Server::new_with_options(enable_metrics).await.unwrap();
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"scoreDetails": false,
"vectorStore": false,
"metrics": true,
"exportPuffinReports": false
}
"###);
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
// metrics are not returned in json format
// so the test server will return null
meili_snap::snapshot!(response, @"null");
// disabling metrics results in invalid request
let (response, code) = server.set_features(json!({"metrics": false})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response["metrics"], @"false");
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Getting metrics requires enabling the `metrics` experimental feature. See https://github.com/meilisearch/product/discussions/625",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// enabling metrics via HTTP results in valid request
let (response, code) = server.set_features(json!({"metrics": true})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response["metrics"], @"true");
let (response, code) = server.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response, @"null");
// startup without flag respects persisted metrics value
let disable_metrics =
Opt { experimental_enable_metrics: false, ..default_settings(dir.path()) };
let server_no_flag = Server::new_with_options(disable_metrics).await.unwrap();
let (response, code) = server_no_flag.get_metrics().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(response, @"null");
}
#[actix_rt::test]
async fn errors() {
let server = Server::new().await;
@@ -146,7 +73,7 @@ async fn errors() {
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`, `metrics`, `exportPuffinReports`",
"message": "Unknown field `NotAFeature`: expected one of `scoreDetails`, `vectorStore`",
"code": "bad_request",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#bad_request"

View File

@@ -1,241 +0,0 @@
use meili_snap::snapshot;
use once_cell::sync::Lazy;
use crate::common::{Server, Value};
use crate::json;
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Brown"
},
{
"id": 2,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Black"
},
{
"id": 3,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": "Blue"
},
{
"id": 4,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Red"
},
{
"id": 5,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": "Blue"
},
{
"id": 6,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "Black"
},
{
"id": 7,
"description": "Running Shoes",
"brand": "Adidas",
"product_id": "456789",
"color": "White"
},
{
"id": 8,
"description": "Hoodie",
"brand": "Puma",
"product_id": "987654",
"color": "Gray"
},
{
"id": 9,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Green"
},
{
"id": 10,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Red"
},
{
"id": 11,
"description": "Sweater",
"brand": "Gap",
"product_id": "234567",
"color": "Blue"
},
{
"id": 12,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Indigo"
},
{
"id": 13,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Black"
},
{
"id": 14,
"description": "Jeans",
"brand": "Levi's",
"product_id": "345678",
"color": "Stone Wash"
}
])
});
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id";
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id";
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
#[actix_rt::test]
async fn distinct_search_with_offset_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["estimatedTotalHits"] , @"11");
let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["estimatedTotalHits"], @"10");
let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"1");
snapshot!(format!("{:?}", hits), @r#"["345678"]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["estimatedTotalHits"], @"6");
}
/// testing: https://github.com/meilisearch/meilisearch/issues/4130
#[actix_rt::test]
async fn distinct_search_with_pagination_no_ranking() {
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
index.wait_task(1).await;
fn get_hits(response: &Value) -> Vec<&str> {
let hits_array = response["hits"].as_array().unwrap();
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
}
let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"0");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
snapshot!(response["page"], @"1");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"2");
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
snapshot!(response["page"], @"3");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"0");
snapshot!(format!("{:?}", hits), @r#"[]"#);
snapshot!(response["page"], @"4");
snapshot!(response["totalPages"], @"3");
snapshot!(response["totalHits"], @"6");
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"3");
snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#);
snapshot!(response["page"], @"2");
snapshot!(response["totalPages"], @"2");
snapshot!(response["totalHits"], @"6");
}

View File

@@ -1,7 +1,6 @@
// This modules contains all the test concerning search. Each particular feature of the search
// should be tested in its own module to isolate tests and keep the tests readable.
mod distinct;
mod errors;
mod facet_search;
mod formatted;
@@ -817,7 +816,7 @@ async fn experimental_feature_score_details() {
},
"proximity": {
"order": 2,
"score": 0.75
"score": 0.875
},
"attribute": {
"order": 3,

View File

@@ -1,19 +0,0 @@
[package]
name = "meilitool"
description = "A CLI to edit a Meilisearch database from the command line"
version.workspace = true
authors.workspace = true
homepage.workspace = true
readme.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
anyhow = "1.0.75"
clap = { version = "4.2.1", features = ["derive"] }
dump = { path = "../dump" }
file-store = { path = "../file-store" }
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
time = { version = "0.3.30", features = ["formatting"] }
uuid = { version = "1.5.0", features = ["v4"], default-features = false }

View File

@@ -1,312 +0,0 @@
use std::fs::{read_dir, read_to_string, remove_file, File};
use std::io::BufWriter;
use std::path::PathBuf;
use anyhow::Context;
use clap::{Parser, Subcommand};
use dump::{DumpWriter, IndexMetadata};
use file_store::FileStore;
use meilisearch_auth::AuthController;
use meilisearch_types::heed::types::{OwnedType, SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, PolyDatabase, RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
use meilisearch_types::milli::{obkv_to_json, BEU32};
use meilisearch_types::tasks::{Status, Task};
use meilisearch_types::versioning::check_version_file;
use meilisearch_types::Index;
use time::macros::format_description;
use time::OffsetDateTime;
use uuid_codec::UuidCodec;
mod uuid_codec;
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
/// The database path where the Meilisearch is running.
#[arg(long, default_value = "data.ms/")]
db_path: PathBuf,
#[command(subcommand)]
command: Command,
}
#[derive(Subcommand)]
enum Command {
/// Clears the task queue and make it empty.
///
/// This command can be safely executed even if Meilisearch is running and processing tasks.
/// Once the task queue is empty you can restart Meilisearch and no more tasks must be visible,
/// even the ones that were processing. However, it's highly possible that you see the processing
/// tasks in the queue again with an associated internal error message.
ClearTaskQueue,
/// Exports a dump from the Meilisearch database.
///
/// Make sure to run this command when Meilisearch is not running or running but not processing tasks.
/// If tasks are being processed while a dump is being exported there are chances for the dump to be
/// malformed with missing tasks.
///
/// TODO Verify this claim or make sure it cannot happen and we can export dumps
/// without caring about killing Meilisearch first!
ExportADump {
/// The directory in which the dump will be created.
#[arg(long, default_value = "dumps/")]
dump_dir: PathBuf,
/// Skip dumping the enqueued or processing tasks.
///
/// Can be useful when there are a lot of them and it is not particularly useful
/// to keep them. Note that only the enqueued tasks takes up space so skipping
/// the processed ones is not particularly interesting.
#[arg(long)]
skip_enqueued_tasks: bool,
},
}
fn main() -> anyhow::Result<()> {
let Cli { db_path, command } = Cli::parse();
check_version_file(&db_path).context("While checking the version file")?;
match command {
Command::ClearTaskQueue => clear_task_queue(db_path),
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
}
}
}
/// Clears the task queue located at `db_path`.
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
let path = db_path.join("tasks");
let env = EnvOpenOptions::new()
.max_dbs(100)
.open(&path)
.with_context(|| format!("While trying to open {:?}", path.display()))?;
eprintln!("Deleting tasks from the database...");
let mut wtxn = env.write_txn()?;
let all_tasks = try_opening_poly_database(&env, &wtxn, "all-tasks")?;
let total = all_tasks.len(&wtxn)?;
let status = try_opening_poly_database(&env, &wtxn, "status")?;
let kind = try_opening_poly_database(&env, &wtxn, "kind")?;
let index_tasks = try_opening_poly_database(&env, &wtxn, "index-tasks")?;
let canceled_by = try_opening_poly_database(&env, &wtxn, "canceled_by")?;
let enqueued_at = try_opening_poly_database(&env, &wtxn, "enqueued-at")?;
let started_at = try_opening_poly_database(&env, &wtxn, "started-at")?;
let finished_at = try_opening_poly_database(&env, &wtxn, "finished-at")?;
try_clearing_poly_database(&mut wtxn, all_tasks, "all-tasks")?;
try_clearing_poly_database(&mut wtxn, status, "status")?;
try_clearing_poly_database(&mut wtxn, kind, "kind")?;
try_clearing_poly_database(&mut wtxn, index_tasks, "index-tasks")?;
try_clearing_poly_database(&mut wtxn, canceled_by, "canceled_by")?;
try_clearing_poly_database(&mut wtxn, enqueued_at, "enqueued-at")?;
try_clearing_poly_database(&mut wtxn, started_at, "started-at")?;
try_clearing_poly_database(&mut wtxn, finished_at, "finished-at")?;
wtxn.commit().context("While committing the transaction")?;
eprintln!("Successfully deleted {total} tasks from the tasks database!");
eprintln!("Deleting the content files from disk...");
let mut count = 0usize;
let update_files = db_path.join("update_files");
let entries = read_dir(&update_files).with_context(|| {
format!("While trying to read the content of {:?}", update_files.display())
})?;
for result in entries {
match result {
Ok(ent) => match remove_file(ent.path()) {
Ok(_) => count += 1,
Err(e) => eprintln!("Error while deleting {:?}: {}", ent.path().display(), e),
},
Err(e) => {
eprintln!("Error while reading a file in {:?}: {}", update_files.display(), e)
}
}
}
eprintln!("Sucessfully deleted {count} content files from disk!");
Ok(())
}
fn try_opening_database<KC: 'static, DC: 'static>(
env: &Env,
rtxn: &RoTxn,
db_name: &str,
) -> anyhow::Result<Database<KC, DC>> {
env.open_database(rtxn, Some(db_name))
.with_context(|| format!("While opening the {db_name:?} database"))?
.with_context(|| format!("Missing the {db_name:?} database"))
}
fn try_opening_poly_database(
env: &Env,
rtxn: &RoTxn,
db_name: &str,
) -> anyhow::Result<PolyDatabase> {
env.open_poly_database(rtxn, Some(db_name))
.with_context(|| format!("While opening the {db_name:?} poly database"))?
.with_context(|| format!("Missing the {db_name:?} poly database"))
}
fn try_clearing_poly_database(
wtxn: &mut RwTxn,
database: PolyDatabase,
db_name: &str,
) -> anyhow::Result<()> {
database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
}
/// Exports a dump into the dump directory.
fn export_a_dump(
db_path: PathBuf,
dump_dir: PathBuf,
skip_enqueued_tasks: bool,
) -> Result<(), anyhow::Error> {
let started_at = OffsetDateTime::now_utc();
// 1. Extracts the instance UID from disk
let instance_uid_path = db_path.join("instance-uid");
let instance_uid = match read_to_string(&instance_uid_path) {
Ok(content) => match content.trim().parse() {
Ok(uuid) => Some(uuid),
Err(e) => {
eprintln!("Impossible to parse instance-uid: {e}");
None
}
},
Err(e) => {
eprintln!("Impossible to read {}: {}", instance_uid_path.display(), e);
None
}
};
let dump = DumpWriter::new(instance_uid).context("While creating a new dump")?;
let file_store =
FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
let index_scheduler_path = db_path.join("tasks");
let env = EnvOpenOptions::new()
.max_dbs(100)
.open(&index_scheduler_path)
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
eprintln!("Dumping the keys...");
// 2. dump the keys
let auth_store = AuthController::new(&db_path, &None)
.with_context(|| format!("While opening the auth store at {}", db_path.display()))?;
let mut dump_keys = dump.create_keys()?;
let mut count = 0;
for key in auth_store.list_keys()? {
dump_keys.push_key(&key)?;
count += 1;
}
dump_keys.flush()?;
eprintln!("Successfully dumped {count} keys!");
let rtxn = env.read_txn()?;
let all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>> =
try_opening_database(&env, &rtxn, "all-tasks")?;
let index_mapping: Database<Str, UuidCodec> =
try_opening_database(&env, &rtxn, "index-mapping")?;
if skip_enqueued_tasks {
eprintln!("Skip dumping the enqueued tasks...");
} else {
eprintln!("Dumping the enqueued tasks...");
// 3. dump the tasks
let mut dump_tasks = dump.create_tasks_queue()?;
let mut count = 0;
for ret in all_tasks.iter(&rtxn)? {
let (_, t) = ret?;
let status = t.status;
let content_file = t.content_uuid();
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
if let Some(content_file_uuid) = content_file {
if status == Status::Enqueued {
let content_file = file_store.get_update(content_file_uuid)?;
let reader =
DocumentsBatchReader::from_reader(content_file).with_context(|| {
format!("While reading content file {:?}", content_file_uuid)
})?;
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
while let Some(doc) = cursor.next_document().with_context(|| {
format!("While iterating on content file {:?}", content_file_uuid)
})? {
dump_content_file
.push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
}
dump_content_file.flush()?;
count += 1;
}
}
}
dump_tasks.flush()?;
eprintln!("Successfully dumped {count} enqueued tasks!");
}
eprintln!("Dumping the indexes...");
// 4. Dump the indexes
let mut count = 0;
for result in index_mapping.iter(&rtxn)? {
let (uid, uuid) = result?;
let index_path = db_path.join("indexes").join(uuid.to_string());
let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| {
format!("While trying to open the index at path {:?}", index_path.display())
})?;
let rtxn = index.read_txn()?;
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
let mut index_dumper = dump.create_index(uid, &metadata)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
// 4.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
let (_id, doc) = ret?;
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}
// 4.2. Dump the settings
let settings = meilisearch_types::settings::settings(&index, &rtxn)?;
index_dumper.settings(&settings)?;
count += 1;
}
eprintln!("Successfully dumped {count} indexes!");
// We will not dump experimental feature settings
eprintln!("The tool is not dumping experimental features, please set them by hand afterward");
let dump_uid = started_at.format(format_description!(
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
)).unwrap();
let path = dump_dir.join(format!("{}.dump", dump_uid));
let file = File::create(&path)?;
dump.persist_to(BufWriter::new(file))?;
eprintln!("Dump exported at path {:?}", path.display());
Ok(())
}

View File

@@ -1,24 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use meilisearch_types::heed::{BytesDecode, BytesEncode};
use uuid::Uuid;
/// A heed codec for value of struct Uuid.
pub struct UuidCodec;
impl<'a> BytesDecode<'a> for UuidCodec {
type DItem = Uuid;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
bytes.try_into().ok().map(Uuid::from_bytes)
}
}
impl BytesEncode<'_> for UuidCodec {
type EItem = Uuid;
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
Some(Cow::Borrowed(item.as_bytes()))
}
}

View File

@@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.4.0"
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
byteorder = "1.4.3"
charabia = { version = "0.8.5", default-features = false }
charabia = { version = "0.8.3", default-features = false }
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.8"
deserr = { version = "0.6.0", features = ["actix-web"]}
@@ -79,10 +79,11 @@ big_s = "1.0.2"
insta = "1.29.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
[features]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek"]
# Use POSIX semaphores instead of SysV semaphores in LMDB
# For more information on this feature, see heed's Cargo.toml
@@ -106,6 +107,3 @@ thai = ["charabia/thai"]
# allow greek specialized tokenization
greek = ["charabia/greek"]
# allow khmer specialized tokenization
khmer = ["charabia/khmer"]

View File

@@ -1,5 +1,4 @@
use std::fs::File;
use std::io::BufReader;
use std::{io, str};
use obkv::KvReader;
@@ -20,14 +19,14 @@ use crate::FieldId;
pub struct EnrichedDocumentsBatchReader<R> {
documents: DocumentsBatchReader<R>,
primary_key: String,
external_ids: grenad::ReaderCursor<BufReader<File>>,
external_ids: grenad::ReaderCursor<File>,
}
impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
pub fn new(
documents: DocumentsBatchReader<R>,
primary_key: String,
external_ids: grenad::Reader<BufReader<File>>,
external_ids: grenad::Reader<File>,
) -> Result<Self, Error> {
if documents.documents_count() as u64 == external_ids.len() {
Ok(EnrichedDocumentsBatchReader {
@@ -76,7 +75,7 @@ pub struct EnrichedDocument<'a> {
pub struct EnrichedDocumentsBatchCursor<R> {
documents: DocumentsBatchCursor<R>,
primary_key: String,
external_ids: grenad::ReaderCursor<BufReader<File>>,
external_ids: grenad::ReaderCursor<File>,
}
impl<R> EnrichedDocumentsBatchCursor<R> {

View File

@@ -60,12 +60,16 @@ impl CboRoaringBitmapCodec {
/// if the merged values length is under the threshold, values are directly
/// serialized in the buffer else a RoaringBitmap is created from the
/// values and is serialized in the buffer.
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
where
I: IntoIterator<Item = A>,
A: AsRef<[u8]>,
{
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();
for bytes in slices {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
@@ -85,7 +89,7 @@ impl CboRoaringBitmapCodec {
}
} else {
// We can unwrap safely because the vector is sorted upper.
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
roaring.serialize_into(buffer)?;
}
} else {

View File

@@ -119,16 +119,16 @@ pub struct Index {
pub(crate) main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,

View File

@@ -2,7 +2,7 @@ use std::cmp;
use crate::{relative_from_absolute_position, Position};
pub const MAX_DISTANCE: u32 = 4;
pub const MAX_DISTANCE: u32 = 8;
pub fn index_proximity(lhs: u32, rhs: u32) -> u32 {
if lhs <= rhs {

View File

@@ -3,7 +3,7 @@ use std::fmt::{Debug, Display};
use std::ops::Bound::{self, Excluded, Included};
use either::Either;
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token};
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token};
use roaring::RoaringBitmap;
use serde_json::Value;

View File

@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
use self::new::PartialSearchResult;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};

View File

@@ -53,22 +53,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
if excluded.contains(docid) {
continue;
}
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
results.push(docid);
}
let mut all_candidates = universe - excluded;
all_candidates.extend(results.iter().copied());
// drain the results of the skipped elements
// this **must** be done **after** writing the entire results in `all_candidates` to ensure
// e.g. estimatedTotalHits is correct.
if results.len() >= from {
results.drain(..from);
} else {
results.clear();
}
return Ok(BucketSortOutput {
scores: vec![Default::default(); results.len()],
docids: results,

View File

@@ -11,9 +11,7 @@ use super::interner::Interned;
use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
};
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
/// A cache storing pointers to values in the LMDB databases.
///
@@ -168,7 +166,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -182,7 +180,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
word: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
word,
self.word_interner.get(word).as_str(),
@@ -230,7 +228,7 @@ impl<'ctx> SearchContext<'ctx> {
merge_cbo_roaring_bitmaps,
)
}
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
@@ -244,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
&mut self,
prefix: Interned<String>,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),

View File

@@ -434,18 +434,7 @@ pub fn execute_search(
let mut search = Search::default();
let docids = match ctx.index.vector_hnsw(ctx.txn)? {
Some(hnsw) => {
if let Some(expected_size) = hnsw.iter().map(|(_, point)| point.len()).next() {
if vector.len() != expected_size {
return Err(UserError::InvalidVectorDimensions {
expected: expected_size,
found: vector.len(),
}
.into());
}
}
let vector = NDotProductPoint::new(vector.clone());
let neighbors = hnsw.search(&vector, &mut search);
let mut docids = Vec::new();

View File

@@ -29,7 +29,7 @@ use std::hash::Hash;
pub use cheapest_paths::PathVisitor;
pub use condition_docids_cache::ConditionDocIdsCache;
pub use dead_ends_cache::DeadEndsCache;
pub use exactness::ExactnessGraph;
pub use exactness::{ExactnessCondition, ExactnessGraph};
pub use fid::{FidCondition, FidGraph};
pub use position::{PositionCondition, PositionGraph};
pub use proximity::{ProximityCondition, ProximityGraph};

View File

@@ -1,7 +1,6 @@
#![allow(clippy::too_many_arguments)]
use super::ProximityCondition;
use crate::proximity::MAX_DISTANCE;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
@@ -36,7 +35,7 @@ pub fn build_edges(
}
let mut conditions = vec![];
for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
for cost in right_ngram_max..(7 + right_ngram_max) {
conditions.push((
cost as u32,
conditions_interner.insert(ProximityCondition::Uninit {
@@ -48,7 +47,7 @@ pub fn build_edges(
}
conditions.push((
((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
(7 + right_ngram_max) as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
));

View File

@@ -273,7 +273,7 @@ fn test_proximity_simple() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 5, 2, 3, 0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
@@ -282,11 +282,11 @@ fn test_proximity_simple() {
"\"the quickbrown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the lazy dog\"",
"\"the really quick brown fox jumps over the very lazy dog\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy. dog\"",
"\"dog the quick brown fox jumps over the lazy\"",
"\"brown quick fox jumps over the lazy dog\"",
"\"the. quick brown fox jumps over the lazy. dog\"",
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
"\"the. quick brown fox jumps over the lazy. dog\"",
]
"###);
}
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best s");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@@ -382,9 +382,9 @@ fn test_proximity_prefix_db() {
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal of the summer\"",
"\"summer x y best\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
@@ -396,7 +396,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best su");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@@ -406,10 +406,10 @@ fn test_proximity_prefix_db() {
"\"summer best\"",
"\"this is the best meal of summer\"",
"\"summer x best\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal of the summer\"",
"\"summer x y best\"",
"\"this is the best cooked meal of the summer\"",
"\"this is the best meal I have ever had in such a beautiful summer day\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
"###);
@@ -447,7 +447,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best wint");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@@ -457,10 +457,10 @@ fn test_proximity_prefix_db() {
"\"winter best\"",
"\"this is the best meal of winter\"",
"\"winter x best\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal of the winter\"",
"\"winter x y best\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
]
"###);
@@ -471,7 +471,7 @@ fn test_proximity_prefix_db() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
s.query("best wi");
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@@ -481,9 +481,9 @@ fn test_proximity_prefix_db() {
"\"winter best\"",
"\"this is the best meal of winter\"",
"\"winter x best\"",
"\"this is the best meal of the winter\"",
"\"this is the best meal I have ever had in such a beautiful winter day\"",
"\"this is the best cooked meal of the winter\"",
"\"this is the best meal of the winter\"",
"\"winter x y best\"",
]
"###);

View File

@@ -68,8 +68,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
Typo(
@@ -82,8 +82,8 @@ fn test_trap_basic() {
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
Typo(

View File

@@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],

View File

@@ -23,8 +23,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -49,8 +49,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -75,8 +75,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 25,
rank: 35,
max_rank: 57,
},
),
],
@@ -101,8 +101,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -127,8 +127,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -153,8 +153,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -179,8 +179,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 9,
max_rank: 10,
rank: 21,
max_rank: 22,
},
),
],
@@ -205,8 +205,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 10,
rank: 17,
max_rank: 22,
},
),
],
@@ -231,8 +231,8 @@ expression: "format!(\"{document_ids_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 10,
rank: 17,
max_rank: 22,
},
),
],

View File

@@ -3,35 +3,59 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,31 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -6,32 +6,40 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
@@ -39,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,15 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -6,32 +6,40 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
@@ -39,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,7 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -63,15 +71,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -3,35 +3,59 @@ source: milli/src/search/new/tests/proximity.rs
expression: "format!(\"{document_scores:#?}\")"
---
[
[
Proximity(
Rank {
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 7,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 6,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 5,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 3,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 2,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,7 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,31 +71,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
},
),
],
[
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -7,7 +7,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -15,7 +15,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -31,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -47,7 +47,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -55,7 +55,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -63,7 +63,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -6,24 +6,24 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -31,7 +31,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],
@@ -39,7 +39,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -6,16 +6,16 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
[
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -23,7 +23,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 4,
max_rank: 8,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 1,
max_rank: 4,
rank: 5,
max_rank: 8,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 7,
rank: 8,
max_rank: 15,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 50,
max_rank: 50,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 22,
rank: 50,
max_rank: 50,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
rank: 49,
max_rank: 50,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 22,
rank: 49,
max_rank: 50,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 22,
rank: 48,
max_rank: 50,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 22,
rank: 41,
max_rank: 50,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 22,
rank: 40,
max_rank: 50,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 16,
rank: 36,
max_rank: 36,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 16,
rank: 31,
max_rank: 36,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -166,8 +166,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -180,8 +180,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -194,8 +194,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],
@@ -208,8 +208,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 4,
max_rank: 4,
rank: 8,
max_rank: 8,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
rank: 43,
max_rank: 43,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 19,
rank: 42,
max_rank: 43,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 19,
rank: 42,
max_rank: 43,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 17,
max_rank: 19,
rank: 41,
max_rank: 43,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 14,
max_rank: 19,
rank: 34,
max_rank: 43,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 19,
rank: 33,
max_rank: 43,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 16,
max_rank: 16,
rank: 36,
max_rank: 36,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 13,
max_rank: 13,
rank: 29,
max_rank: 29,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 13,
rank: 24,
max_rank: 29,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 51,
max_rank: 57,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 48,
max_rank: 57,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],
@@ -178,6 +178,62 @@ expression: "format!(\"{document_scores:#?}\")"
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 50,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 43,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 38,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 29,
max_rank: 29,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 22,
@@ -188,42 +244,14 @@ expression: "format!(\"{document_scores:#?}\")"
[
Words(
Words {
matching_words: 7,
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 19,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 13,
max_rank: 13,
rank: 22,
max_rank: 22,
},
),
],
@@ -236,36 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 10,
rank: 22,
max_rank: 22,
},
),
],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 7,
rank: 15,
max_rank: 15,
},
),
],

View File

@@ -12,8 +12,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
@@ -26,8 +26,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
@@ -40,8 +40,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
@@ -54,8 +54,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -68,8 +68,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -82,8 +82,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
@@ -96,8 +96,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -110,8 +110,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
@@ -124,8 +124,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
@@ -138,8 +138,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -152,8 +152,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 18,
max_rank: 25,
rank: 45,
max_rank: 57,
},
),
],
@@ -167,7 +167,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],
@@ -178,6 +178,62 @@ expression: "format!(\"{document_scores:#?}\")"
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 47,
max_rank: 50,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 40,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 35,
max_rank: 43,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 26,
max_rank: 29,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 19,
@@ -188,42 +244,14 @@ expression: "format!(\"{document_scores:#?}\")"
[
Words(
Words {
matching_words: 7,
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 16,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 7,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 13,
max_rank: 19,
},
),
],
[
Words(
Words {
matching_words: 5,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 10,
max_rank: 13,
rank: 19,
max_rank: 22,
},
),
],
@@ -236,36 +264,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
},
),
],
[
Words(
Words {
matching_words: 4,
max_matching_words: 9,
},
),
Proximity(
Rank {
rank: 7,
max_rank: 10,
rank: 19,
max_rank: 22,
},
),
],
@@ -278,8 +278,8 @@ expression: "format!(\"{document_scores:#?}\")"
),
Proximity(
Rank {
rank: 5,
max_rank: 7,
rank: 13,
max_rank: 15,
},
),
],

View File

@@ -6,88 +6,88 @@ expression: "format!(\"{document_scores:#?}\")"
[
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 25,
max_rank: 25,
rank: 57,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 24,
max_rank: 25,
rank: 56,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 23,
max_rank: 25,
rank: 55,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 22,
max_rank: 25,
rank: 54,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 21,
max_rank: 25,
rank: 53,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 52,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 20,
max_rank: 25,
rank: 51,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 48,
max_rank: 57,
},
),
],
[
Proximity(
Rank {
rank: 19,
max_rank: 25,
rank: 47,
max_rank: 57,
},
),
],
@@ -95,7 +95,7 @@ expression: "format!(\"{document_scores:#?}\")"
Proximity(
Rank {
rank: 1,
max_rank: 25,
max_rank: 57,
},
),
],

View File

@@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
use big_s::S;
use maplit::hashset;
use meili_snap::insta;
use crate::index::tests::TempIndex;
use crate::search::new::tests::collect_field_values;

View File

@@ -259,8 +259,8 @@ fn test_ignore_stop_words() {
),
Proximity(
Rank {
rank: 3,
max_rank: 4,
rank: 7,
max_rank: 8,
},
),
Fid(
@@ -411,8 +411,8 @@ fn test_stop_words_in_phrase() {
),
Proximity(
Rank {
rank: 2,
max_rank: 4,
rank: 6,
max_rank: 8,
},
),
Fid(

View File

@@ -277,7 +277,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
// 7 is better than 6 because of the proximity between "the" and its surrounding terms
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -289,10 +289,10 @@ fn test_words_proximity_tms_last_simple() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@@ -312,7 +312,7 @@ fn test_words_proximity_tms_last_simple() {
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
// 10 is better than 9 because of the proximity between "quick" and "brown"
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 15, 16, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -326,8 +326,8 @@ fn test_words_proximity_tms_last_simple() {
"\"the great quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the really lazy dog\"",
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over the\"",
@@ -427,7 +427,7 @@ fn test_words_tms_all() {
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 16, 19, 15, 20, 22]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
insta::assert_snapshot!(format!("{document_scores:#?}"));
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
@@ -439,10 +439,10 @@ fn test_words_tms_all() {
"\"the mighty and quick brown fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the lazy dog\"",
"\"the brown quick fox jumps over the really lazy dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the brown quick fox immediately jumps over the really lazy blue dog\"",
"\"this quick brown and scary fox jumps over the lazy dog\"",
"\"this quick brown and very scary fox jumps over the lazy dog\"",
"\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
]
"###);

104
milli/src/update/del_add.rs Normal file
View File

@@ -0,0 +1,104 @@
use obkv::Key;
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
/// DelAdd defines the new value to add in the database and old value to delete from the database.
///
/// Its used in an OBKV to be serialized in grenad files.
#[repr(u8)]
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
pub enum DelAdd {
Deletion = 0,
Addition = 1,
}
impl Key for DelAdd {
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
type BYTES = [u8; Self::BYTES_SIZE];
fn to_be_bytes(&self) -> Self::BYTES {
u8::to_be_bytes(*self as u8)
}
fn from_be_bytes(array: Self::BYTES) -> Self {
match u8::from_be_bytes(array) {
0 => Self::Deletion,
1 => Self::Addition,
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
}
}
}
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
///
/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
/// if both deletion and addition are `true, the value will be inserted in both keys.
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
reader: obkv::KvReader<K>,
deletion: bool,
addition: bool,
buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for (key, value) in reader.iter() {
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
if deletion {
value_writer.insert(DelAdd::Deletion, value)?;
}
if addition {
value_writer.insert(DelAdd::Addition, value)?;
}
value_writer.finish()?;
writer.insert(key, &value_buffer)?;
}
writer.finish()
}
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
///
/// putting each deletion obkv's keys under an DelAdd::Deletion
/// and putting each addition obkv's keys under an DelAdd::Addition
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
deletion: obkv::KvReader<K>,
addition: obkv::KvReader<K>,
buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
value_buffer.clear();
match eob {
Left((k, v)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Deletion, v).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
Right((k, v)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Addition, v).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
Both((k, deletion), (_, addition)) => {
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
value_writer.insert(DelAdd::Addition, addition).unwrap();
writer.insert(k, value_writer.into_inner()?).unwrap();
}
}
}
writer.finish()
}
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
}

View File

@@ -16,9 +16,7 @@ use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::Hnsw;
use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
};
use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -108,17 +106,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
self.delete_document(docid);
Some(docid)
}
pub fn execute(self) -> Result<DocumentDeletionResult> {
puffin::profile_function!();
let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
self.execute_inner()?;
Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
}
pub(crate) fn execute_inner(mut self) -> Result<DetailedDocumentDeletionResult> {
puffin::profile_function!();
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
// We retrieve the current documents ids that are in the database.
@@ -478,8 +474,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
+ for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
{
puffin::profile_function!();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@@ -499,11 +493,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
fn remove_from_word_prefix_docids(
txn: &mut heed::RwTxn,
db: &Database<Str, RoaringBitmapCodec>,
db: &Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
) -> Result<fst::Set<Vec<u8>>> {
puffin::profile_function!();
let mut prefixes_to_delete = fst::SetBuilder::memory();
// We iterate over the word prefix docids database and remove the deleted documents ids
@@ -529,13 +521,11 @@ fn remove_from_word_prefix_docids(
fn remove_from_word_docids(
txn: &mut heed::RwTxn,
db: &heed::Database<Str, RoaringBitmapCodec>,
db: &heed::Database<Str, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,
words_to_keep: &mut BTreeSet<String>,
words_to_remove: &mut BTreeSet<String>,
) -> Result<()> {
puffin::profile_function!();
// We create an iterator to be able to get the content and delete the word docids.
// It's faster to acquire a cursor to get and delete or put, as we avoid traversing
// the LMDB B-Tree two times but only once.
@@ -567,8 +557,6 @@ fn remove_docids_from_field_id_docid_facet_value(
field_id: FieldId,
to_remove: &RoaringBitmap,
) -> heed::Result<HashSet<Vec<u8>>> {
puffin::profile_function!();
let db = match facet_type {
FacetType::String => {
index.field_id_docid_facet_strings.remap_types::<ByteSlice, DecodeIgnore>()
@@ -604,8 +592,6 @@ fn remove_docids_from_facet_id_docids<'a, C>(
where
C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
{
puffin::profile_function!();
let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;

View File

@@ -1,6 +1,5 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::BufReader;
use grenad::CompressionType;
use heed::types::ByteSlice;
@@ -31,7 +30,7 @@ pub struct FacetsUpdateBulk<'i> {
facet_type: FacetType,
field_ids: Vec<FieldId>,
// None if level 0 does not need to be updated
new_data: Option<grenad::Reader<BufReader<File>>>,
new_data: Option<grenad::Reader<File>>,
}
impl<'i> FacetsUpdateBulk<'i> {
@@ -39,7 +38,7 @@ impl<'i> FacetsUpdateBulk<'i> {
index: &'i Index,
field_ids: Vec<FieldId>,
facet_type: FacetType,
new_data: grenad::Reader<BufReader<File>>,
new_data: grenad::Reader<File>,
group_size: u8,
min_level_size: u8,
) -> FacetsUpdateBulk<'i> {
@@ -133,6 +132,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
self.db.delete_range(wtxn, &range).map(drop)?;
Ok(())
}
// TODO the new_data is an Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
let new_data = match self.new_data.take() {
Some(x) => x,
@@ -188,7 +189,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
&self,
field_id: FieldId,
txn: &RoTxn,
) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
let mut all_docids = RoaringBitmap::new();
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
for bitmap in bitmaps {
@@ -260,7 +261,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
field_id: u16,
level: u8,
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
) -> Result<Vec<grenad::Reader<File>>> {
if level == 0 {
self.read_level_0(rtxn, field_id, handle_group)?;
// Level 0 is already in the database

View File

@@ -1,6 +1,5 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, Error, RoTxn, RwTxn};
@@ -35,14 +34,14 @@ pub struct FacetsUpdateIncremental<'i> {
index: &'i Index,
inner: FacetsUpdateIncrementalInner,
facet_type: FacetType,
new_data: grenad::Reader<BufReader<File>>,
new_data: grenad::Reader<File>,
}
impl<'i> FacetsUpdateIncremental<'i> {
pub fn new(
index: &'i Index,
facet_type: FacetType,
new_data: grenad::Reader<BufReader<File>>,
new_data: grenad::Reader<File>,
group_size: u8,
min_level_size: u8,
max_group_size: u8,

View File

@@ -78,7 +78,6 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;
use charabia::normalizer::{Normalize, NormalizerOption};
@@ -109,17 +108,14 @@ pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
facet_type: FacetType,
new_data: grenad::Reader<BufReader<File>>,
new_data: grenad::Reader<File>,
group_size: u8,
max_group_size: u8,
min_level_size: u8,
}
impl<'i> FacetsUpdate<'i> {
pub fn new(
index: &'i Index,
facet_type: FacetType,
new_data: grenad::Reader<BufReader<File>>,
) -> Self {
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
let database = match facet_type {
FacetType::String => index
.facet_id_string_docids

View File

@@ -1,4 +1,4 @@
use std::io::{BufWriter, Read, Seek};
use std::io::{Read, Seek};
use std::result::Result as StdResult;
use std::{fmt, iter};
@@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
// The primary key *field id* that has already been set for this index or the one

View File

@@ -1,22 +1,19 @@
use std::collections::{HashMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;
use std::{io, mem, str};
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use obkv::KvReader;
use obkv::{KvReader, KvWriterU16};
use roaring::RoaringBitmap;
use serde_json::Value;
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::update::index_documents::MergeFn;
use crate::{
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
};
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
/// Extracts the word and positions where this word appear and
/// prefixes it by the document id.
@@ -32,25 +29,160 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>,
) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();
let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
// initialize destination values.
let mut documents_ids = RoaringBitmap::new();
let mut script_language_docids = HashMap::new();
let mut docid_word_positions_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
concat_u32s_array,
keep_latest_obkv,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut buffers = Buffers::default();
// initialize buffers.
let mut del_buffers = Buffers::default();
let mut add_buffers = Buffers::default();
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
// initialize tokenizer.
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
let tokenizer = builder.build();
// iterate over documents.
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document.
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
continue;
}
documents_ids.push(document_id);
// Update key buffer prefix.
key_buffer.clear();
key_buffer.extend_from_slice(&document_id.to_be_bytes());
// Tokenize deletions and additions in 2 diffferent threads.
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
// deletions
lang_safe_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes,
DelAdd::Deletion,
&mut del_buffers,
)
},
|| {
// additions
lang_safe_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes,
DelAdd::Addition,
&mut add_buffers,
)
},
);
let (del_obkv, del_script_language_word_count) = del?;
let (add_obkv, add_script_language_word_count) = add?;
// merge deletions and additions.
value_buffer.clear();
del_add_from_two_obkvs(
KvReader::<FieldId>::new(del_obkv),
KvReader::<FieldId>::new(add_obkv),
&mut value_buffer,
)?;
// write them into the sorter.
let obkv = KvReader::<FieldId>::new(value);
for (field_id, value) in obkv.iter() {
key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(&field_id.to_be_bytes());
docid_word_positions_sorter.insert(&key_buffer, value)?;
}
// update script_language_docids deletions.
for (script, languages_frequency) in del_script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
entry.0.push(document_id);
}
}
// update script_language_docids additions.
for (script, languages_frequency) in add_script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
entry.1.push(document_id);
}
}
}
sorter_into_reader(docid_word_positions_sorter, indexer)
.map(|reader| (documents_ids, reader, script_language_docids))
}
/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
) -> bool {
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let del_add = KvReaderDelAdd::new(field_bytes);
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
(None, None) => (),
// if both contains a value and values are the same, check the next field.
(Some(del), Some(add)) if del == add => (),
// otherwise the fields are different, return true.
_otherwise => return true,
}
}
}
false
}
/// Factorize tokenizer building.
fn tokenizer_builder<'a>(
stop_words: Option<&'a fst::Set<&[u8]>>,
allowed_separators: Option<&'a [&str]>,
dictionary: Option<&'a [&str]>,
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
) -> TokenizerBuilder<'a, &'a [u8]> {
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
@@ -61,130 +193,144 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
if let Some(separators) = allowed_separators {
tokenizer_builder.separators(separators);
}
let tokenizer = tokenizer_builder.build();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let obkv = KvReader::<FieldId>::new(value);
if let Some(script_language) = script_language {
tokenizer_builder.allow_list(&script_language);
}
documents_ids.push(document_id);
buffers.key_buffer.clear();
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
tokenizer_builder
}
let mut script_language_word_count = HashMap::new();
/// Extract words maped with their positions of a document,
/// ensuring no Language detection mistakes was made.
fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
stop_words: Option<&fst::Set<&[u8]>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: u32,
del_add: DelAdd,
buffers: &'a mut Buffers,
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
let mut script_language_word_count = HashMap::new();
extract_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
&mut buffers,
&mut script_language_word_count,
&mut docid_word_positions_sorter,
)?;
tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
del_add,
buffers,
&mut script_language_word_count,
)?;
// if we detect a potetial mistake in the language detection,
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
// context: https://github.com/meilisearch/meilisearch/issues/3565
if script_language_word_count
.values()
.map(Vec::as_slice)
.any(potential_language_detection_error)
{
// build an allow list with the most frequent detected languages in the document.
let script_language: HashMap<_, _> =
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
// if we detect a potetial mistake in the language detection,
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
// context: https://github.com/meilisearch/meilisearch/issues/3565
if script_language_word_count
.values()
.map(Vec::as_slice)
.any(potential_language_detection_error)
{
// build an allow list with the most frequent detected languages in the document.
let script_language: HashMap<_, _> =
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
// if the allow list is empty, meaning that no Language is considered frequent,
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
}
tokenizer_builder.allow_list(&script_language);
let tokenizer = tokenizer_builder.build();
// if the allow list is empty, meaning that no Language is considered frequent,
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
let mut builder = tokenizer_builder(
stop_words,
dictionary,
allowed_separators,
Some(&script_language),
);
let tokenizer = builder.build();
script_language_word_count.clear();
script_language_word_count.clear();
// rerun the extraction.
extract_tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
&mut buffers,
&mut script_language_word_count,
&mut docid_word_positions_sorter,
)?;
}
}
for (script, languages_frequency) in script_language_word_count {
for (language, _) in languages_frequency {
let entry = script_language_docids
.entry((script, language))
.or_insert_with(RoaringBitmap::new);
entry.push(document_id);
}
// rerun the extraction.
tokens_from_document(
&obkv,
searchable_fields,
&tokenizer,
max_positions_per_attributes,
del_add,
buffers,
&mut script_language_word_count,
)?;
}
}
sorter_into_reader(docid_word_positions_sorter, indexer)
.map(|reader| (documents_ids, reader, script_language_docids))
Ok((&buffers.obkv_buffer, script_language_word_count))
}
fn extract_tokens_from_document(
/// Extract words maped with their positions of a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
buffers: &mut Buffers,
del_add: DelAdd,
buffers: &'a mut Buffers,
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
) -> Result<&'a [u8]> {
buffers.obkv_buffer.clear();
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
// if field is searchable.
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
// parse json.
let value =
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
// prepare writting destination.
buffers.obkv_positions_buffer.clear();
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
// convert json into an unique string.
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
// create an iterator of token with their positions.
let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
for (index, token) in tokens {
// if a language has been detected for the token, we update the counter.
if let Some(language) = token.language {
let script = token.script;
let entry =
script_language_word_count.entry(script).or_insert_with(Vec::new);
match entry.iter_mut().find(|(l, _)| *l == language) {
Some((_, n)) => *n += 1,
None => entry.push((language, 1)),
}
}
// keep a word only if it is not empty and fit in a LMDB key.
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
writer.insert(position, token.as_bytes())?;
}
}
let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
buffers.key_buffer.truncate(mem::size_of::<u32>());
buffers.key_buffer.extend_from_slice(token.as_bytes());
let position: u16 = index
.try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let position = absolute_from_relative_position(field_id, position);
docid_word_positions_sorter
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
}
// write positions into document.
let positions = writer.into_inner()?;
document_writer.insert(field_id, positions)?;
}
}
}
}
Ok(())
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
}
/// Transform a JSON value into a string that can be indexed.
@@ -287,10 +433,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
#[derive(Default)]
struct Buffers {
// the key buffer is the concatenation of the internal document id with the field id.
// The buffer has to be completelly cleared between documents,
// and the field id part must be cleared between each field.
key_buffer: Vec<u8>,
// the field buffer for each fields desserialization, and must be cleared between each field.
field_buffer: String,
// buffer used to store the value data containing an obkv.
obkv_buffer: Vec<u8>,
// buffer used to store the value data containing an obkv of tokens with their positions.
obkv_positions_buffer: Vec<u8>,
}

View File

@@ -1,14 +1,15 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use heed::{BytesDecode, BytesEncode};
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::Result;
/// Extracts the facet number and the documents ids where this facet number appear.
@@ -17,30 +18,39 @@ use crate::Result;
/// documents ids from the given chunk of docid facet number positions.
#[logging_timer::time]
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
docid_fid_facet_number: grenad::Reader<R>,
fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut cursor = docid_fid_facet_number.into_cursor()?;
while let Some((key_bytes, _)) = cursor.move_on_next()? {
let mut buffer = Vec::new();
let mut cursor = fid_docid_facet_number.into_cursor()?;
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
let (field_id, document_id, number) =
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
}
sorter_into_reader(facet_number_docids_sorter, indexer)

View File

@@ -1,13 +1,14 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::{io, str};
use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
use crate::{FieldId, Result};
/// Extracts the facet string and the documents ids where this facet string appear.
///
@@ -17,22 +18,23 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
@@ -40,21 +42,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
let normalised_truncated_value: String;
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
normalised_truncated_value = normalised_value
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalised_value = normalised_truncated_value.as_str();
}
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
let normalized_value = str::from_utf8(normalized_value_bytes)?;
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
// document id is encoded in native-endian because of the CBO roaring bitmap codec
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
}
obkv.finish()?;
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
}
sorter_into_reader(facet_string_docids_sorter, indexer)

View File

@@ -1,27 +1,39 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use std::convert::TryInto;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use std::mem::size_of;
use std::result::Result as StdResult;
use grenad::Sorter;
use heed::zerocopy::AsBytes;
use heed::BytesEncode;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::{from_slice, Value};
use FilterableValues::{Empty, Null, Values};
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
use crate::{
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
/// The extracted facet values stored in grenad files by type.
pub struct ExtractedFacetValues {
pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
}
/// Extracts the facet values of each faceted field of each document.
@@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
max_memory.map(|m| m / 2),
);
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
// The tuples represents the Del and Add side for a bitmap
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
// We create two buffer for mutable ref issues with closures.
let mut numbers_key_buffer = Vec::new();
let mut strings_key_buffer = Vec::new();
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
for (field_id, field_bytes) in obkv.iter() {
if faceted_fields.contains(&field_id) {
key_buffer.clear();
numbers_key_buffer.clear();
strings_key_buffer.clear();
// Set key to the field_id
// Note: this encoding is consistent with FieldIdCodec
key_buffer.extend_from_slice(&field_id.to_be_bytes());
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
// Here, we know already that the document must be added to the “field id exists” database
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
let document = BEU32::from(document).get();
facet_exists_docids.entry(field_id).or_default().insert(document);
// For the other extraction tasks, prefix the key with the field_id and the document_id
key_buffer.extend_from_slice(docid_bytes);
numbers_key_buffer.extend_from_slice(docid_bytes);
strings_key_buffer.extend_from_slice(docid_bytes);
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
let del_add_obkv = obkv::KvReader::new(field_bytes);
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};
let add_value = match del_add_obkv.get(DelAdd::Addition) {
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
None => None,
};
match extract_facet_values(
&value,
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
) {
FilterableValues::Null => {
facet_is_null_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Empty => {
facet_is_empty_docids.entry(field_id).or_default().insert(document);
}
FilterableValues::Values { numbers, strings } => {
// insert facet numbers in sorter
for number in numbers {
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert the document id on the Del and the Add side if the field exists.
let (ref mut del_exists, ref mut add_exists) =
facet_exists_docids.entry(field_id).or_default();
let (ref mut del_is_null, ref mut add_is_null) =
facet_is_null_docids.entry(field_id).or_default();
let (ref mut del_is_empty, ref mut add_is_empty) =
facet_is_empty_docids.entry(field_id).or_default();
fid_docid_facet_numbers_sorter
.insert(&key_buffer, ().as_bytes())?;
}
if del_value.is_some() {
del_exists.insert(document);
}
if add_value.is_some() {
add_exists.insert(document);
}
let geo_support =
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, geo_support));
// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {
insert_numbers_diff(
&mut fid_docid_facet_numbers_sorter,
&mut numbers_key_buffer,
del_numbers,
add_numbers,
)
};
let mut insert_strings_diff = |del_strings, add_strings| {
insert_strings_diff(
&mut fid_docid_facet_strings_sorter,
&mut strings_key_buffer,
del_strings,
add_strings,
)
};
match (del_filterable_values, add_filterable_values) {
(None, None) => (),
(Some(del_filterable_values), None) => match del_filterable_values {
Null => {
del_is_null.insert(document);
}
// insert normalized and original facet string in sorter
for (normalized, original) in
strings.into_iter().filter(|(n, _)| !n.is_empty())
{
let normalized_truncated_value: String = normalized
.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
fid_docid_facet_strings_sorter
.insert(&key_buffer, original.as_bytes())?;
Empty => {
del_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
},
(None, Some(add_filterable_values)) => match add_filterable_values {
Null => {
add_is_null.insert(document);
}
Empty => {
add_is_empty.insert(document);
}
Values { numbers, strings } => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
}
},
(Some(del_filterable_values), Some(add_filterable_values)) => {
match (del_filterable_values, add_filterable_values) {
(Null, Null) | (Empty, Empty) => (),
(Null, Empty) => {
del_is_null.insert(document);
add_is_empty.insert(document);
}
(Empty, Null) => {
del_is_empty.insert(document);
add_is_null.insert(document);
}
(Null, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_null.insert(document);
}
(Empty, Values { numbers, strings }) => {
insert_numbers_diff(vec![], numbers)?;
insert_strings_diff(vec![], strings)?;
del_is_empty.insert(document);
}
(Values { numbers, strings }, Null) => {
add_is_null.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(Values { numbers, strings }, Empty) => {
add_is_empty.insert(document);
insert_numbers_diff(numbers, vec![])?;
insert_strings_diff(strings, vec![])?;
}
(
Values { numbers: del_numbers, strings: del_strings },
Values { numbers: add_numbers, strings: add_strings },
) => {
insert_numbers_diff(del_numbers, add_numbers)?;
insert_strings_diff(del_strings, add_strings)?;
}
}
}
}
@@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
}
}
let mut buffer = Vec::new();
let mut facet_exists_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_exists_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
@@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_null_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
@@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
}
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
Ok(ExtractedFacetValues {
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
})
}
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
fn deladd_obkv_cbo_roaring_bitmaps(
buffer: &mut Vec<u8>,
del_bitmap: &RoaringBitmap,
add_bitmap: &RoaringBitmap,
) -> io::Result<()> {
buffer.clear();
let mut obkv = KvWriterDelAdd::new(buffer);
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
obkv.finish()
}
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_string(s: String) -> String {
s.char_indices()
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect()
}
/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn insert_numbers_diff<MF>(
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_numbers: Vec<f64>,
mut add_numbers: Vec<f64>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the float numbers
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
let merged_numbers_iter = itertools::merge_join_by(
del_numbers.into_iter().map(OrderedFloat),
add_numbers.into_iter().map(OrderedFloat),
|del, add| del.cmp(add),
);
// insert facet numbers in sorter
for eob in merged_numbers_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(OrderedFloat(number)) => {
if let Some(value_bytes) = f64_into_bytes(number) {
key_buffer.extend_from_slice(&value_bytes);
key_buffer.extend_from_slice(&number.to_be_bytes());
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those numbers.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, ().as_bytes())?;
let bytes = obkv.into_inner()?;
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
}
}
}
}
Ok(())
}
/// Computes the diff between both Del and Add strings and
/// only inserts the parts that differ in the sorter.
fn insert_strings_diff<MF>(
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
key_buffer: &mut Vec<u8>,
mut del_strings: Vec<(String, String)>,
mut add_strings: Vec<(String, String)>,
) -> Result<()>
where
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
{
// We sort and dedup the normalized and original strings
del_strings.sort_unstable();
add_strings.sort_unstable();
del_strings.dedup();
add_strings.dedup();
let merged_strings_iter = itertools::merge_join_by(
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|del, add| del.cmp(add),
);
// insert normalized and original facet string in sorter
for eob in merged_strings_iter {
key_buffer.truncate(TRUNCATE_SIZE);
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
EitherOrBoth::Right((normalized, original)) => {
let truncated = truncate_string(normalized);
key_buffer.extend_from_slice(truncated.as_bytes());
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, original)?;
let bytes = obkv.into_inner()?;
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
}
}
}
Ok(())
}
/// Represent what a document field contains.
enum FilterableValues {
/// Corresponds to the JSON `null` value.
@@ -182,6 +409,7 @@ enum FilterableValues {
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
/// Extracts the facet values of a JSON field.
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
fn inner_extract_facet_values(
value: &Value,

View File

@@ -1,16 +1,17 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use grenad::Sorter;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
use crate::Result;
const MAX_COUNTED_WORDS: usize = 30;
/// Extracts the field id word count and the documents ids where
/// this field id with this amount of words appear.
@@ -21,7 +22,7 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
@@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
max_memory,
);
// This map is assumed to not consume a lot of memory.
let mut document_fid_wordcount = HashMap::new();
let mut current_document_id = None;
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let curr_document_id = *current_document_id.get_or_insert(document_id);
if curr_document_id != document_id {
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
curr_document_id,
)?;
current_document_id = Some(document_id);
}
for position in read_u32_ne_bytes(value) {
let (field_id, _) = relative_from_absolute_position(position);
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
*value += 1;
}
}
if let Some(document_id) = current_document_id {
// We must make sure that don't lose the current document field id
// word count map if we break because we reached the end of the chunk.
drain_document_fid_wordcount_into_sorter(
&mut fid_word_count_docids_sorter,
&mut document_fid_wordcount,
document_id,
)?;
}
sorter_into_reader(fid_word_count_docids_sorter, indexer)
}
fn drain_document_fid_wordcount_into_sorter(
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
document_fid_wordcount: &mut HashMap<FieldId, u32>,
document_id: DocumentId,
) -> Result<()> {
let mut key_buffer = Vec::new();
for (fid, count) in document_fid_wordcount.drain() {
if count <= 30 {
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
if word_count <= MAX_COUNTED_WORDS {
key_buffer.clear();
key_buffer.extend_from_slice(&fid.to_be_bytes());
key_buffer.push(count as u8);
key_buffer.extend_from_slice(fid_bytes);
key_buffer.push(word_count as u8);
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}
Ok(())
sorter_into_reader(fid_word_count_docids_sorter, indexer)
}

View File

@@ -1,5 +1,5 @@
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use concat_arrays::concat_arrays;
use serde_json::Value;
@@ -18,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
indexer: GrenadParameters,
primary_key_id: FieldId,
(lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let mut writer = create_writer(

View File

@@ -1,6 +1,6 @@
use std::convert::TryFrom;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use bytemuck::cast_slice;
use serde_json::{from_slice, Value};
@@ -18,7 +18,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
indexer: GrenadParameters,
primary_key_id: FieldId,
vectors_fid: FieldId,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let mut writer = create_writer(

View File

@@ -1,18 +1,20 @@
use std::collections::HashSet;
use std::collections::{BTreeSet, HashSet};
use std::fs::File;
use std::io::{self, BufReader};
use std::iter::FromIterator;
use std::io;
use roaring::RoaringBitmap;
use heed::BytesDecode;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
try_split_array_at, GrenadParameters,
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
try_split_array_at, writer_into_reader, GrenadParameters,
};
use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::index_documents::helpers::read_u32_ne_bytes;
use crate::{relative_from_absolute_position, FieldId, Result};
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::MergeFn;
use crate::{DocumentId, FieldId, Result};
/// Extracts the word and the documents ids where this word appear.
///
@@ -26,65 +28,148 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_docids_sorter = create_sorter(
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);
let mut key_buffer = Vec::new();
let mut del_words = BTreeSet::new();
let mut add_words = BTreeSet::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let (fid_bytes, _) = try_split_array_at(fid_bytes)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let fid = u16::from_be_bytes(fid_bytes);
let del_add_reader = KvReaderDelAdd::new(&value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
del_words.insert(word.to_vec());
}
}
// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (_pos, word) in KvReaderU16::new(&addition).iter() {
add_words.insert(word.to_vec());
}
}
words_into_sorter(
document_id,
fid,
&mut key_buffer,
&del_words,
&add_words,
&mut word_fid_docids_sorter,
)?;
del_words.clear();
add_words.clear();
}
let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 3),
);
let mut exact_word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|x| x / 2),
max_memory.map(|x| x / 3),
);
let mut value_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, positions)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
while let Some((key, value)) = iter.next()? {
// only keep the value if their is a change to apply in the DB.
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
word_fid_docids_writer.insert(key, value)?;
}
let (word, fid) = StrBEU16Codec::bytes_decode(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let bitmap = RoaringBitmap::from_iter(Some(document_id));
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
// If there are no exact attributes, we do not need to iterate over positions.
if exact_attributes.is_empty() {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
// every words contained in an attribute set to exact must be pushed in the exact_words list.
if exact_attributes.contains(&fid) {
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
} else {
let mut added_to_exact = false;
let mut added_to_word_docids = false;
for position in read_u32_ne_bytes(positions) {
// as soon as we know that this word had been to both readers, we don't need to
// iterate over the positions.
if added_to_exact && added_to_word_docids {
break;
}
let (fid, _) = relative_from_absolute_position(position);
if exact_attributes.contains(&fid) && !added_to_exact {
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_exact = true;
} else if !added_to_word_docids {
word_docids_sorter.insert(word_bytes, &value_buffer)?;
added_to_word_docids = true;
}
}
word_docids_sorter.insert(word.as_bytes(), &value)?;
}
}
Ok((
sorter_into_reader(word_docids_sorter, indexer)?,
sorter_into_reader(exact_word_docids_sorter, indexer)?,
writer_into_reader(word_fid_docids_writer)?,
))
}
fn words_into_sorter(
document_id: DocumentId,
fid: FieldId,
key_buffer: &mut Vec<u8>,
del_words: &BTreeSet<Vec<u8>>,
add_words: &BTreeSet<Vec<u8>>,
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let word_bytes = match eob {
Left(word_bytes) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Right(word_bytes) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
Both(word_bytes, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
word_bytes
}
};
key_buffer.clear();
key_buffer.extend_from_slice(&word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}

View File

@@ -1,51 +0,0 @@
use std::fs::File;
use std::io::{self, BufReader};
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{relative_from_absolute_position, DocumentId, Result};
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
#[logging_timer::time]
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (fid, _) = relative_from_absolute_position(position);
key_buffer.extend_from_slice(&fid.to_be_bytes());
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
}
}
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
Ok(word_fid_docids_reader)
}

View File

@@ -1,16 +1,17 @@
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::collections::{BTreeMap, VecDeque};
use std::fs::File;
use std::io::BufReader;
use std::{cmp, io, mem, str, vec};
use std::{cmp, io};
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters, MergeFn,
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
writer_into_reader, GrenadParameters, MergeFn,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::{positions_proximity, MAX_DISTANCE};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::{DocumentId, Result};
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -21,63 +22,143 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_pair_proximity_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / 2),
);
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
.into_iter()
.map(|_| {
create_sorter(
grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory.map(|m| m / MAX_DISTANCE as usize),
)
})
.collect();
// This map is assumed to not consume a lot of memory.
let mut document_word_positions_heap = BinaryHeap::new();
let mut del_word_positions: VecDeque<(String, u16)> =
VecDeque::with_capacity(MAX_DISTANCE as usize);
let mut add_word_positions: VecDeque<(String, u16)> =
VecDeque::with_capacity(MAX_DISTANCE as usize);
let mut del_word_pair_proximity = BTreeMap::new();
let mut add_word_pair_proximity = BTreeMap::new();
let mut current_document_id = None;
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = u32::from_be_bytes(document_id_bytes);
let word = str::from_utf8(word_bytes)?;
let curr_document_id = *current_document_id.get_or_insert(document_id);
if curr_document_id != document_id {
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
// if we change document, we fill the sorter
if current_document_id.map_or(false, |id| id != document_id) {
puffin::profile_scope!("Document into sorter");
document_word_positions_into_sorter(
curr_document_id,
document_word_positions_heap,
&mut word_pair_proximity_docids_sorter,
current_document_id.unwrap(),
&del_word_pair_proximity,
&add_word_pair_proximity,
&mut word_pair_proximity_docids_sorters,
)?;
current_document_id = Some(document_id);
del_word_pair_proximity.clear();
add_word_pair_proximity.clear();
}
let word = word.to_string();
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
positions.sort_unstable();
let mut iter = positions.into_iter();
if let Some(position) = iter.next() {
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
}
current_document_id = Some(document_id);
let (del, add): (Result<_>, Result<_>) = rayon::join(
|| {
// deletions
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
for (position, word) in KvReaderU16::new(deletion).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while del_word_positions.get(0).map_or(false, |(_w, p)| {
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
}) {
word_positions_into_word_pair_proximity(
&mut del_word_positions,
&mut del_word_pair_proximity,
)?;
}
// insert the new word.
let word = std::str::from_utf8(word)?;
del_word_positions.push_back((word.to_string(), position));
}
while !del_word_positions.is_empty() {
word_positions_into_word_pair_proximity(
&mut del_word_positions,
&mut del_word_pair_proximity,
)?;
}
}
Ok(())
},
|| {
// additions
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
for (position, word) in KvReaderU16::new(addition).iter() {
// drain the proximity window until the head word is considered close to the word we are inserting.
while add_word_positions.get(0).map_or(false, |(_w, p)| {
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
}) {
word_positions_into_word_pair_proximity(
&mut add_word_positions,
&mut add_word_pair_proximity,
)?;
}
// insert the new word.
let word = std::str::from_utf8(word)?;
add_word_positions.push_back((word.to_string(), position));
}
while !add_word_positions.is_empty() {
word_positions_into_word_pair_proximity(
&mut add_word_positions,
&mut add_word_pair_proximity,
)?;
}
}
Ok(())
},
);
del?;
add?;
}
if let Some(document_id) = current_document_id {
// We must make sure that don't lose the current document field id
// word count map if we break because we reached the end of the chunk.
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
puffin::profile_scope!("Final document into sorter");
document_word_positions_into_sorter(
document_id,
document_word_positions_heap,
&mut word_pair_proximity_docids_sorter,
&del_word_pair_proximity,
&add_word_pair_proximity,
&mut word_pair_proximity_docids_sorters,
)?;
}
{
puffin::profile_scope!("sorter_into_reader");
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
for sorter in word_pair_proximity_docids_sorters {
sorter.write_into_stream_writer(&mut writer)?;
}
writer_into_reader(writer)
}
}
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
@@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
/// close to each other.
fn document_word_positions_into_sorter(
document_id: DocumentId,
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
) -> Result<()> {
let mut word_pair_proximity = HashMap::new();
let mut ordered_peeked_word_positions = Vec::new();
while !word_positions_heap.is_empty() {
while let Some(peeked_word_position) = word_positions_heap.pop() {
ordered_peeked_word_positions.push(peeked_word_position);
if ordered_peeked_word_positions.len() == 7 {
break;
}
}
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
for PeekedWordPosition { word, position, .. } in tail {
let prox = positions_proximity(head.position, *position);
if prox > 0 && prox < MAX_DISTANCE {
word_pair_proximity
.entry((head.word.clone(), word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
}
// Push the tail in the heap.
let tail_iter = ordered_peeked_word_positions.drain(1..);
word_positions_heap.extend(tail_iter);
// Advance the head and push it in the heap.
if let Some(mut head) = ordered_peeked_word_positions.pop() {
if let Some(next_position) = head.iter.next() {
let prox = positions_proximity(head.position, next_position);
if prox > 0 && prox < MAX_DISTANCE {
word_pair_proximity
.entry((head.word.clone(), head.word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
word_positions_heap.push(PeekedWordPosition {
word: head.word,
position: next_position,
iter: head.iter,
});
}
}
}
}
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
let mut key_buffer = Vec::new();
for ((w1, w2), prox) in word_pair_proximity {
for eob in
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
d.cmp(a)
})
{
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let ((w1, w2), prox) = match eob {
Left(key_value) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
key_value
}
Right(key_value) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key_value
}
Both(key_value, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key_value
}
};
key_buffer.clear();
key_buffer.push(prox as u8);
key_buffer.push(*prox as u8);
key_buffer.extend_from_slice(w1.as_bytes());
key_buffer.push(0);
key_buffer.extend_from_slice(w2.as_bytes());
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
word_pair_proximity_docids_sorters[*prox as usize - 1]
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}
struct PeekedWordPosition<I> {
word: String,
position: u32,
iter: I,
}
impl<I> Ord for PeekedWordPosition<I> {
fn cmp(&self, other: &Self) -> Ordering {
self.position.cmp(&other.position).reverse()
}
}
impl<I> PartialOrd for PeekedWordPosition<I> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<I> Eq for PeekedWordPosition<I> {}
impl<I> PartialEq for PeekedWordPosition<I> {
fn eq(&self, other: &Self) -> bool {
self.position == other.position
fn word_positions_into_word_pair_proximity(
word_positions: &mut VecDeque<(String, u16)>,
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
) -> Result<()> {
let (head_word, head_position) = word_positions.pop_front().unwrap();
for (word, position) in word_positions.iter() {
let prox = index_proximity(head_position as u32, *position as u32) as u8;
if prox > 0 && prox < MAX_DISTANCE as u8 {
word_pair_proximity
.entry((head_word.clone(), word.clone()))
.and_modify(|p| {
*p = cmp::min(*p, prox);
})
.or_insert(prox);
}
}
Ok(())
}

View File

@@ -1,13 +1,18 @@
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use obkv::KvReaderU16;
use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
try_split_array_at, GrenadParameters,
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
GrenadParameters,
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::MergeFn;
use crate::{bucketed_position, DocumentId, Result};
/// Extracts the word positions and the documents ids where this word appear.
///
@@ -17,39 +22,117 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_position_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,
merge_cbo_roaring_bitmaps,
merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
max_memory,
);
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
let mut current_document_id: Option<u32> = None;
let mut key_buffer = Vec::new();
let mut cursor = docid_word_positions.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
let (document_id_bytes, word_bytes) = try_split_array_at(key)
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
let document_id = DocumentId::from_be_bytes(document_id_bytes);
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
let (_, position) = relative_from_absolute_position(position);
let position = bucketed_position(position);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
if current_document_id.map_or(false, |id| document_id != id) {
words_position_into_sorter(
current_document_id.unwrap(),
&mut key_buffer,
&del_word_positions,
&add_word_positions,
&mut word_position_docids_sorter,
)?;
del_word_positions.clear();
add_word_positions.clear();
}
current_document_id = Some(document_id);
let del_add_reader = KvReaderDelAdd::new(&value);
// extract all unique words to remove.
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
let position = bucketed_position(position);
del_word_positions.insert((position, word_bytes.to_vec()));
}
}
// extract all unique additional words.
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
let position = bucketed_position(position);
add_word_positions.insert((position, word_bytes.to_vec()));
}
}
}
if let Some(document_id) = current_document_id {
words_position_into_sorter(
document_id,
&mut key_buffer,
&del_word_positions,
&add_word_positions,
&mut word_position_docids_sorter,
)?;
}
// TODO remove noop DelAdd OBKV
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
Ok(word_position_docids_reader)
}
fn words_position_into_sorter(
document_id: DocumentId,
key_buffer: &mut Vec<u8>,
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
let mut buffer = Vec::new();
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
{
buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
let (position, word_bytes) = match eob {
Left(key) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
key
}
Right(key) => {
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key
}
Both(key, _) => {
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
key
}
};
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0);
key_buffer.extend_from_slice(&position.to_be_bytes());
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
}
Ok(())
}

View File

@@ -6,13 +6,11 @@ mod extract_fid_word_count_docids;
mod extract_geo_points;
mod extract_vector_points;
mod extract_word_docids;
mod extract_word_fid_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
use std::collections::HashSet;
use std::fs::File;
use std::io::BufReader;
use crossbeam_channel::Sender;
use log::debug;
@@ -26,12 +24,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::extract_vector_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_fid_docids::extract_word_fid_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
GrenadParameters, MergeFn, MergeableReader,
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
MergeableReader,
};
use super::{helpers, TypedChunk};
use crate::{FieldId, Result};
@@ -40,8 +37,8 @@ use crate::{FieldId, Result};
/// Send data in grenad file over provided Sender.
#[allow(clippy::too_many_arguments)]
pub(crate) fn data_from_obkv_documents(
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: Option<HashSet<FieldId>>,
@@ -94,9 +91,9 @@ pub(crate) fn data_from_obkv_documents(
let (
docid_word_positions_chunks,
(
docid_fid_facet_numbers_chunks,
fid_docid_facet_numbers_chunks,
(
docid_fid_facet_strings_chunks,
fid_docid_facet_strings_chunks,
(
facet_is_null_docids_chunks,
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
@@ -153,7 +150,7 @@ pub(crate) fn data_from_obkv_documents(
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
@@ -163,7 +160,7 @@ pub(crate) fn data_from_obkv_documents(
"word-pair-proximity-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
@@ -176,21 +173,24 @@ pub(crate) fn data_from_obkv_documents(
spawn_extraction_task::<
_,
_,
Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
merge_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
merge_cbo_roaring_bitmaps,
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
},
"word-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
@@ -199,18 +199,9 @@ pub(crate) fn data_from_obkv_documents(
TypedChunk::WordPositionDocids,
"word-position-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_word_positions_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_word_fid_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::WordFidDocids,
"word-fid-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_fid_facet_strings_chunks,
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
fid_docid_facet_strings_chunks,
indexer,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
@@ -219,8 +210,8 @@ pub(crate) fn data_from_obkv_documents(
"field-id-facet-string-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
docid_fid_facet_numbers_chunks,
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
fid_docid_facet_numbers_chunks,
indexer,
lmdb_writer_sx,
extract_facet_number_docids,
@@ -274,7 +265,7 @@ fn spawn_extraction_task<FE, FS, M>(
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
original_documents_chunk: Result<grenad::Reader<File>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
vectors_field_id: Option<FieldId>,
@@ -316,7 +307,7 @@ fn send_original_documents_data(
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
flattened_documents_chunk: Result<grenad::Reader<File>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: &Option<HashSet<FieldId>>,
@@ -333,10 +324,7 @@ fn send_and_extract_flattened_documents_data(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<CursorClonableMmap>,
(
grenad::Reader<BufReader<File>>,
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
),
(grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
),
),
)> {
@@ -356,7 +344,7 @@ fn send_and_extract_flattened_documents_data(
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
@@ -384,8 +372,8 @@ fn send_and_extract_flattened_documents_data(
},
|| {
let ExtractedFacetValues {
docid_fid_facet_numbers_chunk,
docid_fid_facet_strings_chunk,
fid_docid_facet_numbers_chunk,
fid_docid_facet_strings_chunk,
fid_facet_is_null_docids_chunk,
fid_facet_is_empty_docids_chunk,
fid_facet_exists_docids_chunk,
@@ -396,26 +384,26 @@ fn send_and_extract_flattened_documents_data(
geo_fields_ids,
)?;
// send docid_fid_facet_numbers_chunk to DB writer
let docid_fid_facet_numbers_chunk =
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
// send fid_docid_facet_numbers_chunk to DB writer
let fid_docid_facet_numbers_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
docid_fid_facet_numbers_chunk.clone(),
fid_docid_facet_numbers_chunk.clone(),
)));
// send docid_fid_facet_strings_chunk to DB writer
let docid_fid_facet_strings_chunk =
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
// send fid_docid_facet_strings_chunk to DB writer
let fid_docid_facet_strings_chunk =
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
docid_fid_facet_strings_chunk.clone(),
fid_docid_facet_strings_chunk.clone(),
)));
Ok((
docid_fid_facet_numbers_chunk,
fid_docid_facet_numbers_chunk,
(
docid_fid_facet_strings_chunk,
fid_docid_facet_strings_chunk,
(
fid_facet_is_null_docids_chunk,
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
@@ -425,5 +413,5 @@ fn send_and_extract_flattened_documents_data(
},
);
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
}

View File

@@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, BufReader, BufWriter, Seek};
use std::io::{self, Seek};
use std::time::Instant;
use grenad::{CompressionType, Sorter};
@@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
typ: grenad::CompressionType,
level: Option<u32>,
file: R,
) -> grenad::Writer<BufWriter<R>> {
) -> grenad::Writer<R> {
let mut builder = grenad::Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
builder.compression_level(level);
}
builder.build(BufWriter::new(file))
builder.build(file)
}
pub fn create_sorter(
@@ -53,7 +53,8 @@ pub fn create_sorter(
pub fn sorter_into_reader(
sorter: grenad::Sorter<MergeFn>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<BufReader<File>>> {
) -> Result<grenad::Reader<File>> {
puffin::profile_function!();
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
@@ -64,18 +65,16 @@ pub fn sorter_into_reader(
writer_into_reader(writer)
}
pub fn writer_into_reader(
writer: grenad::Writer<BufWriter<File>>,
) -> Result<grenad::Reader<BufReader<File>>> {
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
let mut file = writer.into_inner()?;
file.rewind()?;
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
grenad::Reader::new(file).map_err(Into::into)
}
pub unsafe fn as_cloneable_grenad(
reader: &grenad::Reader<BufReader<File>>,
reader: &grenad::Reader<File>,
) -> Result<grenad::Reader<CursorClonableMmap>> {
let file = reader.get_ref().get_ref();
let file = reader.get_ref();
let mmap = memmap2::Mmap::map(file)?;
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
let reader = grenad::Reader::new(cursor)?;
@@ -91,8 +90,8 @@ where
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
}
impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
type Output = grenad::Reader<BufReader<File>>;
impl MergeableReader for Vec<grenad::Reader<File>> {
type Output = grenad::Reader<File>;
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut merger = MergerBuilder::new(merge_fn);
@@ -101,8 +100,8 @@ impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
}
}
impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
@@ -115,6 +114,22 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu
}
}
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
let mut m1 = MergerBuilder::new(merge_fn);
let mut m2 = MergerBuilder::new(merge_fn);
let mut m3 = MergerBuilder::new(merge_fn);
for (r1, r2, r3) in self.into_iter() {
m1.push(r1)?;
m2.push(r2)?;
m3.push(r3)?;
}
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
}
}
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
impl<R: io::Read + io::Seek> MergerBuilder<R> {
@@ -127,7 +142,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
Ok(())
}
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
let merger = self.0.build();
let mut writer = create_writer(
params.chunk_compression_type,
@@ -178,7 +193,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
reader: grenad::Reader<R>,
indexer: GrenadParameters,
documents_chunk_size: usize,
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
let mut continue_reading = true;
let mut cursor = reader.into_cursor()?;

View File

@@ -6,11 +6,13 @@ use std::result::Result as StdResult;
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::transform::Operation;
use crate::Result;
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
#[allow(unused)]
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
@@ -75,57 +77,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
Ok(obkvs.last().unwrap().clone())
}
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
pub fn merge_two_del_add_obkvs(
base: obkv::KvReaderU16,
update: obkv::KvReaderU16,
merge_additions: bool,
buffer: &mut Vec<u8>,
) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
buffer.clear();
let mut writer = obkv::KvWriter::new(buffer);
let mut value_buffer = Vec::new();
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
match eob {
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
Left((k, v)) => {
if merge_additions {
writer.insert(k, v).unwrap()
} else {
// If merge_additions is false, recreate an obkv keeping the deletions only.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(v);
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}
Right((k, v)) => writer.insert(k, v).unwrap(),
Both((k, base), (_, update)) => {
// merge deletions and additions.
value_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
let base_reader = KvReaderDelAdd::new(base);
let update_reader = KvReaderDelAdd::new(update);
// keep newest deletion.
if let Some(deletion) = update_reader
.get(DelAdd::Deletion)
.or_else(|| base_reader.get(DelAdd::Deletion))
{
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
}
// keep base addition only if merge_additions is true.
let base_addition =
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
// keep newest addition.
// TODO use or_else
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
value_writer.insert(DelAdd::Addition, addition).unwrap();
}
value_writer.finish().unwrap();
writer.insert(k, &value_buffer).unwrap()
}
}
}
writer.finish().unwrap();
}
/// Merge all the obks in the order we see them.
pub fn merge_obkvs_and_operations<'a>(
/// Merge all the obkvs from the newest to the oldest.
fn inner_merge_del_add_obkvs<'a>(
obkvs: &[Cow<'a, [u8]>],
merge_additions: bool,
) -> Result<Cow<'a, [u8]>> {
// pop the newest operation from the list.
let (newest, obkvs) = obkvs.split_last().unwrap();
// keep the operation type for the returned value.
let newest_operation_type = newest[0];
// treat the newest obkv as the starting point of the merge.
let mut acc_operation_type = newest_operation_type;
let mut acc = newest[1..].to_vec();
let mut buffer = Vec::new();
// reverse iter from the most recent to the oldest.
for current in obkvs.into_iter().rev() {
// if in the previous iteration there was a complete deletion,
// stop the merge process.
if acc_operation_type == Operation::Deletion as u8 {
break;
}
let newest = obkv::KvReader::new(&acc);
let oldest = obkv::KvReader::new(&current[1..]);
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
// we want the result of the merge into our accumulator.
std::mem::swap(&mut acc, &mut buffer);
acc_operation_type = current[0];
}
acc.insert(0, newest_operation_type);
Ok(Cow::from(acc))
}
/// Merge all the obkvs from the newest to the oldest.
pub fn obkvs_merge_additions_and_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
// [add, add, delete, add, add]
// we can ignore everything that happened before the last delete.
let starting_position =
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
// [add, add, delete]
// if the last operation was a deletion then we simply return the deletion
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
{
return Ok(obkvs[obkvs.len() - 1].clone());
}
let mut buffer = Vec::new();
// (add, add, delete) [add, add]
// in the other case, no deletion will be encountered during the merge
let mut ret =
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current[1..]);
merge_two_obkvs(first, second, &mut buffer);
// we want the result of the merge into our accumulator
std::mem::swap(&mut acc, &mut buffer);
acc
});
ret.insert(0, Operation::Addition as u8);
Ok(Cow::from(ret))
inner_merge_del_add_obkvs(obkvs, true)
}
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
inner_merge_del_add_obkvs(obkvs, false)
}
/// Do a union of all the CboRoaringBitmaps in the values.
pub fn merge_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
@@ -138,3 +206,36 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
Ok(Cow::from(vec))
}
}
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
// Retrieve the bitmaps from both sides
let mut del_bitmaps_bytes = Vec::new();
let mut add_bitmaps_bytes = Vec::new();
for value in values {
let obkv = KvReaderDelAdd::new(value);
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
del_bitmaps_bytes.push(bitmap_bytes);
}
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
add_bitmaps_bytes.push(bitmap_bytes);
}
}
let mut output_deladd_obkv = KvWriterDelAdd::memory();
let mut buffer = Vec::new();
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
buffer.clear();
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
}
}

View File

@@ -14,7 +14,8 @@ pub use grenad_helpers::{
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
serialize_roaring_bitmap, MergeFn,
};
@@ -44,6 +45,7 @@ where
Some((head, tail))
}
#[allow(unused)]
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
}

View File

@@ -20,7 +20,10 @@ use slice_group_by::GroupBy;
use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
use self::enrich::enrich_documents_batch;
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
pub use self::enrich::{
extract_finite_float_from_value, validate_document_id, validate_document_id_value,
validate_geo_from_json, DocumentId,
};
pub use self::helpers::{
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
@@ -35,7 +38,7 @@ use crate::update::{
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
};
use crate::{Index, Result, RoaringBitmapCodec};
use crate::{CboRoaringBitmapCodec, Index, Result};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 5;
@@ -403,13 +406,23 @@ where
}
let typed_chunk = match result? {
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
exact_word_docids = Some(cloneable_chunk);
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPairProximityDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -421,11 +434,6 @@ where
word_position_docids = Some(cloneable_chunk);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::WordFidDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
word_fid_docids = Some(cloneable_chunk);
TypedChunk::WordFidDocids(chunk)
}
otherwise => otherwise,
};
@@ -467,13 +475,14 @@ where
let all_documents_ids = index_documents_ids | new_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
self.execute_prefix_databases(
word_docids,
exact_word_docids,
word_pair_proximity_docids,
word_position_docids,
word_fid_docids,
)?;
// TODO: reactivate prefix DB with diff-indexing
// self.execute_prefix_databases(
// word_docids,
// exact_word_docids,
// word_pair_proximity_docids,
// word_position_docids,
// word_fid_docids,
// )?;
Ok(all_documents_ids.len())
}
@@ -687,8 +696,8 @@ where
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn,
reader: grenad::Reader<Cursor<ClonableMmap>>,
word_docids_db: Database<Str, RoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],

View File

@@ -7,18 +7,20 @@ use std::io::{Read, Seek};
use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools;
use obkv::{KvReader, KvWriter};
use obkv::{KvReader, KvReaderU16, KvWriter};
use roaring::RoaringBitmap;
use serde_json::Value;
use smartstring::SmartString;
use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
obkvs_merge_additions_and_deletions, MergeFn,
};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::del_add::into_del_add_obkv;
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -106,8 +108,8 @@ impl<'a, 'i> Transform<'a, 'i> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
};
// We initialize the sorter with the user indexing settings.
@@ -223,19 +225,21 @@ impl<'a, 'i> Transform<'a, 'i> {
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document.
// It'll be deleted later.
if let Some(docid) = external_documents_ids.get(entry.key()) {
// If it was already in the list of replaced documents it means it was deleted
// by the remove_document method. We should starts as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
let docid = match external_documents_ids.get(entry.key()) {
Some(docid) => {
// If it was already in the list of replaced documents it means it was deleted
// by the remove_document method. We should starts as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
}
docid
}
}
let docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
None => self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?,
};
entry.insert(docid as u64);
docid
}
@@ -263,16 +267,28 @@ impl<'a, 'i> Transform<'a, 'i> {
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later.
let keep_original_version =
self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(base_obkv);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
true,
keep_original_version,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -288,7 +304,12 @@ impl<'a, 'i> Transform<'a, 'i> {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&obkv_buffer);
into_del_add_obkv(
KvReaderU16::new(&obkv_buffer),
false,
true,
&mut document_sorter_buffer,
)?;
// We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
@@ -296,7 +317,12 @@ impl<'a, 'i> Transform<'a, 'i> {
Some(flattened_obkv) => {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
false,
true,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
@@ -354,19 +380,25 @@ impl<'a, 'i> Transform<'a, 'i> {
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mut documents_deleted = 0;
let mut document_sorter_buffer = Vec::new();
for to_remove in to_remove {
if should_abort() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
// Check if the document has been added in the current indexing process.
let deleted_from_current = match self
.new_external_documents_ids_builder
.entry((*to_remove).into())
{
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
Entry::Occupied(entry) => {
let doc_id = *entry.get() as u32;
self.original_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
self.flattened_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
// we must NOT update the list of replaced_documents_ids
// Either:
@@ -375,21 +407,69 @@ impl<'a, 'i> Transform<'a, 'i> {
// we're removing it there is nothing to do.
self.new_documents_ids.remove(doc_id);
entry.remove_entry();
true
}
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a `to_delete` document.
// It'll be deleted later. We don't need to push anything to the sorters.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
} else {
// if the document is nowehere to be found, there is nothing to do and we must NOT
// increment the count of documents_deleted
continue;
}
}
Entry::Vacant(_) => false,
};
documents_deleted += 1;
// If the document was already in the db we mark it as a `to_delete` document.
// Then we push the document in sorters in deletion mode.
let deleted_from_db = match external_documents_ids.get(&to_remove) {
Some(docid) => {
self.replaced_documents_ids.insert(docid);
// fetch the obkv document
let original_key = BEU32::new(docid);
let base_obkv = self
.index
.documents
.remap_data_type::<heed::types::ByteSlice>()
.get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
// push it as to delete in the original_sorter
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(base_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
// flatten it and push it as to delete in the flattened_sorter
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Deletion as u8);
into_del_add_obkv(
KvReaderU16::new(&flattened_obkv),
true,
false,
&mut document_sorter_buffer,
)?;
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
true
}
None => false,
};
// increase counter only if the document existed somewhere before.
if deleted_from_current || deleted_from_db {
documents_deleted += 1;
}
}
Ok(documents_deleted)
@@ -589,9 +669,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
// skip first byte corresponding to the operation type (Deletion or Addition).
let val = &val[1..];
// send a callback to show at which step we are
@@ -631,9 +709,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// We get rids of the `Operation` byte and skip the deleted documents as well.
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
// skip first byte corresponding to the operation type (Deletion or Addition).
let val = &val[1..];
writer.insert(key, val)?;
}
@@ -659,10 +735,8 @@ impl<'a, 'i> Transform<'a, 'i> {
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
flattened_documents: flattened_documents
.into_inner()
.map_err(|err| err.into_error())?,
original_documents,
flattened_documents,
})
}
@@ -713,6 +787,7 @@ impl<'a, 'i> Transform<'a, 'i> {
);
let mut obkv_buffer = Vec::new();
let mut document_sorter_buffer = Vec::new();
for result in self.index.all_documents(wtxn)? {
let (docid, obkv) = result?;
@@ -727,7 +802,9 @@ impl<'a, 'i> Transform<'a, 'i> {
}
let buffer = obkv_writer.into_inner()?;
original_writer.insert(docid.to_be_bytes(), &buffer)?;
document_sorter_buffer.clear();
into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
// Once we have the document. We're going to flatten it
// and insert it in the flattened sorter.
@@ -762,7 +839,9 @@ impl<'a, 'i> Transform<'a, 'i> {
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
writer.insert(fid, &value)?;
}
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
document_sorter_buffer.clear();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
}
// Once we have written all the documents, we extract
@@ -781,10 +860,8 @@ impl<'a, 'i> Transform<'a, 'i> {
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
flattened_documents: flattened_documents
.into_inner()
.map_err(|err| err.into_error())?,
original_documents,
flattened_documents,
};
let new_facets = output.compute_real_facets(wtxn, self.index)?;
@@ -828,38 +905,86 @@ mod test {
#[test]
fn merge_obkvs() {
let mut doc_0 = Vec::new();
let mut kv_writer = KvWriter::new(&mut doc_0);
let mut additive_doc_0 = Vec::new();
let mut deletive_doc_0 = Vec::new();
let mut del_add_doc_0 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.finish().unwrap();
doc_0.insert(0, Operation::Addition as u8);
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
additive_doc_0.insert(0, Operation::Addition as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
deletive_doc_0.insert(0, Operation::Deletion as u8);
into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
del_add_doc_0.insert(0, Operation::Addition as u8);
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
assert_eq!(*ret, doc_0);
let mut additive_doc_1 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
additive_doc_1.insert(0, Operation::Addition as u8);
let ret = merge_obkvs_and_operations(
let mut additive_doc_0_1 = Vec::new();
let mut kv_writer = KvWriter::memory();
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.insert(1_u8, [1]).unwrap();
let buffer = kv_writer.into_inner().unwrap();
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
additive_doc_0_1.insert(0, Operation::Addition as u8);
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
.unwrap();
assert_eq!(*ret, additive_doc_0);
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, doc_0);
assert_eq!(*ret, del_add_doc_0);
let ret = merge_obkvs_and_operations(
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, [Operation::Deletion as u8]);
assert_eq!(*ret, deletive_doc_0);
let ret = merge_obkvs_and_operations(
let ret = obkvs_merge_additions_and_deletions(
&[],
&[
Cow::from([Operation::Addition as u8, 1].as_slice()),
Cow::from([Operation::Deletion as u8].as_slice()),
Cow::from(doc_0.as_slice()),
Cow::from(additive_doc_1.as_slice()),
Cow::from(deletive_doc_0.as_slice()),
Cow::from(additive_doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, doc_0);
assert_eq!(*ret, del_add_doc_0);
let ret = obkvs_merge_additions_and_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0_1);
let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, additive_doc_0);
let ret = obkvs_keep_last_addition_merge_deletions(
&[],
&[
Cow::from(deletive_doc_0.as_slice()),
Cow::from(additive_doc_1.as_slice()),
Cow::from(additive_doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, del_add_doc_0);
}
}

View File

@@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::{self, BufReader};
use std::io;
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
@@ -27,23 +27,23 @@ pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
Documents(grenad::Reader<CursorClonableMmap>),
FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
FieldIdWordcountDocids(grenad::Reader<File>),
NewDocumentsIds(RoaringBitmap),
WordDocids {
word_docids_reader: grenad::Reader<BufReader<File>>,
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
word_docids_reader: grenad::Reader<File>,
exact_word_docids_reader: grenad::Reader<File>,
word_fid_docids_reader: grenad::Reader<File>,
},
WordPositionDocids(grenad::Reader<BufReader<File>>),
WordFidDocids(grenad::Reader<BufReader<File>>),
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
GeoPoints(grenad::Reader<BufReader<File>>),
VectorPoints(grenad::Reader<BufReader<File>>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
WordPositionDocids(grenad::Reader<File>),
WordPairProximityDocids(grenad::Reader<File>),
FieldIdFacetStringDocids(grenad::Reader<File>),
FieldIdFacetNumberDocids(grenad::Reader<File>),
FieldIdFacetExistsDocids(grenad::Reader<File>),
FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>),
VectorPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
impl TypedChunk {
@@ -64,17 +64,19 @@ impl TypedChunk {
TypedChunk::NewDocumentsIds(grenad) => {
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => format!(
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
word_docids_reader.len(),
exact_word_docids_reader.len()
exact_word_docids_reader.len(),
word_fid_docids_reader.len()
),
TypedChunk::WordPositionDocids(grenad) => {
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordFidDocids(grenad) => {
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::WordPairProximityDocids(grenad) => {
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
}
@@ -99,8 +101,8 @@ impl TypedChunk {
TypedChunk::VectorPoints(grenad) => {
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
}
TypedChunk::ScriptLanguageDocids(grenad) => {
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
TypedChunk::ScriptLanguageDocids(sl_map) => {
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
}
}
}
@@ -138,7 +140,11 @@ pub(crate) fn write_typed_chunk_into_index(
TypedChunk::NewDocumentsIds(documents_ids) => {
return Ok((documents_ids, is_merged_database))
}
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
append_entries_into_database(
word_docids_iter.clone(),
@@ -146,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -156,7 +162,17 @@ pub(crate) fn write_typed_chunk_into_index(
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
merge_cbo_roaring_bitmaps,
)?;
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
append_entries_into_database(
word_fid_docids_iter,
&index.word_fid_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
// create fst from word docids
@@ -182,17 +198,6 @@ pub(crate) fn write_typed_chunk_into_index(
)?;
is_merged_database = true;
}
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
append_entries_into_database(
word_fid_docids_iter,
&index.word_fid_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_cbo_roaring_bitmaps,
)?;
is_merged_database = true;
}
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
indexer.execute(wtxn)?;
@@ -339,22 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
index.put_vector_hnsw(wtxn, &new_hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();
for (key, value) in hash_pair {
buffer.clear();
TypedChunk::ScriptLanguageDocids(sl_map) => {
for (key, (deletion, addition)) in sl_map {
let mut db_key_exists = false;
let final_value = match index.script_language_docids.get(wtxn, &key)? {
Some(db_values) => {
let mut db_value_buffer = Vec::new();
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
let mut new_value_buffer = Vec::new();
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
RoaringBitmap::deserialize_from(&buffer[..])?
db_key_exists = true;
(db_values - deletion) | addition
}
None => value,
None => addition,
};
index.script_language_docids.put(wtxn, &key, &final_value)?;
if final_value.is_empty() {
// If the database entry exists, delete it.
if db_key_exists == true {
index.script_language_docids.delete(wtxn, &key)?;
}
} else {
index.script_language_docids.put(wtxn, &key, &final_value)?;
}
}
}
}
@@ -379,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
Ok(builder.into_set())
}
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
let new_value = RoaringBitmap::deserialize_from(new_value)?;
let db_value = RoaringBitmap::deserialize_from(db_value)?;
let value = new_value | db_value;
Ok(serialize_roaring_bitmap(&value, buffer)?)
}
fn merge_cbo_roaring_bitmaps(
new_value: &[u8],
db_value: &[u8],
@@ -455,6 +456,7 @@ where
R: io::Read + io::Seek,
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
K: for<'a> heed::BytesDecode<'a>,
{
puffin::profile_function!(format!("number of entries: {}", data.len()));
@@ -475,6 +477,12 @@ where
let mut cursor = data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
if valid_lmdb_key(key) {
debug_assert!(
K::bytes_decode(&key).is_some(),
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
key.len(),
&key
);
buffer.clear();
let value = serialize_value(value, &mut buffer)?;
unsafe { database.append(key, value)? };

View File

@@ -21,6 +21,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
mod available_documents_ids;
mod clear_documents;
pub(crate) mod del_add;
mod delete_documents;
pub(crate) mod facet;
mod index_documents;

View File

@@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::io::{BufReader, BufWriter};
use std::io::BufReader;
use grenad::CompressionType;
use heed::types::ByteSlice;
@@ -119,9 +119,9 @@ pub fn insert_into_database(
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<BufWriter<std::fs::File>>,
writer: grenad::Writer<std::fs::File>,
) -> Result<()> {
let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
let file = writer.into_inner()?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;

View File

@@ -20,4 +20,7 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 at a [100, ]
3 rings a [101, ]
3 the a [101, ]
4 at b [100, ]
4 at be [100, ]
4 bell a [101, ]

View File

@@ -30,4 +30,10 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 bell 5 [101, ]
3 rings am [101, ]
3 the at [101, ]
4 an house [100, ]
4 at beautiful [100, ]
4 bell am [101, ]
4 the 5 [101, ]
5 at house [100, ]
5 the am [101, ]

View File

@@ -28,4 +28,8 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 rings a [101, ]
3 rings am [101, ]
3 the a [101, ]
4 at b [100, ]
4 at be [100, ]
4 bell a [101, ]
4 bell am [101, ]

View File

@@ -7,4 +7,5 @@ source: milli/src/update/prefix_word_pairs/mod.rs
2 bell a [51, ]
3 rings a [51, ]
3 the a [51, ]
4 bell a [51, ]

View File

@@ -12,4 +12,5 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 at a [50, ]
3 rings a [51, ]
3 the a [51, ]
4 bell a [51, ]

View File

@@ -7,4 +7,5 @@ source: milli/src/update/prefix_word_pairs/mod.rs
2 bell a [51, ]
3 rings a [51, ]
3 the a [51, ]
4 bell a [51, ]

View File

@@ -12,4 +12,5 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 at a [50, ]
3 rings a [51, ]
3 the a [51, ]
4 bell a [51, ]

View File

@@ -12,4 +12,5 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 at a [50, ]
3 rings a [51, ]
3 the a [51, ]
4 bell a [51, ]

View File

@@ -16,4 +16,6 @@ source: milli/src/update/prefix_word_pairs/mod.rs
3 at a [50, ]
3 rings a [51, ]
3 the a [51, ]
4 at b [50, ]
4 bell a [51, ]

Some files were not shown because too many files have changed in this diff Show More