Compare commits

..

18 Commits

Author SHA1 Message Date
dd01613a63 Remove the unused distance 2023-06-14 16:37:14 +02:00
70d975b399 Introduce a new error message for invalid vector dimensions 2023-06-14 16:36:58 +02:00
a8e6d946a7 Make clippy happy 2023-06-14 15:59:10 +02:00
7c1f72ae33 Fix the tests 2023-06-14 15:57:31 +02:00
442a8f44c6 Support more pages but in an ugly way 2023-06-14 15:53:39 +02:00
185a238c77 Change the name of the distance module 2023-06-14 15:53:39 +02:00
a82bf776f3 Implement an ugly deletion of values in the HNSW 2023-06-14 15:53:39 +02:00
b2f86df127 Replace the euclidean with a dot product 2023-06-14 15:53:39 +02:00
c3a5f51705 Use a basic euclidean distance function 2023-06-14 15:53:39 +02:00
686d1f4c12 Move back to the hnsw crate (reverts commit 7a4b6c065482f988b01298642f4c18775503f92f) 2023-06-14 15:53:39 +02:00
ba75606731 Log more to make sure we insert vectors in the hgg data-structure 2023-06-14 15:53:38 +02:00
baf3b036d9 Introduce an optimized version of the euclidean distance function 2023-06-14 15:53:38 +02:00
0d499f0055 Move to the hgg crate 2023-06-14 15:53:38 +02:00
7999c397c5 Expose a new vector field on the search route 2023-06-14 15:53:38 +02:00
c44db8b4bc Add a vector field to the search routes 2023-06-14 15:53:38 +02:00
9466949e34 Store the vectors in an HNSW in LMDB 2023-06-14 15:53:38 +02:00
f051bbfd84 Extract the vectors from the documents 2023-06-14 15:52:43 +02:00
72b1c3df08 Create a new _vector extractor 2023-06-14 15:52:43 +02:00
65 changed files with 914 additions and 1708 deletions
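
For orientation before the file-by-file diff: the commits above extract document vectors, store them in an HNSW kept in LMDB, and expose a new `vector` field next to `q` on the search routes (see the `SearchQuery` changes further down). As a rough sketch of what a search payload could then look like, assuming the JSON field names simply mirror the Rust struct (the HTTP route itself is not shown in this excerpt):

use serde_json::json;

fn main() {
    // Field names taken from the `SearchQuery` struct in this diff;
    // the concrete endpoint path is an assumption, not shown here.
    let body = json!({
        "q": "shoes",
        "vector": [0.1, 0.2, 0.3],
        "limit": 20
    });
    println!("{body}");
}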

View File

@ -1,41 +1,24 @@
#!/usr/bin/env bash
set -eu -o pipefail
#!/bin/bash
check_tag() {
local expected=$1
local actual=$2
local filename=$3
if [[ $actual != $expected ]]; then
echo >&2 "Error: the current tag does not match the version in $filename: found $actual, expected $expected"
return 1
fi
# check_tag $current_tag $file_tag $file_name
function check_tag {
if [[ "$1" != "$2" ]]; then
echo "Error: the current tag does not match the version in Cargo.toml: found $2 - expected $1"
ret=1
fi
}
read_version() {
grep '^version = ' | cut -d \" -f 2
}
if [[ -z "${GITHUB_REF:-}" ]]; then
echo >&2 "Error: GITHUB_REF is not set"
exit 1
fi
if [[ ! "$GITHUB_REF" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+(-[a-z0-9]+)?$ ]]; then
echo >&2 "Error: GITHUB_REF is not a valid tag: $GITHUB_REF"
exit 1
fi
current_tag=${GITHUB_REF#refs/tags/v}
ret=0
current_tag=${GITHUB_REF#'refs/tags/v'}
toml_tag="$(cat Cargo.toml | read_version)"
check_tag "$current_tag" "$toml_tag" Cargo.toml || ret=1
file_tag="$(grep '^version = ' Cargo.toml | cut -d '=' -f 2 | tr -d '"' | tr -d ' ')"
check_tag $current_tag $file_tag
lock_tag=$(grep -A 1 '^name = "meilisearch-auth"' Cargo.lock | read_version)
check_tag "$current_tag" "$lock_tag" Cargo.lock || ret=1
lock_file='Cargo.lock'
lock_tag=$(grep -A 1 'name = "meilisearch-auth"' $lock_file | grep version | cut -d '=' -f 2 | tr -d '"' | tr -d ' ')
check_tag $current_tag $lock_tag $lock_file
if (( ret == 0 )); then
echo 'OK'
if [[ "$ret" -eq 0 ]] ; then
echo 'OK'
fi
exit $ret

View File

@ -1,24 +0,0 @@
name: Run the indexing fuzzer
on:
push:
branches:
- main
jobs:
fuzz:
name: Setup the action
runs-on: ubuntu-latest
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
# Run benchmarks
- name: Run the fuzzer
run: |
cargo run --release --bin fuzz-indexing

View File

@ -25,7 +25,7 @@ jobs:
- name: Define the Docker image we need to use
id: define-image
run: |
event=${{ github.event_name }}
event=${{ github.event.action }}
echo "docker-image=nightly" >> $GITHUB_OUTPUT
if [[ $event == 'workflow_dispatch' ]]; then
echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
@ -37,7 +37,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -72,7 +72,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -99,7 +99,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -130,7 +130,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -155,7 +155,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -185,7 +185,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@ -210,7 +210,7 @@ jobs:
runs-on: ubuntu-latest
services:
meilisearch:
image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
env:
MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}

Cargo.lock (generated, 818 changed lines)

File diff suppressed because it is too large.

View File

@ -10,12 +10,10 @@ members = [
"file-store",
"permissive-json-pointer",
"milli",
"index-stats",
"filter-parser",
"flatten-serde-json",
"json-depth-checker",
"benchmarks",
"fuzzers",
"benchmarks"
]
[workspace.package]

View File

@ -1,20 +0,0 @@
[package]
name = "fuzzers"
publish = false
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
edition.workspace = true
license.workspace = true
[dependencies]
arbitrary = { version = "1.3.0", features = ["derive"] }
clap = { version = "4.3.0", features = ["derive"] }
fastrand = "1.9.0"
milli = { path = "../milli" }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
tempfile = "3.5.0"

View File

@ -1,3 +0,0 @@
# Fuzzers
The purpose of this crate is to contain all the handmade "fuzzers" we may need.

View File

@ -1,152 +0,0 @@
use std::num::NonZeroUsize;
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::time::Duration;
use arbitrary::{Arbitrary, Unstructured};
use clap::Parser;
use fuzzers::Operation;
use milli::heed::EnvOpenOptions;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
use milli::Index;
use tempfile::TempDir;
#[derive(Debug, Arbitrary)]
struct Batch([Operation; 5]);
#[derive(Debug, Clone, Parser)]
struct Opt {
/// The number of fuzzers to run in parallel.
#[clap(long)]
par: Option<NonZeroUsize>,
// We need to put a lot of newlines in the following documentation or else everything gets collapsed on one line
/// The path in which the databases will be created.
/// Using a ramdisk is recommended.
///
/// Linux:
///
/// sudo mount -t tmpfs -o size=2g tmpfs ramdisk # to create it
///
/// sudo umount ramdisk # to remove it
///
/// MacOS:
///
/// diskutil erasevolume HFS+ 'RAM Disk' `hdiutil attach -nobrowse -nomount ram://4194304` # create it
///
/// hdiutil detach /dev/:the_disk
#[clap(long)]
path: Option<PathBuf>,
}
fn main() {
let opt = Opt::parse();
let progression: &'static AtomicUsize = Box::leak(Box::new(AtomicUsize::new(0)));
let stop: &'static AtomicBool = Box::leak(Box::new(AtomicBool::new(false)));
let par = opt.par.unwrap_or_else(|| std::thread::available_parallelism().unwrap()).get();
let mut handles = Vec::with_capacity(par);
for _ in 0..par {
let opt = opt.clone();
let handle = std::thread::spawn(move || {
let mut options = EnvOpenOptions::new();
options.map_size(1024 * 1024 * 1024 * 1024);
let tempdir = match opt.path {
Some(path) => TempDir::new_in(path).unwrap(),
None => TempDir::new().unwrap(),
};
let index = Index::new(options, tempdir.path()).unwrap();
let indexer_config = IndexerConfig::default();
let index_documents_config = IndexDocumentsConfig::default();
std::thread::scope(|s| {
loop {
if stop.load(Ordering::Relaxed) {
return;
}
let v: Vec<u8> =
std::iter::repeat_with(|| fastrand::u8(..)).take(1000).collect();
let mut data = Unstructured::new(&v);
let batches = <[Batch; 5]>::arbitrary(&mut data).unwrap();
// will be used to display the error once a thread crashes
let dbg_input = format!("{:#?}", batches);
let handle = s.spawn(|| {
let mut wtxn = index.write_txn().unwrap();
for batch in batches {
let mut builder = IndexDocuments::new(
&mut wtxn,
&index,
&indexer_config,
index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
for op in batch.0 {
match op {
Operation::AddDoc(doc) => {
let documents =
milli::documents::objects_from_json_value(doc.to_d());
let documents =
milli::documents::documents_batch_reader_from_objects(
documents,
);
let (b, _added) = builder.add_documents(documents).unwrap();
builder = b;
}
Operation::DeleteDoc(id) => {
let (b, _removed) =
builder.remove_documents(vec![id.to_s()]).unwrap();
builder = b;
}
}
}
builder.execute().unwrap();
// after executing a batch we check if the database is corrupted
let res = index.search(&wtxn).execute().unwrap();
index.documents(&wtxn, res.documents_ids).unwrap();
progression.fetch_add(1, Ordering::Relaxed);
}
wtxn.abort().unwrap();
});
if let err @ Err(_) = handle.join() {
stop.store(true, Ordering::Relaxed);
err.expect(&dbg_input);
}
}
});
});
handles.push(handle);
}
std::thread::spawn(|| {
let mut last_value = 0;
let start = std::time::Instant::now();
loop {
let total = progression.load(Ordering::Relaxed);
let elapsed = start.elapsed().as_secs();
if elapsed > 3600 {
// after 1 hour, stop the fuzzer, success
std::process::exit(0);
}
println!(
"Has been running for {:?} seconds. Tested {} new values for a total of {}.",
elapsed,
total - last_value,
total
);
last_value = total;
std::thread::sleep(Duration::from_secs(1));
}
});
for handle in handles {
handle.join().unwrap();
}
}
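
The heart of this fuzzer is how it turns a buffer of random bytes into typed index operations through the `arbitrary` crate. A stripped-down sketch of just that decoding step, with hypothetical `Op` and `Batch` types standing in for the real `Operation`:

use arbitrary::{Arbitrary, Unstructured};

#[derive(Debug, Arbitrary)]
enum Op {
    Add(u8),
    Delete(u8),
}

#[derive(Debug, Arbitrary)]
struct Batch([Op; 5]);

fn main() {
    // Any buffer of random bytes deterministically maps to a typed batch,
    // which is what lets the crash path above print a readable `dbg_input`.
    let bytes: Vec<u8> = std::iter::repeat_with(|| fastrand::u8(..)).take(64).collect();
    let mut data = Unstructured::new(&bytes);
    let batch = Batch::arbitrary(&mut data).unwrap();
    println!("{batch:?}");
}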

View File

@ -1,46 +0,0 @@
use arbitrary::Arbitrary;
use serde_json::{json, Value};
#[derive(Debug, Arbitrary)]
pub enum Document {
One,
Two,
Three,
Four,
Five,
Six,
}
impl Document {
pub fn to_d(&self) -> Value {
match self {
Document::One => json!({ "id": 0, "doggo": "bernese" }),
Document::Two => json!({ "id": 0, "doggo": "golden" }),
Document::Three => json!({ "id": 0, "catto": "jorts" }),
Document::Four => json!({ "id": 1, "doggo": "bernese" }),
Document::Five => json!({ "id": 1, "doggo": "golden" }),
Document::Six => json!({ "id": 1, "catto": "jorts" }),
}
}
}
#[derive(Debug, Arbitrary)]
pub enum DocId {
Zero,
One,
}
impl DocId {
pub fn to_s(&self) -> String {
match self {
DocId::Zero => "0".to_string(),
DocId::One => "1".to_string(),
}
}
}
#[derive(Debug, Arbitrary)]
pub enum Operation {
AddDoc(Document),
DeleteDoc(DocId),
}

View File

@ -160,7 +160,7 @@ impl BatchKind {
impl BatchKind {
/// Returns a `ControlFlow::Break` if you must stop right now.
/// The boolean tell you if an index has been created by the batched task.
/// To ease the writing of the code. `true` can be returned when you don't need to create an index
/// To ease the writting of the code. `true` can be returned when you don't need to create an index
/// but false can't be returned if you needs to create an index.
// TODO use an AutoBatchKind as input
pub fn new(
@ -214,7 +214,7 @@ impl BatchKind {
/// Returns a `ControlFlow::Break` if you must stop right now.
/// The boolean tell you if an index has been created by the batched task.
/// To ease the writing of the code. `true` can be returned when you don't need to create an index
/// To ease the writting of the code. `true` can be returned when you don't need to create an index
/// but false can't be returned if you needs to create an index.
#[rustfmt::skip]
fn accumulate(self, id: TaskId, kind: AutobatchKind, index_already_exists: bool, primary_key: Option<&str>) -> ControlFlow<BatchKind, BatchKind> {
@ -321,18 +321,9 @@ impl BatchKind {
})
}
(
BatchKind::DocumentOperation { method, allow_index_creation, primary_key, mut operation_ids },
this @ BatchKind::DocumentOperation { .. },
K::DocumentDeletion,
) => {
operation_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids,
})
}
) => Break(this),
// but we can't autobatch documents if it's not the same kind
// this match branch MUST be AFTER the previous one
(
@ -355,35 +346,7 @@ impl BatchKind {
deletion_ids.push(id);
Continue(BatchKind::DocumentClear { ids: deletion_ids })
}
// we can autobatch the deletion and import if the index already exists
(
BatchKind::DocumentDeletion { mut deletion_ids },
K::DocumentImport { method, allow_index_creation, primary_key }
) if index_already_exists => {
deletion_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids: deletion_ids,
})
}
// we can autobatch the deletion and import if both can't create an index
(
BatchKind::DocumentDeletion { mut deletion_ids },
K::DocumentImport { method, allow_index_creation, primary_key }
) if !allow_index_creation => {
deletion_ids.push(id);
Continue(BatchKind::DocumentOperation {
method,
allow_index_creation,
primary_key,
operation_ids: deletion_ids,
})
}
// we can't autobatch a deletion and an import if the index does not exists but would be created by an addition
// we can't autobatch a deletion and an import
(
this @ BatchKind::DocumentDeletion { .. },
K::DocumentImport { .. }
@ -685,36 +648,36 @@ mod tests {
debug_snapshot!(autobatch_from(false,None, [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false,None, [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))");
// We can autobatch document addition with document deletion
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
// And the other way around
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
// We can't autobatch document addition with document deletion
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
// we also can't do the only way around
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
}
#[test]
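
The two sides of the hunk above disagree on whether a document deletion can be folded into a pending document-operation batch: one side pushes the task id and returns `Continue`, the other returns `Break` and lets the deletion start its own batch. A self-contained sketch of that `ControlFlow` pattern, with hypothetical types in place of the real `BatchKind`:

use std::ops::ControlFlow;

#[derive(Debug)]
#[allow(dead_code)]
enum Batch {
    DocumentOperation { operation_ids: Vec<u32> },
    DocumentDeletion { deletion_ids: Vec<u32> },
}

// Either absorb the incoming task into the current batch (Continue) or
// close the current batch as-is and let the task open the next one (Break).
fn accumulate(batch: Batch, id: u32, is_deletion: bool) -> ControlFlow<Batch, Batch> {
    match (batch, is_deletion) {
        (Batch::DocumentOperation { mut operation_ids }, true) => {
            operation_ids.push(id);
            ControlFlow::Continue(Batch::DocumentOperation { operation_ids })
        }
        (this, _) => ControlFlow::Break(this),
    }
}

fn main() {
    let step = accumulate(Batch::DocumentOperation { operation_ids: vec![0] }, 1, true);
    println!("{step:?}");
}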

View File

@ -998,7 +998,7 @@ impl IndexScheduler {
}()
.unwrap_or_default();
// The write transaction is directly owned and committed inside.
// The write transaction is directly owned and commited inside.
match self.index_mapper.delete_index(wtxn, &index_uid) {
Ok(()) => (),
Err(Error::IndexNotFound(_)) if index_has_been_created => (),

View File

@ -1785,7 +1785,7 @@ mod tests {
assert_eq!(task.kind.as_kind(), k);
}
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_successfully_registered");
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_succesfully_registered");
}
#[test]
@ -2075,105 +2075,6 @@ mod tests {
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded");
}
#[test]
fn document_addition_and_document_deletion() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
let content = r#"[
{ "id": 1, "doggo": "jean bob" },
{ "id": 2, "catto": "jorts" },
{ "id": 3, "doggo": "bork" }
]"#;
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
file.persist().unwrap();
index_scheduler
.register(KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: Some(S("id")),
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
index_scheduler
.register(KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1"), S("2")],
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_batch");
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
#[test]
fn document_deletion_and_document_addition() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
index_scheduler
.register(KindWithContent::DocumentDeletion {
index_uid: S("doggos"),
documents_ids: vec![S("1"), S("2")],
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
let content = r#"[
{ "id": 1, "doggo": "jean bob" },
{ "id": 2, "catto": "jorts" },
{ "id": 3, "doggo": "bork" }
]"#;
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
file.persist().unwrap();
index_scheduler
.register(KindWithContent::DocumentAdditionOrUpdate {
index_uid: S("doggos"),
primary_key: Some(S("id")),
method: ReplaceDocuments,
content_file: uuid,
documents_count,
allow_index_creation: true,
})
.unwrap();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
// The deletion should have failed because it can't create an index
handle.advance_one_failed_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion");
// The addition should work
handle.advance_one_successful_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition");
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let documents = index
.all_documents(&rtxn)
.unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
.collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
}
#[test]
fn do_not_batch_task_of_different_indexes() {
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

View File

@ -1,43 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: succeeded, details: { received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -1,9 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -1,37 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -1,40 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [0,]
"documentDeletion" [1,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -1,43 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -1,46 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued []
succeeded [1,]
failed [0,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} }
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Finished At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -1,17 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
[
{
"id": 1,
"doggo": "jean bob"
},
{
"id": 2,
"catto": "jorts"
},
{
"id": 3,
"doggo": "bork"
}
]

View File

@ -1,36 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
----------------------------------------------------------------------
### Status:
enqueued [0,]
----------------------------------------------------------------------
### Kind:
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
----------------------------------------------------------------------

View File

@ -1,40 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
### Autobatching Enabled = true
### Processing Tasks:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
----------------------------------------------------------------------
### Status:
enqueued [0,1,]
----------------------------------------------------------------------
### Kind:
"documentAdditionOrUpdate" [1,]
"documentDeletion" [0,]
----------------------------------------------------------------------
### Index Tasks:
doggos [0,1,]
----------------------------------------------------------------------
### Index Mapper:
----------------------------------------------------------------------
### Canceled By:
----------------------------------------------------------------------
### Enqueued At:
[timestamp] [0,]
[timestamp] [1,]
----------------------------------------------------------------------
### Started At:
----------------------------------------------------------------------
### Finished At:
----------------------------------------------------------------------
### File Store:
00000000-0000-0000-0000-000000000000
----------------------------------------------------------------------

View File

@ -1,12 +0,0 @@
[package]
name = "index-stats"
description = "A small program that computes internal stats of a Meilisearch index"
version = "0.1.0"
edition = "2021"
publish = false
[dependencies]
anyhow = "1.0.71"
clap = { version = "4.3.5", features = ["derive"] }
milli = { path = "../milli" }
piechart = "1.0.0"

View File

@ -1,224 +0,0 @@
use std::cmp::Reverse;
use std::path::PathBuf;
use clap::Parser;
use milli::heed::{types::ByteSlice, EnvOpenOptions, PolyDatabase, RoTxn};
use milli::index::db_name::*;
use milli::index::Index;
use piechart::{Chart, Color, Data};
/// A small program that computes internal stats of a Meilisearch index
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// The path to the LMDB Meilisearch index database.
path: PathBuf,
/// The radius of the graphs
#[clap(long, default_value_t = 10)]
graph_radius: u16,
/// The aspect ratio of the graphs
#[clap(long, default_value_t = 6)]
graph_aspect_ratio: u16,
}
fn main() -> anyhow::Result<()> {
let Args { path, graph_radius, graph_aspect_ratio } = Args::parse();
let env = EnvOpenOptions::new().max_dbs(24).open(path)?;
// TODO not sure to keep that...
// if removed put the pub(crate) back in the Index struct
matches!(
Option::<Index>::None,
Some(Index {
env: _,
main: _,
word_docids: _,
exact_word_docids: _,
word_prefix_docids: _,
exact_word_prefix_docids: _,
word_pair_proximity_docids: _,
word_prefix_pair_proximity_docids: _,
prefix_word_pair_proximity_docids: _,
word_position_docids: _,
word_fid_docids: _,
field_id_word_count_docids: _,
word_prefix_position_docids: _,
word_prefix_fid_docids: _,
script_language_docids: _,
facet_id_exists_docids: _,
facet_id_is_null_docids: _,
facet_id_is_empty_docids: _,
facet_id_f64_docids: _,
facet_id_string_docids: _,
field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _,
documents: _,
})
);
let mut wtxn = env.write_txn()?;
let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
let word_docids = env.create_poly_database(&mut wtxn, Some(WORD_DOCIDS))?;
let exact_word_docids = env.create_poly_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
let word_prefix_docids = env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
let exact_word_prefix_docids =
env.create_poly_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
let word_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
let script_language_docids =
env.create_poly_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
let word_prefix_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
let prefix_word_pair_proximity_docids =
env.create_poly_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
let word_position_docids = env.create_poly_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
let word_fid_docids = env.create_poly_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
let field_id_word_count_docids =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
let word_prefix_position_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_POSITION_DOCIDS))?;
let word_prefix_fid_docids =
env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
let facet_id_f64_docids = env.create_poly_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_exists_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids =
env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let documents = env.create_poly_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
let list = [
(main, MAIN),
(word_docids, WORD_DOCIDS),
(exact_word_docids, EXACT_WORD_DOCIDS),
(word_prefix_docids, WORD_PREFIX_DOCIDS),
(exact_word_prefix_docids, EXACT_WORD_PREFIX_DOCIDS),
(word_pair_proximity_docids, WORD_PAIR_PROXIMITY_DOCIDS),
(script_language_docids, SCRIPT_LANGUAGE_DOCIDS),
(word_prefix_pair_proximity_docids, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS),
(prefix_word_pair_proximity_docids, PREFIX_WORD_PAIR_PROXIMITY_DOCIDS),
(word_position_docids, WORD_POSITION_DOCIDS),
(word_fid_docids, WORD_FIELD_ID_DOCIDS),
(field_id_word_count_docids, FIELD_ID_WORD_COUNT_DOCIDS),
(word_prefix_position_docids, WORD_PREFIX_POSITION_DOCIDS),
(word_prefix_fid_docids, WORD_PREFIX_FIELD_ID_DOCIDS),
(facet_id_f64_docids, FACET_ID_F64_DOCIDS),
(facet_id_string_docids, FACET_ID_STRING_DOCIDS),
(facet_id_exists_docids, FACET_ID_EXISTS_DOCIDS),
(facet_id_is_null_docids, FACET_ID_IS_NULL_DOCIDS),
(facet_id_is_empty_docids, FACET_ID_IS_EMPTY_DOCIDS),
(field_id_docid_facet_f64s, FIELD_ID_DOCID_FACET_F64S),
(field_id_docid_facet_strings, FIELD_ID_DOCID_FACET_STRINGS),
(documents, DOCUMENTS),
];
let rtxn = env.read_txn()?;
let result: Result<Vec<_>, _> =
list.into_iter().map(|(db, name)| compute_stats(&rtxn, db).map(|s| (s, name))).collect();
let mut stats = result?;
println!("{:1$} Number of Entries", "", graph_radius as usize * 2);
stats.sort_by_key(|(s, _)| Reverse(s.number_of_entries));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.number_of_entries as f32, *n)));
Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
display_legend(&data);
print!("\r\n");
println!("{:1$} Size of Entries", "", graph_radius as usize * 2);
stats.sort_by_key(|(s, _)| Reverse(s.size_of_entries));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_entries as f32, *n)));
Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
display_legend(&data);
print!("\r\n");
println!("{:1$} Size of Data", "", graph_radius as usize * 2);
stats.sort_by_key(|(s, _)| Reverse(s.size_of_data));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_data as f32, *n)));
Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
display_legend(&data);
print!("\r\n");
println!("{:1$} Size of Keys", "", graph_radius as usize * 2);
stats.sort_by_key(|(s, _)| Reverse(s.size_of_keys));
let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_keys as f32, *n)));
Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
display_legend(&data);
Ok(())
}
fn display_legend(data: &[Data]) {
let total: f32 = data.iter().map(|d| d.value).sum();
for Data { label, value, color, fill } in data {
println!(
"{} {} {:.02}%",
color.unwrap().paint(fill.to_string()),
label,
value / total * 100.0
);
}
}
fn compute_graph_data<'a>(stats: impl IntoIterator<Item = (f32, &'a str)>) -> Vec<Data> {
let mut colors = [
Color::Red,
Color::Green,
Color::Yellow,
Color::Blue,
Color::Purple,
Color::Cyan,
Color::White,
]
.into_iter()
.cycle();
let mut characters = ['▴', '▵', '▾', '▿', '▪', '▫', '•', '◦'].into_iter().cycle();
stats
.into_iter()
.map(|(value, name)| Data {
label: (*name).into(),
value,
color: Some(colors.next().unwrap().into()),
fill: characters.next().unwrap(),
})
.collect()
}
#[derive(Debug)]
pub struct Stats {
pub number_of_entries: u64,
pub size_of_keys: u64,
pub size_of_data: u64,
pub size_of_entries: u64,
}
fn compute_stats(rtxn: &RoTxn, db: PolyDatabase) -> anyhow::Result<Stats> {
let mut number_of_entries = 0;
let mut size_of_keys = 0;
let mut size_of_data = 0;
for result in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
let (key, data) = result?;
number_of_entries += 1;
size_of_keys += key.len() as u64;
size_of_data += data.len() as u64;
}
Ok(Stats {
number_of_entries,
size_of_keys,
size_of_data,
size_of_entries: size_of_keys + size_of_data,
})
}
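
One detail worth flagging in this file: the `matches!` on a fully spelled-out `Index` pattern near the top is an exhaustiveness guard, so the tool stops compiling whenever a new database is added to `Index` and the list below goes stale (that is the reminder the TODO refers to). A self-contained sketch of the same trick with a hypothetical two-field struct:

// Hypothetical stand-in for milli's `Index`.
#[allow(dead_code)]
struct Index2 {
    main: (),
    documents: (),
}

// The pattern names every field and uses no `..`, so adding a field to
// `Index2` turns this into a compile error, pointing at the code that
// needs to be kept in sync.
fn check_all_databases_are_listed(index: &Index2) {
    let Index2 { main: _, documents: _ } = index;
}

fn main() {
    check_all_databases_are_listed(&Index2 { main: (), documents: () });
}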

View File

@ -217,6 +217,7 @@ InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ;
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
@ -332,6 +333,7 @@ impl ErrorCode for milli::Error {
UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::SortError(_) => Code::InvalidSearchSort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {
Code::InvalidSettingsTypoTolerance

View File

@ -34,6 +34,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub struct SearchQueryGet {
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
q: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchQ>)]
vector: Option<Vec<f32>>,
#[deserr(default = Param(DEFAULT_SEARCH_OFFSET()), error = DeserrQueryParamError<InvalidSearchOffset>)]
offset: Param<usize>,
#[deserr(default = Param(DEFAULT_SEARCH_LIMIT()), error = DeserrQueryParamError<InvalidSearchLimit>)]
@ -80,6 +82,7 @@ impl From<SearchQueryGet> for SearchQuery {
Self {
q: other.q,
vector: other.vector,
offset: other.offset.0,
limit: other.limit.0,
page: other.page.as_deref().copied(),

View File

@ -31,11 +31,13 @@ pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
#[derive(Debug, Clone, Default, PartialEq, Eq, Deserr)]
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@ -80,13 +82,15 @@ impl SearchQuery {
// This struct contains the fields of `SearchQuery` inline.
// This is because neither deserr nor serde support `flatten` when using `deny_unknown_fields`.
// The `From<SearchQueryWithIndex>` implementation ensures both structs remain up to date.
#[derive(Debug, Clone, PartialEq, Eq, Deserr)]
#[derive(Debug, Clone, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQueryWithIndex {
#[deserr(error = DeserrJsonError<InvalidIndexUid>, missing_field_error = DeserrJsonError::missing_index_uid)]
pub index_uid: IndexUid,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub q: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
pub vector: Option<Vec<f32>>,
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
pub offset: usize,
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
@ -126,6 +130,7 @@ impl SearchQueryWithIndex {
let SearchQueryWithIndex {
index_uid,
q,
vector,
offset,
limit,
page,
@ -147,6 +152,7 @@ impl SearchQueryWithIndex {
index_uid,
SearchQuery {
q,
vector,
offset,
limit,
page,
@ -270,6 +276,10 @@ pub fn perform_search(
let mut search = index.search(&rtxn);
if let Some(ref vector) = query.vector {
search.vector(vector.clone());
}
if let Some(ref query) = query.q {
search.query(query);
}
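
An easy-to-miss change in this file: the derive lists of `SearchQuery` and `SearchQueryWithIndex` differ by `Eq` exactly where the `vector: Option<Vec<f32>>` field appears. That is forced by the type itself, since `f32` is `PartialEq` but not `Eq` (NaN is never equal to itself). A minimal illustration with a hypothetical struct:

// Re-adding `Eq` to the derive list below makes this fail to compile,
// because `f32` (and therefore `Vec<f32>`) does not implement `Eq`.
#[derive(Debug, Clone, Default, PartialEq)]
struct QuerySketch {
    q: Option<String>,
    vector: Option<Vec<f32>>,
}

fn main() {
    let query = QuerySketch { q: Some("shoes".into()), vector: Some(vec![0.1, 0.2]) };
    assert_eq!(query, query.clone());
    println!("{query:?}");
}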

View File

@ -15,6 +15,7 @@ license.workspace = true
bimap = { version = "0.6.3", features = ["serde"] }
bincode = "1.3.3"
bstr = "1.4.0"
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
byteorder = "1.4.3"
charabia = { version = "0.7.2", default-features = false }
concat-arrays = "0.1.2"
@ -32,18 +33,21 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
"lmdb",
"sync-read-txn",
] }
hnsw = { version = "0.11.0", features = ["serde1"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10"
obkv = "0.2.0"
once_cell = "1.17.1"
ordered-float = "3.6.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.7.0"
roaring = "0.10.1"
rstar = { version = "0.10.0", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
slice-group-by = "0.3.0"
space = "0.17.0"
smallstr = { version = "0.3.0", features = ["serde"] }
smallvec = "1.10.0"
smartstring = "1.0.1"
@ -75,6 +79,9 @@ maplit = "1.0.2"
md5 = "0.7.0"
rand = { version = "0.8.5", features = ["small_rng"] }
[target.'cfg(fuzzing)'.dev-dependencies]
fuzzcheck = "0.12.1"
[features]
all-tokenizations = ["charabia/default"]

View File

@ -52,6 +52,7 @@ fn main() -> Result<(), Box<dyn Error>> {
let docs = execute_search(
&mut ctx,
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
&None,
TermsMatchingStrategy::Last,
false,
&None,

34
milli/src/distance.rs Normal file
View File

@ -0,0 +1,34 @@
use serde::{Deserialize, Serialize};
use space::Metric;
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct DotProduct;
impl Metric<Vec<f32>> for DotProduct {
type Unit = u32;
// TODO: explain this function; I don't understand why f32::to_bits is ordered.
// I tried to do this and it wasn't OK <https://stackoverflow.com/a/43305015/1941280>
//
// Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
let dist = 1.0 - dist;
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}
#[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
pub struct Euclidean;
impl Metric<Vec<f32>> for Euclidean {
type Unit = u32;
fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
let dist = squared.sqrt();
debug_assert!(!dist.is_nan());
dist.to_bits()
}
}
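A note on the TODO above: for non-negative IEEE-754 values, the raw bit pattern returned by `f32::to_bits` grows monotonically with the float, so comparing the `u32` bits compares the distances themselves. The trick only holds while the distance stays at or above zero, which for `DotProduct` presumably assumes normalized vectors so that `1.0 - dot` never goes negative. A minimal self-contained check (not part of the diff):

let distances = [0.0_f32, 0.25, 0.5, 1.0, 1.999];
let bits: Vec<u32> = distances.iter().map(|d| d.to_bits()).collect();
// strictly increasing, in the same order as the floats themselves
assert!(bits.windows(2).all(|w| w[0] < w[1]));
// but the ordering breaks as soon as a value goes negative
assert!((-0.5_f32).to_bits() > 1.0_f32.to_bits());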

View File

@ -111,6 +111,7 @@ pub enum Error {
Io(#[from] io::Error),
}
#[cfg(test)]
pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
let documents = match json {
object @ serde_json::Value::Object(_) => vec![object],
@ -140,6 +141,7 @@ macro_rules! documents {
}};
}
#[cfg(test)]
pub fn documents_batch_reader_from_objects(
objects: impl IntoIterator<Item = Object>,
) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {

View File

@ -110,9 +110,11 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
},
#[error(transparent)]
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: `expected {}, found: {1}`.", .0.join(", "))]
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
InvalidFilterExpression(&'static [&'static str], Value),
#[error("Attribute `{}` is not sortable. {}",
.field,
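A hedged example of how the new variant renders, assuming the usual thiserror-generated `Display` impl on `UserError` and made-up dimension values:

let err = UserError::InvalidVectorDimensions { expected: 768, found: 384 };
assert_eq!(
    err.to_string(),
    "Invalid vector dimensions: expected: `768`, found: `384`."
);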

View File

@ -106,30 +106,22 @@ impl<'a> ExternalDocumentsIds<'a> {
map
}
/// Return an fst of the combined hard and soft deleted ID.
pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
if self.soft.is_empty() {
return Ok(Cow::Borrowed(&self.hard));
}
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
}
fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
if self.soft.len() >= self.hard.len() / 2 {
self.hard = self.to_fst()?.into_owned();
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
let mut new_hard_builder = fst::MapBuilder::memory();
while let Some((external_id, marked_docids)) = iter.next() {
let value = indexed_last_value(marked_docids).unwrap();
if value != DELETED_ID {
new_hard_builder.insert(external_id, value)?;
}
}
drop(iter);
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
self.soft = fst::Map::default().map_data(Cow::Owned)?;
}

View File

@ -8,10 +8,12 @@ use charabia::{Language, Script};
use heed::flags::Flags;
use heed::types::*;
use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
use rand_pcg::Pcg32;
use roaring::RoaringBitmap;
use rstar::RTree;
use time::OffsetDateTime;
use crate::distance::DotProduct;
use crate::error::{InternalError, UserError};
use crate::facet::FacetType;
use crate::fields_ids_map::FieldsIdsMap;
@ -26,6 +28,9 @@ use crate::{
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
};
/// The HNSW data-structure that we serialize, fill and search in.
pub type Hnsw = hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>;
pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9;
@ -42,6 +47,7 @@ pub mod main_key {
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
pub const GEO_RTREE_KEY: &str = "geo-rtree";
pub const VECTOR_HNSW_KEY: &str = "vector-hnsw";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
@ -86,6 +92,7 @@ pub mod db_name {
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
pub const DOCUMENTS: &str = "documents";
pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids";
}
@ -93,10 +100,10 @@ pub mod db_name {
#[derive(Clone)]
pub struct Index {
/// The LMDB environment which this index is associated with.
pub env: heed::Env,
pub(crate) env: heed::Env,
/// Contains many different types (e.g. the fields ids map).
pub main: PolyDatabase,
pub(crate) main: PolyDatabase,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, RoaringBitmapCodec>,
@ -149,8 +156,11 @@ pub struct Index {
/// Maps the document id, the facet field id and the strings.
pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
/// Maps a vector id to the document id that have it.
pub vector_id_docid: Database<OwnedType<BEU32>, OwnedType<BEU32>>,
/// Maps the document id to the document as an obkv store.
pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
}
impl Index {
@ -162,7 +172,7 @@ impl Index {
) -> Result<Index> {
use db_name::*;
options.max_dbs(23);
options.max_dbs(24);
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
let env = options.open(path)?;
@ -198,11 +208,11 @@ impl Index {
env.create_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
let facet_id_is_empty_docids =
env.create_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
let field_id_docid_facet_f64s =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
let field_id_docid_facet_strings =
env.create_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
let vector_id_docid = env.create_database(&mut wtxn, Some(VECTOR_ID_DOCID))?;
let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?;
wtxn.commit()?;
@ -231,6 +241,7 @@ impl Index {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
})
}
@ -502,6 +513,26 @@ impl Index {
}
}
/* vector HNSW */
/// Writes the provided `hnsw`.
pub(crate) fn put_vector_hnsw(&self, wtxn: &mut RwTxn, hnsw: &Hnsw) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<Hnsw>>(wtxn, main_key::VECTOR_HNSW_KEY, hnsw)
}
/// Delete the `hnsw`.
pub(crate) fn delete_vector_hnsw(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.delete::<_, Str>(wtxn, main_key::VECTOR_HNSW_KEY)
}
/// Returns the `hnsw`.
pub fn vector_hnsw(&self, rtxn: &RoTxn) -> Result<Option<Hnsw>> {
match self.main.get::<_, Str, SerdeBincode<Hnsw>>(rtxn, main_key::VECTOR_HNSW_KEY)? {
Some(hnsw) => Ok(Some(hnsw)),
None => Ok(None),
}
}
/* field distribution */
/// Writes the field distribution which associates every field name with
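A hedged sketch of the round trip through these crate-internal helpers, assuming an open `Index`; `Hnsw` is the `hnsw::Hnsw<DotProduct, Vec<f32>, Pcg32, 12, 24>` alias defined above, where 12 and 24 are presumably the per-layer neighbour counts of the hnsw crate and `Pcg32` the RNG used for level assignment:

let mut wtxn = index.write_txn().unwrap();
index.put_vector_hnsw(&mut wtxn, &Hnsw::default()).unwrap();
wtxn.commit().unwrap();

let rtxn = index.read_txn().unwrap();
let reloaded = index.vector_hnsw(&rtxn).unwrap().unwrap_or_default();
assert!(reloaded.is_empty());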

View File

@ -10,6 +10,7 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod distance;
mod error;
mod external_documents_ids;
pub mod facet;

View File

@ -22,6 +22,7 @@ pub mod new;
pub struct Search<'a> {
query: Option<String>,
vector: Option<Vec<f32>>,
// this should be linked to the String in the query
filter: Option<Filter<'a>>,
offset: usize,
@ -39,6 +40,7 @@ impl<'a> Search<'a> {
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
Search {
query: None,
vector: None,
filter: None,
offset: 0,
limit: 20,
@ -57,6 +59,11 @@ impl<'a> Search<'a> {
self
}
pub fn vector(&mut self, vector: impl Into<Vec<f32>>) -> &mut Search<'a> {
self.vector = Some(vector.into());
self
}
pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
self.offset = offset;
self
@ -106,6 +113,7 @@ impl<'a> Search<'a> {
execute_search(
&mut ctx,
&self.query,
&self.vector,
self.terms_matching_strategy,
self.exhaustive_number_hits,
&self.filter,
@ -132,6 +140,7 @@ impl fmt::Debug for Search<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let Search {
query,
vector: _,
filter,
offset,
limit,
@ -145,6 +154,7 @@ impl fmt::Debug for Search<'_> {
} = self;
f.debug_struct("Search")
.field("query", query)
.field("vector", &"[...]")
.field("filter", filter)
.field("offset", offset)
.field("limit", limit)

View File

@ -26,6 +26,7 @@ pub fn apply_distinct_rule(
ctx: &mut SearchContext,
field_id: u16,
candidates: &RoaringBitmap,
// TODO: add a universe here, such that the `excluded` are a subset of the universe?
) -> Result<DistinctOutput> {
let mut excluded = RoaringBitmap::new();
let mut remaining = RoaringBitmap::new();

View File

@ -206,7 +206,7 @@ impl State {
)?;
intersection &= &candidates;
if !intersection.is_empty() {
// Although not really worth it in terms of performance,
// TODO: although not really worth it in terms of performance,
// it would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index

View File

@ -32,7 +32,7 @@ impl<T> Interned<T> {
#[derive(Clone)]
pub struct DedupInterner<T> {
stable_store: Vec<T>,
lookup: FxHashMap<T, Interned<T>>,
lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
}
impl<T> Default for DedupInterner<T> {
fn default() -> Self {

View File

@ -1,4 +1,5 @@
/// Maximum number of tokens we consider in a single search.
// TODO: Loic, find proper value here so we don't overflow the interner.
pub const MAX_TOKEN_COUNT: usize = 1_000;
/// Maximum number of prefixes that can be derived from a single word.

View File

@ -509,6 +509,7 @@ mod tests {
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
&mut ctx,
&Some(query.to_string()),
&None,
crate::TermsMatchingStrategy::default(),
false,
&None,

View File

@ -28,6 +28,7 @@ use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
use hnsw::Searcher;
use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
@ -39,13 +40,16 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
use space::Neighbor;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use crate::search::new::distinct::apply_distinct_rule;
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
use crate::{
AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
};
/// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> {
@ -349,6 +353,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
pub fn execute_search(
ctx: &mut SearchContext,
query: &Option<String>,
vector: &Option<Vec<f32>>,
terms_matching_strategy: TermsMatchingStrategy,
exhaustive_number_hits: bool,
filters: &Option<Filter>,
@ -428,6 +433,33 @@ pub fn execute_search(
let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output;
let docids = match vector {
Some(vector) => {
// return the nearest documents that are also part of the candidates.
let mut searcher = Searcher::new();
let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
let ef = hnsw.len().min(100);
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
let neighbors = hnsw.nearest(vector, ef, &mut searcher, &mut dest[..]);
let mut docids = Vec::new();
for Neighbor { index, distance: _ } in neighbors.iter() {
let index = BEU32::new(*index as u32);
let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get();
if universe.contains(docid) {
docids.push(docid);
if docids.len() == (from + length) {
break;
}
}
}
docids.into_iter().skip(from).take(length).collect()
}
// return the search docids if the vector field is not specified
None => docids,
};
// The candidates is the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set.
if exhaustive_number_hits {
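For reference, a hedged standalone sketch of the hnsw query pattern used in the branch above, with made-up 2-dimensional points; `Hnsw` is the alias from index.rs, `Searcher` comes from the hnsw crate and `Neighbor` from space:

let mut hnsw = Hnsw::default();
let mut searcher = Searcher::new();
for point in [vec![0.0_f32, 1.0], vec![1.0, 0.0], vec![0.7, 0.7]] {
    hnsw.insert(point, &mut searcher);
}
let ef = hnsw.len().min(100);
let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
// `nearest` fills `dest` and returns the slice of neighbors found,
// closest first according to the DotProduct metric
let neighbors = hnsw.nearest(&vec![0.6_f32, 0.8], ef, &mut searcher, &mut dest[..]);
for Neighbor { index, distance: _ } in neighbors.iter() {
    println!("vector id {index}");
}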

View File

@ -92,7 +92,7 @@ impl QueryGraph {
/// which contains ngrams.
pub fn from_query(
ctx: &mut SearchContext,
// The terms here must be consecutive
// NOTE: the terms here must be consecutive
terms: &[LocatedQueryTerm],
) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
let mut new_located_query_terms = terms.to_vec();
@ -103,7 +103,7 @@ impl QueryGraph {
let root_node = 0;
let end_node = 1;
// We could consider generalizing to 4,5,6,7,etc. ngrams
// TODO: we could consider generalizing to 4,5,6,7,etc. ngrams
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
(vec![], vec![], vec![root_node]);

View File

@ -132,6 +132,7 @@ impl QueryTermSubset {
if full_query_term.ngram_words.is_some() {
return None;
}
// TODO: included in subset
if let Some(phrase) = full_query_term.zero_typo.phrase {
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
} else if let Some(word) = full_query_term.zero_typo.exact {
@ -181,6 +182,7 @@ impl QueryTermSubset {
let word = match &self.zero_typo_subset {
NTypoTermSubset::All => Some(use_prefix_db),
NTypoTermSubset::Subset { words, phrases: _ } => {
// TODO: use a subset of prefix words instead
if words.contains(&use_prefix_db) {
Some(use_prefix_db)
} else {
@ -202,6 +204,7 @@ impl QueryTermSubset {
ctx: &mut SearchContext,
) -> Result<BTreeSet<Word>> {
let mut result = BTreeSet::default();
// TODO: a compute_partially function
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
self.original.compute_fully_if_needed(ctx)?;
}
@ -297,6 +300,7 @@ impl QueryTermSubset {
let mut result = BTreeSet::default();
if !self.one_typo_subset.is_empty() {
// TODO: compute less than fully if possible
self.original.compute_fully_if_needed(ctx)?;
}
let original = ctx.term_interner.get_mut(self.original);

View File

@ -139,6 +139,7 @@ pub fn number_of_typos_allowed<'ctx>(
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
// TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
let exact_words = ctx.index.exact_words(ctx.txn)?;
Ok(Box::new(move |word: &str| {
@ -249,6 +250,8 @@ impl PhraseBuilder {
} else {
// token has kind Word
let word = ctx.word_interner.insert(token.lemma().to_string());
// TODO: in a phrase, check that every word exists
// otherwise return an empty term
self.words.push(Some(word));
}
}

View File

@ -1,48 +1,5 @@
/** Implements a "PathVisitor" which finds all paths of a certain cost
from the START to END node of a ranking rule graph.
#![allow(clippy::too_many_arguments)]
A path is a list of conditions. A condition is the data associated with
an edge, given by the ranking rule. Some edges don't have a condition associated
with them, they are "unconditional". These kinds of edges are used to "skip" a node.
The algorithm uses a depth-first search. It benefits from two main optimisations:
- The list of all possible costs to go from any node to the END node is precomputed
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
untraversable depending on what other edges were selected.
These two optimisations are meant to avoid traversing edges that wouldn't lead
to a valid path. In practically all cases, we avoid the exponential complexity
that is inherent to depth-first search in a large ranking rule graph.
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
conditions to a list of traversed conditions.
For example, the DeadEndsCache could say the following:
- Immediately, from the start, the conditions `[a,b]` are forbidden
- if we take the condition `c`, then the conditions `[e]` are also forbidden
- and if after that, we take `f`, then `[h,i]` are also forbidden
- etc.
- if we take `g`, then `[f]` is also forbidden
- etc.
- etc.
As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
When a path is found from START to END, we give it to the `visit` closure.
This closure takes a mutable reference to the `DeadEndsCache`. This means that
the caller can update this cache. Therefore, we must handle the case where the
DeadEndsCache has been updated. This means potentially backtracking up to the point
where the traversed conditions are all allowed by the new DeadEndsCache.
The algorithm also implements the `TermsMatchingStrategy` logic.
Some edges are augmented with a list of "nodes_to_skip". Skipping
a node means "reaching this node through an unconditional edge". If we have
already traversed (ie. not skipped) a node that is in this list, then we know that we
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
future node that was present in the "nodes_to_skip" list.
The caller can decide to stop the path finding algorithm
by returning a `ControlFlow::Break` from the `visit` closure.
*/
use std::collections::{BTreeSet, VecDeque};
use std::iter::FromIterator;
use std::ops::ControlFlow;
@ -55,41 +12,30 @@ use crate::search::new::query_graph::QueryNode;
use crate::search::new::small_bitmap::SmallBitmap;
use crate::Result;
/// Closure which processes a path found by the `PathVisitor`
type VisitFn<'f, G> = &'f mut dyn FnMut(
// the path as a list of conditions
&[Interned<<G as RankingRuleGraphTrait>::Condition>],
&mut RankingRuleGraph<G>,
// a mutable reference to the DeadEndsCache, to update it in case the given
// path doesn't resolve to any valid document ids
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
) -> Result<ControlFlow<()>>;
/// A structure which is kept but not updated during the traversal of the graph.
/// It can however be updated by the `visit` closure once a valid path has been found.
struct VisitorContext<'a, G: RankingRuleGraphTrait> {
graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
}
/// The internal state of the traversal algorithm
struct VisitorState<G: RankingRuleGraphTrait> {
/// Budget from the current node to the end node
remaining_cost: u64,
/// Previously visited conditions, in order.
path: Vec<Interned<G::Condition>>,
/// Previously visited conditions, as an efficient and compact set.
visited_conditions: SmallBitmap<G::Condition>,
/// Previously visited (ie not skipped) nodes, as an efficient and compact set.
visited_nodes: SmallBitmap<QueryNode>,
/// The conditions that cannot be visited anymore
forbidden_conditions: SmallBitmap<G::Condition>,
/// The nodes that cannot be visited anymore (they must be skipped)
nodes_to_skip: SmallBitmap<QueryNode>,
forbidden_conditions_to_nodes: SmallBitmap<QueryNode>,
}
/// See module documentation
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
state: VisitorState<G>,
ctx: VisitorContext<'a, G>,
@ -110,13 +56,14 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
forbidden_conditions: SmallBitmap::for_interned_values_in(
&graph.conditions_interner,
),
nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in(
&graph.query_graph.nodes,
),
},
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
}
}
/// See module documentation
pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> {
let _ =
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
@ -125,31 +72,22 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
}
impl<G: RankingRuleGraphTrait> VisitorState<G> {
/// Visits a node: traverse all its valid conditional and unconditional edges.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_node(
&mut self,
from_node: Interned<QueryNode>,
visit: VisitFn<G>,
ctx: &mut VisitorContext<G>,
) -> Result<ControlFlow<(), bool>> {
// any valid path will be found from this point
// if a valid path was found, then we know that the DeadEndsCache may have been updated,
// and we will need to do more work to potentially backtrack
let mut any_valid = false;
let edges = ctx.graph.edges_of_node.get(from_node).clone();
for edge_idx in edges.iter() {
// could be none if the edge was deleted
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
if self.remaining_cost < edge.cost as u64 {
continue;
}
self.remaining_cost -= edge.cost as u64;
let cf = match edge.condition {
Some(condition) => self.visit_condition(
condition,
@ -181,10 +119,6 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
Ok(ControlFlow::Continue(any_valid))
}
/// Visits an unconditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_no_condition(
&mut self,
dest_node: Interned<QueryNode>,
@ -200,29 +134,20 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
{
return Ok(ControlFlow::Continue(false));
}
// We've reached the END node!
if dest_node == ctx.graph.query_graph.end_node {
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
// We could change the return type of the visit closure such that the caller
// tells us whether the dead ends cache was updated or not.
// Alternatively, maybe the DeadEndsCache should have a generation number
// to it, so that we don't need to play with these booleans at all.
match control_flow {
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
}
} else {
let old_fbct = self.nodes_to_skip.clone();
self.nodes_to_skip.union(edge_new_nodes_to_skip);
let old_fbct = self.forbidden_conditions_to_nodes.clone();
self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?;
self.nodes_to_skip = old_fbct;
self.forbidden_conditions_to_nodes = old_fbct;
Ok(cf)
}
}
/// Visits a conditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_condition(
&mut self,
condition: Interned<G::Condition>,
@ -234,7 +159,7 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
assert!(dest_node != ctx.graph.query_graph.end_node);
if self.forbidden_conditions.contains(condition)
|| self.nodes_to_skip.contains(dest_node)
|| self.forbidden_conditions_to_nodes.contains(dest_node)
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes)
{
return Ok(ControlFlow::Continue(false));
@ -255,19 +180,19 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
self.visited_nodes.insert(dest_node);
self.visited_conditions.insert(condition);
let old_forb_cond = self.forbidden_conditions.clone();
let old_fc = self.forbidden_conditions.clone();
if let Some(next_forbidden) =
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
{
self.forbidden_conditions.union(&next_forbidden);
}
let old_nodes_to_skip = self.nodes_to_skip.clone();
self.nodes_to_skip.union(edge_new_nodes_to_skip);
let old_fctn = self.forbidden_conditions_to_nodes.clone();
self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?;
self.nodes_to_skip = old_nodes_to_skip;
self.forbidden_conditions = old_forb_cond;
self.forbidden_conditions_to_nodes = old_fctn;
self.forbidden_conditions = old_fc;
self.visited_conditions.remove(condition);
self.visited_nodes.remove(dest_node);

View File

@ -9,8 +9,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
// TODO: give a generation to each universe, then be able to get the exact
// delta of docids between two universes of different generations!
/// A cache storing the document ids associated with each ranking rule edge
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
// TODO: should be a mapped interner?
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
_phantom: PhantomData<G>,
}
@ -50,7 +54,7 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
}
let condition = graph.conditions_interner.get_mut(interned_condition);
let computed = G::resolve_condition(ctx, condition, universe)?;
// Can we put an assert here for computed.universe_len == universe.len() ?
// TODO: if computed.universe_len != universe.len() ?
let _ = self.cache.insert(interned_condition, computed);
let computed = &self.cache[&interned_condition];
Ok(computed)

View File

@ -2,7 +2,6 @@ use crate::search::new::interner::{FixedSizeInterner, Interned};
use crate::search::new::small_bitmap::SmallBitmap;
pub struct DeadEndsCache<T> {
// conditions and next could/should be part of the same vector
conditions: Vec<Interned<T>>,
next: Vec<Self>,
pub forbidden: SmallBitmap<T>,
@ -28,7 +27,7 @@ impl<T> DeadEndsCache<T> {
self.forbidden.insert(condition);
}
fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
pub fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
Some(&mut self.next[idx])
} else {

View File

@ -69,9 +69,14 @@ impl RankingRuleGraphTrait for FidGraph {
let mut edges = vec![];
for fid in all_fields {
// TODO: We can improve performances and relevancy by storing
// the term subsets associated to each field ids fetched.
edges.push((
fid as u32 * term.term_ids.len() as u32,
conditions_interner.insert(FidCondition { term: term.clone(), fid }),
fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
conditions_interner.insert(FidCondition {
term: term.clone(), // TODO remove this ugly clone
fid,
}),
));
}

View File

@ -94,9 +94,14 @@ impl RankingRuleGraphTrait for PositionGraph {
let mut edges = vec![];
for (cost, positions) in positions_for_costs {
// TODO: We can improve performances and relevancy by storing
// the term subsets associated to each position fetched
edges.push((
cost,
conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
conditions_interner.insert(PositionCondition {
term: term.clone(), // TODO remove this ugly clone
positions,
}),
));
}

View File

@ -65,6 +65,13 @@ pub fn compute_docids(
}
}
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// cartesian product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
// Before computing the edges, check that the left word and left phrase
// aren't disjoint with the universe, but only do it if there is more than
@ -104,6 +111,8 @@ pub fn compute_docids(
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
// TODO: think about whether we want to reduce the subset,
// we probably should!
start_term_subset: Some(left_term.clone()),
end_term_subset: right_term.clone(),
})
@ -194,7 +203,12 @@ fn compute_non_prefix_edges(
*docids |= new_docids;
}
}
if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
if backward_proximity >= 1
// TODO: for now, we don't do any swapping when either term is a phrase
// but maybe we should. We'd need to look at the first/last word of the phrase
// depending on the context.
&& left_phrase.is_none() && right_phrase.is_none()
{
if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
{

View File

@ -33,6 +33,8 @@ pub fn compute_query_term_subset_docids(
ctx: &mut SearchContext,
term: &QueryTermSubset,
) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_docids) = ctx.word_docids(word)? {
@ -57,6 +59,8 @@ pub fn compute_query_term_subset_docids_within_field_id(
term: &QueryTermSubset,
fid: u16,
) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
@ -67,6 +71,7 @@ pub fn compute_query_term_subset_docids_within_field_id(
for phrase in term.all_phrases(ctx)? {
// There may be false positives when resolving a phrase, so we're not
// guaranteed that all of its words are within a single fid.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
@ -90,6 +95,7 @@ pub fn compute_query_term_subset_docids_within_position(
term: &QueryTermSubset,
position: u16,
) -> Result<RoaringBitmap> {
// TODO Use the roaring::MultiOps trait
let mut docids = RoaringBitmap::new();
for word in term.all_single_words_except_prefix_db(ctx)? {
if let Some(word_position_docids) =
@ -102,6 +108,7 @@ pub fn compute_query_term_subset_docids_within_position(
for phrase in term.all_phrases(ctx)? {
// It's difficult to know the expected position of the words in the phrase,
// so instead we just check the first one.
// TODO: fix this?
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
@ -125,6 +132,9 @@ pub fn compute_query_graph_docids(
q: &QueryGraph,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
// TODO: there must be a faster way to compute this big
// roaring bitmap expression
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());
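A hedged sketch of the `roaring::MultiOps` pattern the TODOs above point to, with made-up bitmaps; the `union` method name is an assumption about that trait's API rather than something taken from this diff:

use roaring::{MultiOps, RoaringBitmap};

let word_a: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
let word_b: RoaringBitmap = [3u32, 4].into_iter().collect();
// one multi-way union instead of `|=` in a loop
let docids = [word_a, word_b].union();
assert_eq!(docids.len(), 4);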

View File

@ -141,6 +141,10 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Query>>> {
let iter = self.iter.as_mut().unwrap();
// TODO: we should make use of the universe in the function below
// good for correctness, but ideally iter.next_bucket would take the current universe into account,
// as right now it could return buckets that don't intersect with the universe, meaning we will make many
// unneeded calls.
if let Some(mut bucket) = iter.next_bucket()? {
bucket.candidates &= universe;
Ok(Some(bucket))

View File

@ -527,7 +527,7 @@ fn test_distinct_all_candidates() {
let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap();
let candidates = candidates.iter().collect::<Vec<_>>();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
// This is incorrect, but unfortunately impossible to do better efficiently.
// TODO: this is incorrect!
insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
}

View File

@ -122,11 +122,11 @@ fn create_edge_cases_index() -> TempIndex {
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
"
},
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
// If the query is `sunflower wilting`, then we should make sure that
// the proximity condition `flower wilting: sprx N` also comes with the condition
// `sun wilting: sprx N+1`, but this is not the exact condition we use for now.
// the sprximity condition `flower wilting: sprx N` also comes with the condition
// `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now.
// We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which
// is better than nothing but not the best.
{
@ -139,7 +139,7 @@ fn create_edge_cases_index() -> TempIndex {
},
{
"id": 3,
// This document matches the query `sunflower wilting`, but the sprximity condition
// This document matches the query `sunflower wilting`, but the sprximity condition
// between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
// which would reduce to only `flower` and `wilting` being in sprximity.
"text": "A flower wilting under the sun, unlike a sunflower"
@ -299,7 +299,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// "2" and "4" should be swapped ideally
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@ -316,7 +316,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// "2" and "4" should be swapped ideally
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@ -341,7 +341,7 @@ fn test_proximity_split_word() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// "2" and "4" should be swapped ideally
// TODO: "2" and "4" should be swapped ideally
insta::assert_debug_snapshot!(texts, @r###"
[
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",

View File

@ -2,8 +2,9 @@
This module tests the interactions between the proximity and typo ranking rules.
The proximity ranking rule should transform the query graph such that it
only contains the word pairs that it used to compute its bucket, but this is not currently
implemented.
only contains the word pairs that it used to compute its bucket.
TODO: This is not currently implemented.
*/
use crate::index::tests::TempIndex;
@ -63,7 +64,7 @@ fn test_trap_basic() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// This is incorrect, 1 should come before 0
// TODO: this is incorrect, 1 should come before 0
insta::assert_debug_snapshot!(texts, @r###"
[
"\"summer. holiday. sommer holidty\"",

View File

@ -571,8 +571,8 @@ fn test_typo_synonyms() {
s.terms_matching_strategy(TermsMatchingStrategy::All);
s.query("the fast brownish fox jumps over the lackadaisical dog");
// The interaction of ngrams + synonyms means that the multi-word synonyms end up having a typo cost.
// This is probably not what we want.
// TODO: is this correct? interaction of ngrams + synonyms means that the
// multi-word synonyms end up having a typo cost. This is probably not what we want.
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);

View File

@ -39,6 +39,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_is_empty_docids,
field_id_docid_facet_f64s,
field_id_docid_facet_strings,
vector_id_docid,
documents,
} = self.index;
@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
self.index.delete_vector_hnsw(self.wtxn)?;
// We clean all the faceted documents ids.
for field_id in faceted_fields {
@ -95,6 +97,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
facet_id_string_docids.clear(self.wtxn)?;
field_id_docid_facet_f64s.clear(self.wtxn)?;
field_id_docid_facet_strings.clear(self.wtxn)?;
vector_id_docid.clear(self.wtxn)?;
documents.clear(self.wtxn)?;
Ok(number_of_documents)

View File

@ -4,8 +4,10 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use fst::IntoStreamer;
use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
use heed::{BytesDecode, BytesEncode, Database, RwIter};
use hnsw::Searcher;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use space::KnnPoints;
use time::OffsetDateTime;
use super::facet::delete::FacetsDelete;
@ -14,6 +16,7 @@ use crate::error::InternalError;
use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::Hnsw;
use crate::{
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
};
@ -71,6 +74,7 @@ impl std::fmt::Display for DeletionStrategy {
pub(crate) struct DetailedDocumentDeletionResult {
pub deleted_documents: u64,
pub remaining_documents: u64,
pub soft_deletion_used: bool,
}
impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
@ -107,8 +111,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
Some(docid)
}
pub fn execute(self) -> Result<DocumentDeletionResult> {
let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
self.execute_inner()?;
let DetailedDocumentDeletionResult {
deleted_documents,
remaining_documents,
soft_deletion_used: _,
} = self.execute_inner()?;
Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
}
@ -129,6 +136,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult {
deleted_documents: 0,
remaining_documents: 0,
soft_deletion_used: false,
});
}
@ -144,6 +152,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult {
deleted_documents: current_documents_ids_len,
remaining_documents,
soft_deletion_used: false,
});
}
@ -212,6 +221,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
return Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
soft_deletion_used: true,
});
}
@ -240,6 +250,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
facet_id_exists_docids,
facet_id_is_null_docids,
facet_id_is_empty_docids,
vector_id_docid,
documents,
} = self.index;
// Remove from the documents database
@ -429,11 +440,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&self.to_delete_docids,
)?;
// An ugly and slow way to remove the vectors from the HNSW
// It basically reconstructs the HNSW from scratch without editing the current one.
let current_hnsw = self.index.vector_hnsw(self.wtxn)?.unwrap_or_default();
if !current_hnsw.is_empty() {
let mut new_hnsw = Hnsw::default();
let mut searcher = Searcher::new();
let mut new_vector_id_docids = Vec::new();
for result in vector_id_docid.iter(self.wtxn)? {
let (vector_id, docid) = result?;
if !self.to_delete_docids.contains(docid.get()) {
let vector = current_hnsw.get_point(vector_id.get() as usize).clone();
let vector_id = new_hnsw.insert(vector, &mut searcher);
new_vector_id_docids.push((vector_id as u32, docid));
}
}
vector_id_docid.clear(self.wtxn)?;
for (vector_id, docid) in new_vector_id_docids {
vector_id_docid.put(self.wtxn, &BEU32::new(vector_id), &docid)?;
}
self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
}
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
soft_deletion_used: false,
})
}

View File

@ -0,0 +1,40 @@
use std::fs::File;
use std::io;
use bytemuck::cast_slice;
use serde_json::from_slice;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::{FieldId, InternalError, Result};
/// Extracts the embedding vector contained in each document under the `_vector` field.
///
/// Returns the generated grenad reader containing the docid as key associated with the Vec<f32>
#[logging_timer::time]
pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
vector_fid: FieldId,
) -> Result<grenad::Reader<File>> {
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut cursor = obkv_documents.into_cursor()?;
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
let obkv = obkv::KvReader::new(value);
// first we get the _vector field
if let Some(vector) = obkv.get(vector_fid) {
// try to extract the vector
let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
let bytes = cast_slice(&vector);
writer.insert(docid_bytes, bytes)?;
}
// else => the _vector object was `null`, there is nothing to do
}
writer_into_reader(writer)
}
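A hedged sketch of the byte-level round trip between this extractor and the read side in typed_chunk.rs, with a made-up embedding: the vector is written as raw f32 bytes via `cast_slice` and collected back into a `Vec<f32>` with `pod_collect_to_vec`:

use bytemuck::allocation::pod_collect_to_vec;
use bytemuck::cast_slice;

let vector = vec![0.1_f32, 0.2, 0.3];
let bytes: &[u8] = cast_slice(&vector); // what this extractor writes into grenad
let read_back: Vec<f32> = pod_collect_to_vec(bytes); // what typed_chunk.rs reads back
assert_eq!(vector, read_back);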

View File

@ -4,6 +4,7 @@ mod extract_facet_string_docids;
mod extract_fid_docid_facet_values;
mod extract_fid_word_count_docids;
mod extract_geo_points;
mod extract_vector_points;
mod extract_word_docids;
mod extract_word_fid_docids;
mod extract_word_pair_proximity_docids;
@ -22,6 +23,7 @@ use self::extract_facet_string_docids::extract_facet_string_docids;
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::extract_vector_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_fid_docids::extract_word_fid_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
@ -45,6 +47,7 @@ pub(crate) fn data_from_obkv_documents(
faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>,
@ -69,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
&faceted_fields,
primary_key_id,
geo_fields_ids,
vector_field_id,
&stop_words,
max_positions_per_attributes,
)
@ -279,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
vector_field_id: Option<FieldId>,
stop_words: &Option<fst::Set<&[u8]>>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
@ -307,6 +312,20 @@ fn send_and_extract_flattened_documents_data(
});
}
if let Some(vector_field_id) = vector_field_id {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
rayon::spawn(move || {
let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
let _ = match result {
Ok(vector_points) => {
lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
}
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
};
});
}
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
rayon::join(
|| {

View File

@ -2,7 +2,7 @@ use std::sync::Arc;
use memmap2::Mmap;
/// Wrapper around Mmap allowing to virtually clone grenad-chunks
/// Wrapper around Mmap allowing to virtualy clone grenad-chunks
/// in a parallel process like the indexing.
#[derive(Debug, Clone)]
pub struct ClonableMmap {

View File

@ -236,7 +236,7 @@ where
primary_key,
fields_ids_map,
field_distribution,
new_external_documents_ids,
mut external_documents_ids,
new_documents_ids,
replaced_documents_ids,
documents_count,
@ -304,6 +304,8 @@ where
}
None => None,
};
// get the fid of the `_vector` field.
let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
let stop_words = self.index.stop_words(self.wtxn)?;
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@ -340,6 +342,7 @@ where
faceted_fields,
primary_key_id,
geo_fields_ids,
vector_field_id,
stop_words,
max_positions_per_attributes,
exact_attributes,
@ -363,6 +366,9 @@ where
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
if !deleted_documents_result.soft_deletion_used {
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
}
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?;
@ -442,9 +448,6 @@ where
self.index.put_primary_key(self.wtxn, &primary_key)?;
// We write the external documents ids into the main database.
let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
external_documents_ids.insert_ids(&new_external_documents_ids)?;
let external_documents_ids = external_documents_ids.into_static();
self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
let all_documents_ids = index_documents_ids | new_documents_ids;
@ -2514,170 +2517,4 @@ mod tests {
db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
}
#[test]
fn reproduce_the_bug() {
/*
[milli/examples/fuzz.rs:69] &batches = [
Batch(
[
AddDoc(
{ "id": 1, "doggo": "bernese" }, => internal 0
),
],
),
Batch(
[
DeleteDoc(
1, => delete internal 0
),
AddDoc(
{ "id": 0, "catto": "jorts" }, => internal 1
),
],
),
Batch(
[
AddDoc(
{ "id": 1, "catto": "jorts" }, => internal 2
),
],
),
]
*/
let mut index = TempIndex::new();
index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
// START OF BATCH
println!("--- ENTERING BATCH 1");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
// OP
let documents = documents!([
{ "id": 1, "doggo": "bernese" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
// FINISHING
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"bernese"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
1 0
"###);
// A first batch of documents has been inserted
// BATCH 2
println!("--- ENTERING BATCH 2");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");
let documents = documents!([
{ "id": 0, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":0,"catto":"jorts"}
"###);
db_snap!(index, external_documents_ids, @r###"
soft:
hard:
0 1
"###);
db_snap!(index, soft_deleted_documents_ids, @"[]");
// BATCH 3
println!("--- ENTERING BATCH 3");
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"catto":"jorts"}
{"id":0,"catto":"jorts"}
"###);
// Ensuring all the returned IDs actually exists
let rtxn = index.read_txn().unwrap();
let res = index.search(&rtxn).execute().unwrap();
index.documents(&rtxn, res.documents_ids).unwrap();
}
}

View File

@ -21,14 +21,15 @@ use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key};
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
Result, BEU32,
};
pub struct TransformOutput {
pub primary_key: String,
pub fields_ids_map: FieldsIdsMap,
pub field_distribution: FieldDistribution,
pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
pub external_documents_ids: ExternalDocumentsIds<'static>,
pub new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap,
pub documents_count: usize,
@ -567,6 +568,8 @@ impl<'a, 'i> Transform<'a, 'i> {
}))?
.to_string();
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
// We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer(
self.indexer_settings.chunk_compression_type,
@ -648,12 +651,13 @@ impl<'a, 'i> Transform<'a, 'i> {
fst_new_external_documents_ids_builder.insert(key, value)
})?;
let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
external_documents_ids.insert_ids(&new_external_documents_ids)?;
Ok(TransformOutput {
primary_key,
fields_ids_map: self.fields_ids_map,
field_distribution,
new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
external_documents_ids: external_documents_ids.into_static(),
new_documents_ids: self.new_documents_ids,
replaced_documents_ids: self.replaced_documents_ids,
documents_count: self.documents_count,
@ -687,8 +691,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let new_external_documents_ids = {
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
// This call should be free and can't fail since the previous method merged both fsts.
external_documents_ids.into_static().to_fst()?.into_owned()
external_documents_ids
};
let documents_ids = self.index.documents_ids(wtxn)?;
@ -773,7 +776,7 @@ impl<'a, 'i> Transform<'a, 'i> {
primary_key,
fields_ids_map: new_fields_ids_map,
field_distribution,
new_external_documents_ids,
external_documents_ids: new_external_documents_ids.into_static(),
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,

View File

@ -4,20 +4,24 @@ use std::convert::TryInto;
use std::fs::File;
use std::io;
use bytemuck::allocation::pod_collect_to_vec;
use charabia::{Language, Script};
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::RwTxn;
use hnsw::Searcher;
use roaring::RoaringBitmap;
use space::KnnPoints;
use super::helpers::{
self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::error::UserError;
use crate::facet::FacetType;
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
pub(crate) enum TypedChunk {
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@ -38,6 +42,7 @@ pub(crate) enum TypedChunk {
FieldIdFacetIsNullDocids(grenad::Reader<File>),
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
GeoPoints(grenad::Reader<File>),
VectorPoints(grenad::Reader<File>),
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
}
@ -221,6 +226,38 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_geo_rtree(wtxn, &rtree)?;
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
}
TypedChunk::VectorPoints(vector_points) => {
let mut hnsw = index.vector_hnsw(wtxn)?.unwrap_or_default();
let mut searcher = Searcher::new();
let mut expected_dimensions = match index.vector_id_docid.iter(wtxn)?.next() {
Some(result) => {
let (vector_id, _) = result?;
Some(hnsw.get_point(vector_id.get() as usize).len())
}
None => None,
};
let mut cursor = vector_points.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// convert the key back to a u32 (4 bytes)
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
// convert the vector back to a Vec<f32>
let vector: Vec<f32> = pod_collect_to_vec(value);
// TODO Move this error in the vector extractor
let found = vector.len();
let expected = *expected_dimensions.get_or_insert(found);
if expected != found {
return Err(UserError::InvalidVectorDimensions { expected, found })?;
}
let vector_id = hnsw.insert(vector, &mut searcher) as u32;
index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
}
log::debug!("There are {} entries in the HNSW so far", hnsw.len());
index.put_vector_hnsw(wtxn, &hnsw)?;
}
TypedChunk::ScriptLanguageDocids(hash_pair) => {
let mut buffer = Vec::new();
for (key, value) in hash_pair {