mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-30 01:35:36 +00:00
Compare commits
61 Commits
v1.13.0-rc
...
use-scoped
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
50268b930c | ||
|
|
93ba4b924a | ||
|
|
b7d5576347 | ||
|
|
f67b246108 | ||
|
|
a1f60c61e8 | ||
|
|
de2fedaa9d | ||
|
|
89717ba0f1 | ||
|
|
8d93de28b8 | ||
|
|
5e7803632d | ||
|
|
885710a07b | ||
|
|
c55fdad2c3 | ||
|
|
1caad4c4b0 | ||
|
|
8419ed52a1 | ||
|
|
a65c52cc97 | ||
|
|
49e9655c24 | ||
|
|
fa763ca5dc | ||
|
|
c7aeb554b2 | ||
|
|
88d9d47928 | ||
|
|
8e0d8d31f9 | ||
|
|
81a38099ec | ||
|
|
bd27fe7d02 | ||
|
|
41203f0931 | ||
|
|
803a699b15 | ||
|
|
246ad3b06e | ||
|
|
70305b9f71 | ||
|
|
5dab435d13 | ||
|
|
c83c1a3c51 | ||
|
|
b83275c9c5 | ||
|
|
d7f35ee3ba | ||
|
|
1dce341bfb | ||
|
|
4876c1c8eb | ||
|
|
43c8d54501 | ||
|
|
84e2a1f836 | ||
|
|
00eb47d42e | ||
|
|
9293e7f2c1 | ||
|
|
80198aa855 | ||
|
|
fa00b42c93 | ||
|
|
6c9409edf8 | ||
|
|
acb06cb3e6 | ||
|
|
7d0d8f4445 | ||
|
|
491d115c3c | ||
|
|
55fa2dda00 | ||
|
|
c71eea8023 | ||
|
|
df40533741 | ||
|
|
0c3e7fe963 | ||
|
|
45f843ccb9 | ||
|
|
35b6bca598 | ||
|
|
7f82d33597 | ||
|
|
8c5856007c | ||
|
|
ae1d7f4d9b | ||
|
|
792be63567 | ||
|
|
70aac71c63 | ||
|
|
a562d6abc1 | ||
|
|
b7fdd9516c | ||
|
|
5f2a1a4fd1 | ||
|
|
2b0e17ede0 | ||
|
|
37092adc71 | ||
|
|
86fcad788e | ||
|
|
2ea5c57871 | ||
|
|
b63c64395d | ||
|
|
628119e31e |
12
Cargo.lock
generated
12
Cargo.lock
generated
@@ -519,6 +519,7 @@ dependencies = [
|
||||
"rand_chacha",
|
||||
"reqwest",
|
||||
"roaring",
|
||||
"scoped_thread_pool",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
]
|
||||
@@ -2062,6 +2063,7 @@ dependencies = [
|
||||
"either",
|
||||
"fastrand",
|
||||
"milli",
|
||||
"scoped_thread_pool",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
@@ -2768,6 +2770,7 @@ dependencies = [
|
||||
"page_size",
|
||||
"rayon",
|
||||
"roaring",
|
||||
"scoped_thread_pool",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"synchronoise",
|
||||
@@ -3636,6 +3639,7 @@ dependencies = [
|
||||
"rustls",
|
||||
"rustls-pemfile",
|
||||
"rustls-pki-types",
|
||||
"scoped_thread_pool",
|
||||
"segment",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -3814,6 +3818,7 @@ dependencies = [
|
||||
"roaring",
|
||||
"rstar",
|
||||
"rustc-hash 2.1.0",
|
||||
"scoped_thread_pool",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"slice-group-by",
|
||||
@@ -5088,6 +5093,13 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scoped_thread_pool"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
|
||||
@@ -17,6 +17,7 @@ csv = "1.3.1"
|
||||
memmap2 = "0.9.5"
|
||||
milli = { path = "../milli" }
|
||||
mimalloc = { version = "0.1.43", default-features = false }
|
||||
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
|
||||
serde_json = { version = "1.0.135", features = ["preserve_order"] }
|
||||
tempfile = "3.15.0"
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@
|
||||
|
||||
use std::fs::{create_dir_all, remove_dir_all, File};
|
||||
use std::io::{self, BufReader, BufWriter, Read};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr as _;
|
||||
|
||||
@@ -9,9 +10,11 @@ use anyhow::Context;
|
||||
use bumpalo::Bump;
|
||||
use criterion::BenchmarkId;
|
||||
use memmap2::Mmap;
|
||||
use milli::heed::EnvOpenOptions;
|
||||
use milli::documents::PrimaryKey;
|
||||
use milli::heed::{EnvOpenOptions, RwTxn};
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
|
||||
@@ -96,28 +99,59 @@ pub fn base_setup(conf: &Conf) -> Index {
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
let new_fields_ids_map = db_fields_ids_map.clone();
|
||||
|
||||
let documents = documents_from(conf.dataset, conf.dataset_format);
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
indexer.add_documents(&documents).unwrap();
|
||||
|
||||
index_documents(
|
||||
indexer,
|
||||
&index,
|
||||
&rtxn,
|
||||
new_fields_ids_map,
|
||||
&mut wtxn,
|
||||
config,
|
||||
db_fields_ids_map,
|
||||
);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
drop(rtxn);
|
||||
|
||||
index
|
||||
}
|
||||
|
||||
pub fn index_documents(
|
||||
indexer: indexer::DocumentOperation,
|
||||
index: &Index,
|
||||
rtxn: &milli::heed::RoTxn,
|
||||
mut new_fields_ids_map: milli::FieldsIdsMap,
|
||||
wtxn: &mut RwTxn,
|
||||
config: IndexerConfig,
|
||||
db_fields_ids_map: milli::FieldsIdsMap,
|
||||
) {
|
||||
let indexer_alloc = Bump::new();
|
||||
let thread_count =
|
||||
std::thread::available_parallelism().unwrap_or(NonZeroUsize::new(1).unwrap());
|
||||
let thread_pool = scoped_thread_pool::ThreadPool::new(thread_count, "index".into());
|
||||
let (document_changes, _operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
&indexer_alloc,
|
||||
&index,
|
||||
&rtxn,
|
||||
index,
|
||||
rtxn,
|
||||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
wtxn,
|
||||
index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -129,11 +163,38 @@ pub fn base_setup(conf: &Conf) -> Index {
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
drop(rtxn);
|
||||
|
||||
index
|
||||
pub fn index_delete_documents(
|
||||
indexer: indexer::DocumentDeletion,
|
||||
primary_key: PrimaryKey,
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
config: &IndexerConfig,
|
||||
db_fields_ids_map: milli::FieldsIdsMap,
|
||||
new_fields_ids_map: milli::FieldsIdsMap,
|
||||
) {
|
||||
let indexer_alloc = Bump::new();
|
||||
let thread_count =
|
||||
std::thread::available_parallelism().unwrap_or(NonZeroUsize::new(1).unwrap());
|
||||
let thread_pool = scoped_thread_pool::ThreadPool::new(thread_count, "index".into());
|
||||
let document_changes =
|
||||
indexer.into_changes(&indexer_alloc, primary_key, &thread_pool, CHUNK_SIZE);
|
||||
indexer::index(
|
||||
wtxn,
|
||||
index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
Some(primary_key),
|
||||
&document_changes,
|
||||
EmbeddingConfigs::default(),
|
||||
&|| false,
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
|
||||
|
||||
@@ -10,8 +10,10 @@ dump
|
||||
├── instance-uid.uuid
|
||||
├── keys.jsonl
|
||||
├── metadata.json
|
||||
└── tasks
|
||||
├── update_files
|
||||
│ └── [task_id].jsonl
|
||||
├── tasks
|
||||
│ ├── update_files
|
||||
│ │ └── [task_id].jsonl
|
||||
│ └── queue.jsonl
|
||||
└── batches
|
||||
└── queue.jsonl
|
||||
```
|
||||
```
|
||||
|
||||
@@ -228,6 +228,7 @@ pub(crate) mod test {
|
||||
|
||||
use big_s::S;
|
||||
use maplit::{btreemap, btreeset};
|
||||
use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
|
||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
||||
use meilisearch_types::features::{Network, Remote, RuntimeTogglableFeatures};
|
||||
use meilisearch_types::index_uid_pattern::IndexUidPattern;
|
||||
@@ -235,7 +236,8 @@ pub(crate) mod test {
|
||||
use meilisearch_types::milli;
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::settings::{Checked, FacetingSettings, Settings};
|
||||
use meilisearch_types::tasks::{Details, Status};
|
||||
use meilisearch_types::task_view::DetailsView;
|
||||
use meilisearch_types::tasks::{Details, Kind, Status};
|
||||
use serde_json::{json, Map, Value};
|
||||
use time::macros::datetime;
|
||||
use uuid::Uuid;
|
||||
@@ -305,6 +307,30 @@ pub(crate) mod test {
|
||||
settings.check()
|
||||
}
|
||||
|
||||
pub fn create_test_batches() -> Vec<Batch> {
|
||||
vec![Batch {
|
||||
uid: 0,
|
||||
details: DetailsView {
|
||||
received_documents: Some(12),
|
||||
indexed_documents: Some(Some(10)),
|
||||
..DetailsView::default()
|
||||
},
|
||||
progress: None,
|
||||
stats: BatchStats {
|
||||
total_nb_tasks: 1,
|
||||
status: maplit::btreemap! { Status::Succeeded => 1 },
|
||||
types: maplit::btreemap! { Kind::DocumentAdditionOrUpdate => 1 },
|
||||
index_uids: maplit::btreemap! { "doggo".to_string() => 1 },
|
||||
},
|
||||
enqueued_at: Some(BatchEnqueuedAt {
|
||||
earliest: datetime!(2022-11-11 0:00 UTC),
|
||||
oldest: datetime!(2022-11-11 0:00 UTC),
|
||||
}),
|
||||
started_at: datetime!(2022-11-20 0:00 UTC),
|
||||
finished_at: Some(datetime!(2022-11-21 0:00 UTC)),
|
||||
}]
|
||||
}
|
||||
|
||||
pub fn create_test_tasks() -> Vec<(TaskDump, Option<Vec<Document>>)> {
|
||||
vec![
|
||||
(
|
||||
@@ -427,6 +453,15 @@ pub(crate) mod test {
|
||||
index.flush().unwrap();
|
||||
index.settings(&settings).unwrap();
|
||||
|
||||
// ========== pushing the batch queue
|
||||
let batches = create_test_batches();
|
||||
|
||||
let mut batch_queue = dump.create_batches_queue().unwrap();
|
||||
for batch in &batches {
|
||||
batch_queue.push_batch(batch).unwrap();
|
||||
}
|
||||
batch_queue.flush().unwrap();
|
||||
|
||||
// ========== pushing the task queue
|
||||
let tasks = create_test_tasks();
|
||||
|
||||
|
||||
@@ -102,6 +102,13 @@ impl DumpReader {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn batches(&mut self) -> Result<Box<dyn Iterator<Item = Result<v6::Batch>> + '_>> {
|
||||
match self {
|
||||
DumpReader::Current(current) => Ok(current.batches()),
|
||||
DumpReader::Compat(_compat) => Ok(Box::new(std::iter::empty())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keys(&mut self) -> Result<Box<dyn Iterator<Item = Result<v6::Key>> + '_>> {
|
||||
match self {
|
||||
DumpReader::Current(current) => Ok(current.keys()),
|
||||
@@ -227,6 +234,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2024-05-16 15:51:34.151044 +00:00:00");
|
||||
insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None");
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -348,6 +359,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2023-07-06 7:10:27.21958 +00:00:00");
|
||||
insta::assert_debug_snapshot!(dump.instance_uid().unwrap(), @"None");
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -412,6 +427,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-04 15:55:10.344982459 +00:00:00");
|
||||
insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d");
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -492,6 +511,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-06 12:53:49.131989609 +00:00:00");
|
||||
insta::assert_snapshot!(dump.instance_uid().unwrap().unwrap(), @"9e15e977-f2ae-4761-943f-1eaf75fd736d");
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -569,6 +592,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-07 11:39:03.709153554 +00:00:00");
|
||||
assert_eq!(dump.instance_uid().unwrap(), None);
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -662,6 +689,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2022-10-09 20:27:59.904096267 +00:00:00");
|
||||
assert_eq!(dump.instance_uid().unwrap(), None);
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -755,6 +786,10 @@ pub(crate) mod test {
|
||||
insta::assert_snapshot!(dump.date().unwrap(), @"2023-01-30 16:26:09.247261 +00:00:00");
|
||||
assert_eq!(dump.instance_uid().unwrap(), None);
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
@@ -831,6 +866,10 @@ pub(crate) mod test {
|
||||
assert_eq!(dump.date(), None);
|
||||
assert_eq!(dump.instance_uid().unwrap(), None);
|
||||
|
||||
// batches didn't exists at the time
|
||||
let batches = dump.batches().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
meili_snap::snapshot!(meili_snap::json_string!(batches), @"[]");
|
||||
|
||||
// tasks
|
||||
let tasks = dump.tasks().unwrap().collect::<Result<Vec<_>>>().unwrap();
|
||||
let (tasks, update_files): (Vec<_>, Vec<_>) = tasks.into_iter().unzip();
|
||||
|
||||
@@ -18,6 +18,7 @@ pub type Checked = meilisearch_types::settings::Checked;
|
||||
pub type Unchecked = meilisearch_types::settings::Unchecked;
|
||||
|
||||
pub type Task = crate::TaskDump;
|
||||
pub type Batch = meilisearch_types::batches::Batch;
|
||||
pub type Key = meilisearch_types::keys::Key;
|
||||
pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures;
|
||||
pub type Network = meilisearch_types::features::Network;
|
||||
@@ -49,6 +50,7 @@ pub struct V6Reader {
|
||||
instance_uid: Option<Uuid>,
|
||||
metadata: Metadata,
|
||||
tasks: BufReader<File>,
|
||||
batches: Option<BufReader<File>>,
|
||||
keys: BufReader<File>,
|
||||
features: Option<RuntimeTogglableFeatures>,
|
||||
network: Option<Network>,
|
||||
@@ -79,6 +81,12 @@ impl V6Reader {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let batches = match File::open(dump.path().join("batches").join("queue.jsonl")) {
|
||||
Ok(file) => Some(BufReader::new(file)),
|
||||
// The batch file was only introduced during the v1.13, anything prior to that won't have batches
|
||||
Err(err) if err.kind() == ErrorKind::NotFound => None,
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
let network_file = match fs::read(dump.path().join("network.json")) {
|
||||
Ok(network_file) => Some(network_file),
|
||||
@@ -101,6 +109,7 @@ impl V6Reader {
|
||||
metadata: serde_json::from_reader(&*meta_file)?,
|
||||
instance_uid,
|
||||
tasks: BufReader::new(File::open(dump.path().join("tasks").join("queue.jsonl"))?),
|
||||
batches,
|
||||
keys: BufReader::new(File::open(dump.path().join("keys.jsonl"))?),
|
||||
features,
|
||||
network,
|
||||
@@ -144,7 +153,7 @@ impl V6Reader {
|
||||
&mut self,
|
||||
) -> Box<dyn Iterator<Item = Result<(Task, Option<Box<super::UpdateFile>>)>> + '_> {
|
||||
Box::new((&mut self.tasks).lines().map(|line| -> Result<_> {
|
||||
let task: Task = serde_json::from_str(&line?).unwrap();
|
||||
let task: Task = serde_json::from_str(&line?)?;
|
||||
|
||||
let update_file_path = self
|
||||
.dump
|
||||
@@ -156,8 +165,7 @@ impl V6Reader {
|
||||
if update_file_path.exists() {
|
||||
Ok((
|
||||
task,
|
||||
Some(Box::new(UpdateFile::new(&update_file_path).unwrap())
|
||||
as Box<super::UpdateFile>),
|
||||
Some(Box::new(UpdateFile::new(&update_file_path)?) as Box<super::UpdateFile>),
|
||||
))
|
||||
} else {
|
||||
Ok((task, None))
|
||||
@@ -165,6 +173,16 @@ impl V6Reader {
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn batches(&mut self) -> Box<dyn Iterator<Item = Result<Batch>> + '_> {
|
||||
match self.batches.as_mut() {
|
||||
Some(batches) => Box::new((batches).lines().map(|line| -> Result<_> {
|
||||
let batch = serde_json::from_str(&line?)?;
|
||||
Ok(batch)
|
||||
})),
|
||||
None => Box::new(std::iter::empty()) as Box<dyn Iterator<Item = Result<Batch>> + '_>,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keys(&mut self) -> Box<dyn Iterator<Item = Result<Key>> + '_> {
|
||||
Box::new(
|
||||
(&mut self.keys).lines().map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }),
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::path::PathBuf;
|
||||
|
||||
use flate2::write::GzEncoder;
|
||||
use flate2::Compression;
|
||||
use meilisearch_types::batches::Batch;
|
||||
use meilisearch_types::features::{Network, RuntimeTogglableFeatures};
|
||||
use meilisearch_types::keys::Key;
|
||||
use meilisearch_types::settings::{Checked, Settings};
|
||||
@@ -54,6 +55,10 @@ impl DumpWriter {
|
||||
TaskWriter::new(self.dir.path().join("tasks"))
|
||||
}
|
||||
|
||||
pub fn create_batches_queue(&self) -> Result<BatchWriter> {
|
||||
BatchWriter::new(self.dir.path().join("batches"))
|
||||
}
|
||||
|
||||
pub fn create_experimental_features(&self, features: RuntimeTogglableFeatures) -> Result<()> {
|
||||
Ok(std::fs::write(
|
||||
self.dir.path().join("experimental-features.json"),
|
||||
@@ -88,7 +93,7 @@ impl KeyWriter {
|
||||
}
|
||||
|
||||
pub fn push_key(&mut self, key: &Key) -> Result<()> {
|
||||
self.keys.write_all(&serde_json::to_vec(key)?)?;
|
||||
serde_json::to_writer(&mut self.keys, &key)?;
|
||||
self.keys.write_all(b"\n")?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -118,7 +123,7 @@ impl TaskWriter {
|
||||
/// Pushes tasks in the dump.
|
||||
/// If the tasks has an associated `update_file` it'll use the `task_id` as its name.
|
||||
pub fn push_task(&mut self, task: &TaskDump) -> Result<UpdateFile> {
|
||||
self.queue.write_all(&serde_json::to_vec(task)?)?;
|
||||
serde_json::to_writer(&mut self.queue, &task)?;
|
||||
self.queue.write_all(b"\n")?;
|
||||
|
||||
Ok(UpdateFile::new(self.update_files.join(format!("{}.jsonl", task.uid))))
|
||||
@@ -130,6 +135,30 @@ impl TaskWriter {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BatchWriter {
|
||||
queue: BufWriter<File>,
|
||||
}
|
||||
|
||||
impl BatchWriter {
|
||||
pub(crate) fn new(path: PathBuf) -> Result<Self> {
|
||||
std::fs::create_dir(&path)?;
|
||||
let queue = File::create(path.join("queue.jsonl"))?;
|
||||
Ok(BatchWriter { queue: BufWriter::new(queue) })
|
||||
}
|
||||
|
||||
/// Pushes batches in the dump.
|
||||
pub fn push_batch(&mut self, batch: &Batch) -> Result<()> {
|
||||
serde_json::to_writer(&mut self.queue, &batch)?;
|
||||
self.queue.write_all(b"\n")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn flush(mut self) -> Result<()> {
|
||||
self.queue.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct UpdateFile {
|
||||
path: PathBuf,
|
||||
writer: Option<BufWriter<File>>,
|
||||
@@ -141,8 +170,8 @@ impl UpdateFile {
|
||||
}
|
||||
|
||||
pub fn push_document(&mut self, document: &Document) -> Result<()> {
|
||||
if let Some(writer) = self.writer.as_mut() {
|
||||
writer.write_all(&serde_json::to_vec(document)?)?;
|
||||
if let Some(mut writer) = self.writer.as_mut() {
|
||||
serde_json::to_writer(&mut writer, &document)?;
|
||||
writer.write_all(b"\n")?;
|
||||
} else {
|
||||
let file = File::create(&self.path).unwrap();
|
||||
@@ -209,8 +238,8 @@ pub(crate) mod test {
|
||||
use super::*;
|
||||
use crate::reader::Document;
|
||||
use crate::test::{
|
||||
create_test_api_keys, create_test_documents, create_test_dump, create_test_instance_uid,
|
||||
create_test_settings, create_test_tasks,
|
||||
create_test_api_keys, create_test_batches, create_test_documents, create_test_dump,
|
||||
create_test_instance_uid, create_test_settings, create_test_tasks,
|
||||
};
|
||||
|
||||
fn create_directory_hierarchy(dir: &Path) -> String {
|
||||
@@ -285,8 +314,10 @@ pub(crate) mod test {
|
||||
let dump_path = dump.path();
|
||||
|
||||
// ==== checking global file hierarchy (we want to be sure there isn't too many files or too few)
|
||||
insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r###"
|
||||
insta::assert_snapshot!(create_directory_hierarchy(dump_path), @r"
|
||||
.
|
||||
├---- batches/
|
||||
│ └---- queue.jsonl
|
||||
├---- indexes/
|
||||
│ └---- doggos/
|
||||
│ │ ├---- documents.jsonl
|
||||
@@ -301,7 +332,7 @@ pub(crate) mod test {
|
||||
├---- keys.jsonl
|
||||
├---- metadata.json
|
||||
└---- network.json
|
||||
"###);
|
||||
");
|
||||
|
||||
// ==== checking the top level infos
|
||||
let metadata = fs::read_to_string(dump_path.join("metadata.json")).unwrap();
|
||||
@@ -354,6 +385,16 @@ pub(crate) mod test {
|
||||
}
|
||||
}
|
||||
|
||||
// ==== checking the batch queue
|
||||
let batches_queue = fs::read_to_string(dump_path.join("batches/queue.jsonl")).unwrap();
|
||||
for (batch, expected) in batches_queue.lines().zip(create_test_batches()) {
|
||||
let mut batch = serde_json::from_str::<Batch>(batch).unwrap();
|
||||
if batch.details.settings == Some(Box::new(Settings::<Unchecked>::default())) {
|
||||
batch.details.settings = None;
|
||||
}
|
||||
assert_eq!(batch, expected, "{batch:#?}{expected:#?}");
|
||||
}
|
||||
|
||||
// ==== checking the keys
|
||||
let keys = fs::read_to_string(dump_path.join("keys.jsonl")).unwrap();
|
||||
for (key, expected) in keys.lines().zip(create_test_api_keys()) {
|
||||
|
||||
@@ -17,6 +17,7 @@ clap = { version = "4.5.24", features = ["derive"] }
|
||||
either = "1.13.0"
|
||||
fastrand = "2.3.0"
|
||||
milli = { path = "../milli" }
|
||||
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
serde_json = { version = "1.0.135", features = ["preserve_order"] }
|
||||
tempfile = "3.15.0"
|
||||
|
||||
@@ -12,6 +12,7 @@ use milli::documents::mmap_from_objects;
|
||||
use milli::heed::EnvOpenOptions;
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::Index;
|
||||
@@ -121,6 +122,11 @@ fn main() {
|
||||
}
|
||||
}
|
||||
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism(
|
||||
"index".into(),
|
||||
);
|
||||
|
||||
let (document_changes, _operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
&indexer_alloc,
|
||||
@@ -130,12 +136,15 @@ fn main() {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -28,6 +28,7 @@ memmap2 = "0.9.5"
|
||||
page_size = "0.6.0"
|
||||
rayon = "1.10.0"
|
||||
roaring = { version = "0.10.10", features = ["serde"] }
|
||||
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
serde_json = { version = "1.0.135", features = ["preserve_order"] }
|
||||
synchronoise = "1.0.1"
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use dump::{KindDump, TaskDump, UpdateFile};
|
||||
use meilisearch_types::batches::{Batch, BatchId};
|
||||
use meilisearch_types::heed::RwTxn;
|
||||
use meilisearch_types::milli;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
|
||||
@@ -14,9 +15,15 @@ pub struct Dump<'a> {
|
||||
index_scheduler: &'a IndexScheduler,
|
||||
wtxn: RwTxn<'a>,
|
||||
|
||||
batch_to_task_mapping: HashMap<BatchId, RoaringBitmap>,
|
||||
|
||||
indexes: HashMap<String, RoaringBitmap>,
|
||||
statuses: HashMap<Status, RoaringBitmap>,
|
||||
kinds: HashMap<Kind, RoaringBitmap>,
|
||||
|
||||
batch_indexes: HashMap<String, RoaringBitmap>,
|
||||
batch_statuses: HashMap<Status, RoaringBitmap>,
|
||||
batch_kinds: HashMap<Kind, RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl<'a> Dump<'a> {
|
||||
@@ -27,12 +34,72 @@ impl<'a> Dump<'a> {
|
||||
Ok(Dump {
|
||||
index_scheduler,
|
||||
wtxn,
|
||||
batch_to_task_mapping: HashMap::new(),
|
||||
indexes: HashMap::new(),
|
||||
statuses: HashMap::new(),
|
||||
kinds: HashMap::new(),
|
||||
batch_indexes: HashMap::new(),
|
||||
batch_statuses: HashMap::new(),
|
||||
batch_kinds: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Register a new batch coming from a dump in the scheduler.
|
||||
/// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running.
|
||||
pub fn register_dumped_batch(&mut self, batch: Batch) -> Result<()> {
|
||||
self.index_scheduler.queue.batches.all_batches.put(&mut self.wtxn, &batch.uid, &batch)?;
|
||||
if let Some(enqueued_at) = batch.enqueued_at {
|
||||
utils::insert_task_datetime(
|
||||
&mut self.wtxn,
|
||||
self.index_scheduler.queue.batches.enqueued_at,
|
||||
enqueued_at.earliest,
|
||||
batch.uid,
|
||||
)?;
|
||||
utils::insert_task_datetime(
|
||||
&mut self.wtxn,
|
||||
self.index_scheduler.queue.batches.enqueued_at,
|
||||
enqueued_at.oldest,
|
||||
batch.uid,
|
||||
)?;
|
||||
}
|
||||
utils::insert_task_datetime(
|
||||
&mut self.wtxn,
|
||||
self.index_scheduler.queue.batches.started_at,
|
||||
batch.started_at,
|
||||
batch.uid,
|
||||
)?;
|
||||
if let Some(finished_at) = batch.finished_at {
|
||||
utils::insert_task_datetime(
|
||||
&mut self.wtxn,
|
||||
self.index_scheduler.queue.batches.finished_at,
|
||||
finished_at,
|
||||
batch.uid,
|
||||
)?;
|
||||
}
|
||||
|
||||
for index in batch.stats.index_uids.keys() {
|
||||
match self.batch_indexes.get_mut(index) {
|
||||
Some(bitmap) => {
|
||||
bitmap.insert(batch.uid);
|
||||
}
|
||||
None => {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(batch.uid);
|
||||
self.batch_indexes.insert(index.to_string(), bitmap);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
for status in batch.stats.status.keys() {
|
||||
self.batch_statuses.entry(*status).or_default().insert(batch.uid);
|
||||
}
|
||||
for kind in batch.stats.types.keys() {
|
||||
self.batch_kinds.entry(*kind).or_default().insert(batch.uid);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Register a new task coming from a dump in the scheduler.
|
||||
/// By taking a mutable ref we're pretty sure no one will ever import a dump while actix is running.
|
||||
pub fn register_dumped_task(
|
||||
@@ -149,6 +216,9 @@ impl<'a> Dump<'a> {
|
||||
};
|
||||
|
||||
self.index_scheduler.queue.tasks.all_tasks.put(&mut self.wtxn, &task.uid, &task)?;
|
||||
if let Some(batch_id) = task.batch_uid {
|
||||
self.batch_to_task_mapping.entry(batch_id).or_default().insert(task.uid);
|
||||
}
|
||||
|
||||
for index in task.indexes() {
|
||||
match self.indexes.get_mut(index) {
|
||||
@@ -198,6 +268,14 @@ impl<'a> Dump<'a> {
|
||||
|
||||
/// Commit all the changes and exit the importing dump state
|
||||
pub fn finish(mut self) -> Result<()> {
|
||||
for (batch_id, task_ids) in self.batch_to_task_mapping {
|
||||
self.index_scheduler.queue.batch_to_tasks_mapping.put(
|
||||
&mut self.wtxn,
|
||||
&batch_id,
|
||||
&task_ids,
|
||||
)?;
|
||||
}
|
||||
|
||||
for (index, bitmap) in self.indexes {
|
||||
self.index_scheduler.queue.tasks.index_tasks.put(&mut self.wtxn, &index, &bitmap)?;
|
||||
}
|
||||
@@ -208,6 +286,16 @@ impl<'a> Dump<'a> {
|
||||
self.index_scheduler.queue.tasks.put_kind(&mut self.wtxn, kind, &bitmap)?;
|
||||
}
|
||||
|
||||
for (index, bitmap) in self.batch_indexes {
|
||||
self.index_scheduler.queue.batches.index_tasks.put(&mut self.wtxn, &index, &bitmap)?;
|
||||
}
|
||||
for (status, bitmap) in self.batch_statuses {
|
||||
self.index_scheduler.queue.batches.put_status(&mut self.wtxn, status, &bitmap)?;
|
||||
}
|
||||
for (kind, bitmap) in self.batch_kinds {
|
||||
self.index_scheduler.queue.batches.put_kind(&mut self.wtxn, kind, &bitmap)?;
|
||||
}
|
||||
|
||||
self.wtxn.commit()?;
|
||||
self.index_scheduler.scheduler.wake_up.signal();
|
||||
|
||||
|
||||
@@ -109,6 +109,8 @@ pub enum Error {
|
||||
InvalidIndexUid { index_uid: String },
|
||||
#[error("Task `{0}` not found.")]
|
||||
TaskNotFound(TaskId),
|
||||
#[error("Task `{0}` does not contain any documents. Only `documentAdditionOrUpdate` tasks with the statuses `enqueued` or `processing` contain documents")]
|
||||
TaskFileNotFound(TaskId),
|
||||
#[error("Batch `{0}` not found.")]
|
||||
BatchNotFound(BatchId),
|
||||
#[error("Query parameters to filter the tasks to delete are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
|
||||
@@ -189,6 +191,7 @@ impl Error {
|
||||
| Error::InvalidTaskCanceledBy { .. }
|
||||
| Error::InvalidIndexUid { .. }
|
||||
| Error::TaskNotFound(_)
|
||||
| Error::TaskFileNotFound(_)
|
||||
| Error::BatchNotFound(_)
|
||||
| Error::TaskDeletionWithEmptyQuery
|
||||
| Error::TaskCancelationWithEmptyQuery
|
||||
@@ -250,6 +253,7 @@ impl ErrorCode for Error {
|
||||
Error::InvalidTaskCanceledBy { .. } => Code::InvalidTaskCanceledBy,
|
||||
Error::InvalidIndexUid { .. } => Code::InvalidIndexUid,
|
||||
Error::TaskNotFound(_) => Code::TaskNotFound,
|
||||
Error::TaskFileNotFound(_) => Code::TaskFileNotFound,
|
||||
Error::BatchNotFound(_) => Code::BatchNotFound,
|
||||
Error::TaskDeletionWithEmptyQuery => Code::MissingTaskFilters,
|
||||
Error::TaskCancelationWithEmptyQuery => Code::MissingTaskFilters,
|
||||
|
||||
@@ -105,6 +105,19 @@ impl RoFeatures {
|
||||
.into())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_get_task_documents_route(&self) -> Result<()> {
|
||||
if self.runtime.get_task_documents_route {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Getting the documents of an enqueued task",
|
||||
feature: "get task documents route",
|
||||
issue_link: "https://github.com/orgs/meilisearch/discussions/808",
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FeatureData {
|
||||
|
||||
@@ -106,6 +106,12 @@ pub struct IndexStats {
|
||||
/// are not returned to the disk after a deletion, this number is typically larger than
|
||||
/// `used_database_size` that only includes the size of the used pages.
|
||||
pub database_size: u64,
|
||||
/// Number of embeddings in the index.
|
||||
/// Optional for backward compatibility with the stats of the pre-v1.13.0 versions of Meilisearch
|
||||
pub number_of_embeddings: Option<u64>,
|
||||
/// Number of embedded documents in the index.
|
||||
/// Optional for backward compatibility with the stats of the pre-v1.13.0 versions of Meilisearch
|
||||
pub number_of_embedded_documents: Option<u64>,
|
||||
/// Size taken by the used pages of the index's DB, in bytes.
|
||||
///
|
||||
/// As the DB backend does not return to the disk the pages that are not currently used by the DB,
|
||||
@@ -130,8 +136,11 @@ impl IndexStats {
|
||||
///
|
||||
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
|
||||
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
|
||||
let arroy_stats = index.arroy_stats(rtxn)?;
|
||||
Ok(IndexStats {
|
||||
number_of_documents: index.number_of_documents(rtxn)?,
|
||||
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
|
||||
number_of_embedded_documents: Some(arroy_stats.documents.len()),
|
||||
database_size: index.on_disk_size()?,
|
||||
used_database_size: index.used_size()?,
|
||||
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
|
||||
|
||||
@@ -33,7 +33,7 @@ mod test_utils;
|
||||
pub mod upgrade;
|
||||
mod utils;
|
||||
pub mod uuid_codec;
|
||||
mod versioning;
|
||||
pub mod versioning;
|
||||
|
||||
pub type Result<T, E = Error> = std::result::Result<T, E>;
|
||||
pub type TaskId = u32;
|
||||
|
||||
@@ -96,6 +96,7 @@ make_enum_progress! {
|
||||
StartTheDumpCreation,
|
||||
DumpTheApiKeys,
|
||||
DumpTheTasks,
|
||||
DumpTheBatches,
|
||||
DumpTheIndexes,
|
||||
DumpTheExperimentalFeatures,
|
||||
CompressTheDump,
|
||||
|
||||
@@ -8,6 +8,7 @@ mod tasks_test;
|
||||
mod test;
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::File as StdFile;
|
||||
use std::time::Duration;
|
||||
|
||||
use file_store::FileStore;
|
||||
@@ -216,6 +217,11 @@ impl Queue {
|
||||
}
|
||||
}
|
||||
|
||||
/// Opens and returns the task's content file.
|
||||
pub fn update_file(&self, uuid: Uuid) -> file_store::Result<StdFile> {
|
||||
self.file_store.get_update(uuid)
|
||||
}
|
||||
|
||||
/// Delete a file from the index scheduler.
|
||||
///
|
||||
/// Counterpart to the [`create_update_file`](IndexScheduler::create_update_file) method.
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::fs::File;
|
||||
use std::io::BufWriter;
|
||||
use std::sync::atomic::Ordering;
|
||||
@@ -11,7 +12,9 @@ use meilisearch_types::tasks::{Details, KindWithContent, Status, Task};
|
||||
use time::macros::format_description;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::processing::{AtomicDocumentStep, AtomicTaskStep, DumpCreationProgress};
|
||||
use crate::processing::{
|
||||
AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, DumpCreationProgress,
|
||||
};
|
||||
use crate::{Error, IndexScheduler, Result};
|
||||
|
||||
impl IndexScheduler {
|
||||
@@ -102,7 +105,40 @@ impl IndexScheduler {
|
||||
}
|
||||
dump_tasks.flush()?;
|
||||
|
||||
// 3. Dump the indexes
|
||||
// 3. dump the batches
|
||||
progress.update_progress(DumpCreationProgress::DumpTheBatches);
|
||||
let mut dump_batches = dump.create_batches_queue()?;
|
||||
|
||||
let (atomic_batch_progress, update_batch_progress) =
|
||||
AtomicBatchStep::new(self.queue.batches.all_batches.len(&rtxn)? as u32);
|
||||
progress.update_progress(update_batch_progress);
|
||||
|
||||
for ret in self.queue.batches.all_batches.iter(&rtxn)? {
|
||||
if self.scheduler.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
|
||||
let (_, mut b) = ret?;
|
||||
// In the case we're dumping ourselves we want to be marked as finished
|
||||
// to not loop over ourselves indefinitely.
|
||||
if b.uid == task.uid {
|
||||
let finished_at = OffsetDateTime::now_utc();
|
||||
|
||||
// We're going to fake the date because we don't know if everything is going to go well.
|
||||
// But we need to dump the task as finished and successful.
|
||||
// If something fails, everything will be set appropriately in the end.
|
||||
let mut statuses = BTreeMap::new();
|
||||
statuses.insert(Status::Succeeded, b.stats.total_nb_tasks);
|
||||
b.stats.status = statuses;
|
||||
b.finished_at = Some(finished_at);
|
||||
}
|
||||
|
||||
dump_batches.push_batch(&b)?;
|
||||
atomic_batch_progress.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
dump_batches.flush()?;
|
||||
|
||||
// 4. Dump the indexes
|
||||
progress.update_progress(DumpCreationProgress::DumpTheIndexes);
|
||||
let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32;
|
||||
let mut count = 0;
|
||||
@@ -142,7 +178,7 @@ impl IndexScheduler {
|
||||
let documents = index
|
||||
.all_documents(&rtxn)
|
||||
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
|
||||
// 3.1. Dump the documents
|
||||
// 4.1. Dump the documents
|
||||
for ret in documents {
|
||||
if self.scheduler.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
@@ -204,7 +240,7 @@ impl IndexScheduler {
|
||||
atomic.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
// 3.2. Dump the settings
|
||||
// 4.2. Dump the settings
|
||||
let settings = meilisearch_types::settings::settings(
|
||||
index,
|
||||
&rtxn,
|
||||
@@ -215,7 +251,7 @@ impl IndexScheduler {
|
||||
Ok(())
|
||||
})?;
|
||||
|
||||
// 4. Dump experimental feature settings
|
||||
// 5. Dump experimental feature settings
|
||||
progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures);
|
||||
let features = self.features().runtime_features();
|
||||
dump.create_experimental_features(features)?;
|
||||
|
||||
@@ -3,6 +3,7 @@ use bumpalo::Bump;
|
||||
use meilisearch_types::heed::RwTxn;
|
||||
use meilisearch_types::milli::documents::PrimaryKey;
|
||||
use meilisearch_types::milli::progress::Progress;
|
||||
use meilisearch_types::milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction};
|
||||
use meilisearch_types::milli::update::DocumentAdditionResult;
|
||||
use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder};
|
||||
@@ -112,17 +113,24 @@ impl IndexScheduler {
|
||||
|
||||
let local_pool;
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new()
|
||||
.thread_name(|i| format!("indexing-thread-{i}"))
|
||||
.thread_name(|i| format!("rayon-{i}"))
|
||||
.build()
|
||||
.unwrap();
|
||||
&local_pool
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => {
|
||||
&scoped_thread_pool::ThreadPool::with_available_parallelism("index".into())
|
||||
}
|
||||
};
|
||||
|
||||
progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges);
|
||||
let (document_changes, operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
@@ -133,6 +141,8 @@ impl IndexScheduler {
|
||||
&mut new_fields_ids_map,
|
||||
&|| must_stop_processing.get(),
|
||||
progress.clone(),
|
||||
thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
|
||||
|
||||
@@ -173,6 +183,7 @@ impl IndexScheduler {
|
||||
indexer::index(
|
||||
index_wtxn,
|
||||
index,
|
||||
thread_pool,
|
||||
pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -261,7 +272,7 @@ impl IndexScheduler {
|
||||
if task.error.is_none() {
|
||||
let local_pool;
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new()
|
||||
@@ -272,16 +283,19 @@ impl IndexScheduler {
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => &scoped_thread_pool::ThreadPool::with_available_parallelism(
|
||||
"index".into(),
|
||||
),
|
||||
};
|
||||
|
||||
let candidates_count = candidates.len();
|
||||
progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges);
|
||||
let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone());
|
||||
let document_changes = pool
|
||||
.install(|| {
|
||||
indexer
|
||||
.into_changes(&primary_key)
|
||||
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))
|
||||
})
|
||||
.unwrap()?;
|
||||
let document_changes = indexer
|
||||
.into_changes(&primary_key, &indexer_alloc, thread_pool, CHUNK_SIZE)
|
||||
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
|
||||
let embedders = index
|
||||
.embedding_configs(index_wtxn)
|
||||
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
|
||||
@@ -291,6 +305,7 @@ impl IndexScheduler {
|
||||
indexer::index(
|
||||
index_wtxn,
|
||||
index,
|
||||
thread_pool,
|
||||
pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -421,7 +436,7 @@ impl IndexScheduler {
|
||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||
let local_pool;
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new()
|
||||
@@ -432,11 +447,19 @@ impl IndexScheduler {
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => &scoped_thread_pool::ThreadPool::with_available_parallelism(
|
||||
"index".into(),
|
||||
),
|
||||
};
|
||||
|
||||
progress.update_progress(DocumentDeletionProgress::DeleteDocuments);
|
||||
let mut indexer = indexer::DocumentDeletion::new();
|
||||
let candidates_count = to_delete.len();
|
||||
indexer.delete_documents_by_docids(to_delete);
|
||||
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
|
||||
let document_changes =
|
||||
indexer.into_changes(&indexer_alloc, primary_key, thread_pool, CHUNK_SIZE);
|
||||
let embedders = index
|
||||
.embedding_configs(index_wtxn)
|
||||
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
|
||||
@@ -446,6 +469,7 @@ impl IndexScheduler {
|
||||
indexer::index(
|
||||
index_wtxn,
|
||||
index,
|
||||
thread_pool,
|
||||
pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -903,7 +903,7 @@ fn create_and_list_index() {
|
||||
|
||||
index_scheduler.index("kefir").unwrap();
|
||||
let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap();
|
||||
snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#"
|
||||
snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###"
|
||||
[
|
||||
1,
|
||||
[
|
||||
@@ -912,6 +912,8 @@ fn create_and_list_index() {
|
||||
{
|
||||
"number_of_documents": 0,
|
||||
"database_size": "[bytes]",
|
||||
"number_of_embeddings": 0,
|
||||
"number_of_embedded_documents": 0,
|
||||
"used_database_size": "[bytes]",
|
||||
"primary_key": null,
|
||||
"field_distribution": {},
|
||||
@@ -921,5 +923,5 @@ fn create_and_list_index() {
|
||||
]
|
||||
]
|
||||
]
|
||||
"#);
|
||||
"###);
|
||||
}
|
||||
|
||||
@@ -6,8 +6,7 @@ use meili_snap::snapshot;
|
||||
use meilisearch_types::milli::obkv_to_json;
|
||||
use meilisearch_types::milli::update::IndexDocumentsMethod::*;
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::tasks::Kind;
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent};
|
||||
|
||||
use crate::insta_snapshot::snapshot_index_scheduler;
|
||||
use crate::test_utils::Breakpoint::*;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use crate::{upgrade::upgrade_index_scheduler, Result};
|
||||
use meilisearch_types::{
|
||||
heed::{types::Str, Database, Env, RoTxn, RwTxn},
|
||||
milli::heed_codec::version::VersionCodec,
|
||||
versioning,
|
||||
};
|
||||
use meilisearch_types::heed::types::Str;
|
||||
use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::heed_codec::version::VersionCodec;
|
||||
use meilisearch_types::versioning;
|
||||
|
||||
use crate::upgrade::upgrade_index_scheduler;
|
||||
use crate::Result;
|
||||
|
||||
/// The number of databases used by the queue itself
|
||||
const NUMBER_OF_DATABASES: u32 = 1;
|
||||
@@ -21,30 +22,38 @@ pub struct Versioning {
|
||||
}
|
||||
|
||||
impl Versioning {
|
||||
pub(crate) const fn nb_db() -> u32 {
|
||||
pub const fn nb_db() -> u32 {
|
||||
NUMBER_OF_DATABASES
|
||||
}
|
||||
|
||||
pub fn get_version(&self, rtxn: &RoTxn) -> Result<Option<(u32, u32, u32)>> {
|
||||
Ok(self.version.get(rtxn, entry_name::MAIN)?)
|
||||
pub fn get_version(&self, rtxn: &RoTxn) -> Result<Option<(u32, u32, u32)>, heed::Error> {
|
||||
self.version.get(rtxn, entry_name::MAIN)
|
||||
}
|
||||
|
||||
pub fn set_version(&self, wtxn: &mut RwTxn, version: (u32, u32, u32)) -> Result<()> {
|
||||
Ok(self.version.put(wtxn, entry_name::MAIN, &version)?)
|
||||
pub fn set_version(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
version: (u32, u32, u32),
|
||||
) -> Result<(), heed::Error> {
|
||||
self.version.put(wtxn, entry_name::MAIN, &version)
|
||||
}
|
||||
|
||||
pub fn set_current_version(&self, wtxn: &mut RwTxn) -> Result<()> {
|
||||
pub fn set_current_version(&self, wtxn: &mut RwTxn) -> Result<(), heed::Error> {
|
||||
let major = versioning::VERSION_MAJOR.parse().unwrap();
|
||||
let minor = versioning::VERSION_MINOR.parse().unwrap();
|
||||
let patch = versioning::VERSION_PATCH.parse().unwrap();
|
||||
self.set_version(wtxn, (major, minor, patch))
|
||||
}
|
||||
|
||||
/// Create an index scheduler and start its run loop.
|
||||
/// Return `Self` without checking anything about the version
|
||||
pub fn raw_new(env: &Env, wtxn: &mut RwTxn) -> Result<Self, heed::Error> {
|
||||
let version = env.create_database(wtxn, Some(db_name::VERSION))?;
|
||||
Ok(Self { version })
|
||||
}
|
||||
|
||||
pub(crate) fn new(env: &Env, db_version: (u32, u32, u32)) -> Result<Self> {
|
||||
let mut wtxn = env.write_txn()?;
|
||||
let version = env.create_database(&mut wtxn, Some(db_name::VERSION))?;
|
||||
let this = Self { version };
|
||||
let this = Self::raw_new(env, &mut wtxn)?;
|
||||
let from = match this.get_version(&wtxn)? {
|
||||
Some(version) => version,
|
||||
// fresh DB: use the db version
|
||||
|
||||
@@ -30,7 +30,21 @@ pub struct Batch {
|
||||
pub enqueued_at: Option<BatchEnqueuedAt>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
|
||||
impl PartialEq for Batch {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
let Self { uid, progress, details, stats, started_at, finished_at, enqueued_at } = self;
|
||||
|
||||
*uid == other.uid
|
||||
&& progress.is_none() == other.progress.is_none()
|
||||
&& details == &other.details
|
||||
&& stats == &other.stats
|
||||
&& started_at == &other.started_at
|
||||
&& finished_at == &other.finished_at
|
||||
&& enqueued_at == &other.enqueued_at
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct BatchEnqueuedAt {
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub earliest: OffsetDateTime,
|
||||
@@ -38,7 +52,7 @@ pub struct BatchEnqueuedAt {
|
||||
pub oldest: OffsetDateTime,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Serialize, Deserialize, ToSchema)]
|
||||
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
#[schema(rename_all = "camelCase")]
|
||||
pub struct BatchStats {
|
||||
|
||||
@@ -372,6 +372,7 @@ RemoteRemoteError , System , BAD_GATEWAY ;
|
||||
RemoteTimeout , System , BAD_GATEWAY ;
|
||||
TooManySearchRequests , System , SERVICE_UNAVAILABLE ;
|
||||
TaskNotFound , InvalidRequest , NOT_FOUND ;
|
||||
TaskFileNotFound , InvalidRequest , NOT_FOUND ;
|
||||
BatchNotFound , InvalidRequest , NOT_FOUND ;
|
||||
TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ;
|
||||
TooManyVectors , InvalidRequest , BAD_REQUEST ;
|
||||
|
||||
@@ -10,6 +10,7 @@ pub struct RuntimeTogglableFeatures {
|
||||
pub edit_documents_by_function: bool,
|
||||
pub contains_filter: bool,
|
||||
pub network: bool,
|
||||
pub get_task_documents_route: bool,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy)]
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
use std::fs;
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::path::Path;
|
||||
|
||||
use milli::heed;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
/// The name of the file that contains the version of the database.
|
||||
pub const VERSION_FILE_NAME: &str = "VERSION";
|
||||
|
||||
@@ -10,37 +13,7 @@ pub static VERSION_MINOR: &str = env!("CARGO_PKG_VERSION_MINOR");
|
||||
pub static VERSION_PATCH: &str = env!("CARGO_PKG_VERSION_PATCH");
|
||||
|
||||
/// Persists the version of the current Meilisearch binary to a VERSION file
|
||||
pub fn update_version_file_for_dumpless_upgrade(
|
||||
db_path: &Path,
|
||||
from: (u32, u32, u32),
|
||||
to: (u32, u32, u32),
|
||||
) -> Result<(), VersionFileError> {
|
||||
let (from_major, from_minor, from_patch) = from;
|
||||
let (to_major, to_minor, to_patch) = to;
|
||||
|
||||
if from_major > to_major
|
||||
|| (from_major == to_major && from_minor > to_minor)
|
||||
|| (from_major == to_major && from_minor == to_minor && from_patch > to_patch)
|
||||
{
|
||||
Err(VersionFileError::DowngradeNotSupported {
|
||||
major: from_major,
|
||||
minor: from_minor,
|
||||
patch: from_patch,
|
||||
})
|
||||
} else if from_major < 1 || (from_major == to_major && from_minor < 12) {
|
||||
Err(VersionFileError::TooOldForAutomaticUpgrade {
|
||||
major: from_major,
|
||||
minor: from_minor,
|
||||
patch: from_patch,
|
||||
})
|
||||
} else {
|
||||
create_current_version_file(db_path)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Persists the version of the current Meilisearch binary to a VERSION file
|
||||
pub fn create_current_version_file(db_path: &Path) -> io::Result<()> {
|
||||
pub fn create_current_version_file(db_path: &Path) -> anyhow::Result<()> {
|
||||
create_version_file(db_path, VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
|
||||
}
|
||||
|
||||
@@ -49,9 +22,14 @@ pub fn create_version_file(
|
||||
major: &str,
|
||||
minor: &str,
|
||||
patch: &str,
|
||||
) -> io::Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
let version_path = db_path.join(VERSION_FILE_NAME);
|
||||
fs::write(version_path, format!("{}.{}.{}", major, minor, patch))
|
||||
// In order to persist the file later we must create it in the `data.ms` and not in `/tmp`
|
||||
let mut file = NamedTempFile::new_in(db_path)?;
|
||||
file.write_all(format!("{}.{}.{}", major, minor, patch).as_bytes())?;
|
||||
file.flush()?;
|
||||
file.persist(version_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_version(db_path: &Path) -> Result<(u32, u32, u32), VersionFileError> {
|
||||
@@ -61,7 +39,7 @@ pub fn get_version(db_path: &Path) -> Result<(u32, u32, u32), VersionFileError>
|
||||
Ok(version) => parse_version(&version),
|
||||
Err(error) => match error.kind() {
|
||||
ErrorKind::NotFound => Err(VersionFileError::MissingVersionFile),
|
||||
_ => Err(error.into()),
|
||||
_ => Err(anyhow::Error::from(error).into()),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -112,7 +90,9 @@ pub enum VersionFileError {
|
||||
DowngradeNotSupported { major: u32, minor: u32, patch: u32 },
|
||||
#[error("Database version {major}.{minor}.{patch} is too old for the experimental dumpless upgrade feature. Please generate a dump using the v{major}.{minor}.{patch} and import it in the v{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_PATCH}")]
|
||||
TooOldForAutomaticUpgrade { major: u32, minor: u32, patch: u32 },
|
||||
#[error("Error while modifying the database: {0}")]
|
||||
ErrorWhileModifyingTheDatabase(#[from] heed::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
IoError(#[from] std::io::Error),
|
||||
AnyhowError(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
@@ -115,6 +115,7 @@ utoipa = { version = "5.3.1", features = [
|
||||
"openapi_extensions",
|
||||
] }
|
||||
utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] }
|
||||
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.10.0"
|
||||
|
||||
@@ -197,6 +197,7 @@ struct Infos {
|
||||
experimental_max_number_of_batched_tasks: usize,
|
||||
experimental_limit_batched_tasks_total_size: u64,
|
||||
experimental_network: bool,
|
||||
experimental_get_task_documents_route: bool,
|
||||
gpu_enabled: bool,
|
||||
db_path: bool,
|
||||
import_dump: bool,
|
||||
@@ -288,6 +289,7 @@ impl Infos {
|
||||
edit_documents_by_function,
|
||||
contains_filter,
|
||||
network,
|
||||
get_task_documents_route,
|
||||
} = features;
|
||||
|
||||
// We're going to override all sensitive information.
|
||||
@@ -306,6 +308,7 @@ impl Infos {
|
||||
experimental_enable_logs_route: experimental_enable_logs_route | logs_route,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
experimental_network: network,
|
||||
experimental_get_task_documents_route: get_task_documents_route,
|
||||
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
|
||||
db_path: db_path != PathBuf::from("./data.ms"),
|
||||
import_dump: import_dump.is_some(),
|
||||
|
||||
@@ -32,6 +32,7 @@ use analytics::Analytics;
|
||||
use anyhow::bail;
|
||||
use error::PayloadError;
|
||||
use extractors::payload::PayloadConfig;
|
||||
use index_scheduler::versioning::Versioning;
|
||||
use index_scheduler::{IndexScheduler, IndexSchedulerOptions};
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::milli::constants::VERSION_MAJOR;
|
||||
@@ -40,10 +41,9 @@ use meilisearch_types::milli::update::{IndexDocumentsConfig, IndexDocumentsMetho
|
||||
use meilisearch_types::settings::apply_settings_to_builder;
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use meilisearch_types::versioning::{
|
||||
create_current_version_file, get_version, update_version_file_for_dumpless_upgrade,
|
||||
VersionFileError, VERSION_MINOR, VERSION_PATCH,
|
||||
create_current_version_file, get_version, VersionFileError, VERSION_MINOR, VERSION_PATCH,
|
||||
};
|
||||
use meilisearch_types::{compression, milli, VERSION_FILE_NAME};
|
||||
use meilisearch_types::{compression, heed, milli, VERSION_FILE_NAME};
|
||||
pub use option::Opt;
|
||||
use option::ScheduleSnapshot;
|
||||
use search_queue::SearchQueue;
|
||||
@@ -356,14 +356,19 @@ fn open_or_create_database_unchecked(
|
||||
|
||||
/// Ensures Meilisearch version is compatible with the database, returns an error in case of version mismatch.
|
||||
/// Returns the version that was contained in the version file
|
||||
fn check_version(opt: &Opt, binary_version: (u32, u32, u32)) -> anyhow::Result<(u32, u32, u32)> {
|
||||
fn check_version(
|
||||
opt: &Opt,
|
||||
index_scheduler_opt: &IndexSchedulerOptions,
|
||||
binary_version: (u32, u32, u32),
|
||||
) -> anyhow::Result<(u32, u32, u32)> {
|
||||
let (bin_major, bin_minor, bin_patch) = binary_version;
|
||||
let (db_major, db_minor, db_patch) = get_version(&opt.db_path)?;
|
||||
|
||||
if db_major != bin_major || db_minor != bin_minor || db_patch > bin_patch {
|
||||
if opt.experimental_dumpless_upgrade {
|
||||
update_version_file_for_dumpless_upgrade(
|
||||
&opt.db_path,
|
||||
opt,
|
||||
index_scheduler_opt,
|
||||
(db_major, db_minor, db_patch),
|
||||
(bin_major, bin_minor, bin_patch),
|
||||
)?;
|
||||
@@ -380,6 +385,57 @@ fn check_version(opt: &Opt, binary_version: (u32, u32, u32)) -> anyhow::Result<(
|
||||
Ok((db_major, db_minor, db_patch))
|
||||
}
|
||||
|
||||
/// Persists the version of the current Meilisearch binary to a VERSION file
|
||||
pub fn update_version_file_for_dumpless_upgrade(
|
||||
opt: &Opt,
|
||||
index_scheduler_opt: &IndexSchedulerOptions,
|
||||
from: (u32, u32, u32),
|
||||
to: (u32, u32, u32),
|
||||
) -> Result<(), VersionFileError> {
|
||||
let (from_major, from_minor, from_patch) = from;
|
||||
let (to_major, to_minor, to_patch) = to;
|
||||
|
||||
// Early exit in case of error
|
||||
if from_major > to_major
|
||||
|| (from_major == to_major && from_minor > to_minor)
|
||||
|| (from_major == to_major && from_minor == to_minor && from_patch > to_patch)
|
||||
{
|
||||
return Err(VersionFileError::DowngradeNotSupported {
|
||||
major: from_major,
|
||||
minor: from_minor,
|
||||
patch: from_patch,
|
||||
});
|
||||
} else if from_major < 1 || (from_major == to_major && from_minor < 12) {
|
||||
return Err(VersionFileError::TooOldForAutomaticUpgrade {
|
||||
major: from_major,
|
||||
minor: from_minor,
|
||||
patch: from_patch,
|
||||
});
|
||||
}
|
||||
|
||||
// In the case of v1.12, the index-scheduler didn't store its internal version at the time.
|
||||
// => We must write it immediately **in the index-scheduler**, before updating the version file; otherwise
|
||||
// there is a risk of DB corruption if a restart happens after writing the version file but before
|
||||
// writing the version in the index-scheduler. See <https://github.com/meilisearch/meilisearch/issues/5280>
|
||||
if from_major == 1 && from_minor == 12 {
|
||||
let env = unsafe {
|
||||
heed::EnvOpenOptions::new()
|
||||
.max_dbs(Versioning::nb_db())
|
||||
.map_size(index_scheduler_opt.task_db_size)
|
||||
.open(&index_scheduler_opt.tasks_path)
|
||||
}?;
|
||||
let mut wtxn = env.write_txn()?;
|
||||
let versioning = Versioning::raw_new(&env, &mut wtxn)?;
|
||||
versioning.set_version(&mut wtxn, (from_major, from_minor, from_patch))?;
|
||||
wtxn.commit()?;
|
||||
// Should be instant since we're the only one using the env
|
||||
env.prepare_for_closing().wait();
|
||||
}
|
||||
|
||||
create_current_version_file(&opt.db_path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Ensure you're in a valid state and open the IndexScheduler + AuthController for you.
|
||||
fn open_or_create_database(
|
||||
opt: &Opt,
|
||||
@@ -387,7 +443,11 @@ fn open_or_create_database(
|
||||
empty_db: bool,
|
||||
binary_version: (u32, u32, u32),
|
||||
) -> anyhow::Result<(IndexScheduler, AuthController)> {
|
||||
let version = if !empty_db { check_version(opt, binary_version)? } else { binary_version };
|
||||
let version = if !empty_db {
|
||||
check_version(opt, &index_scheduler_opt, binary_version)?
|
||||
} else {
|
||||
binary_version
|
||||
};
|
||||
|
||||
open_or_create_database_unchecked(opt, index_scheduler_opt, OnFailure::KeepDb, version)
|
||||
}
|
||||
@@ -511,9 +571,15 @@ fn import_dump(
|
||||
index_scheduler.refresh_index_stats(&uid)?;
|
||||
}
|
||||
|
||||
// 5. Import the queue
|
||||
let mut index_scheduler_dump = index_scheduler.register_dumped_task()?;
|
||||
// 5.1. Import the batches
|
||||
for ret in dump_reader.batches()? {
|
||||
let batch = ret?;
|
||||
index_scheduler_dump.register_dumped_batch(batch)?;
|
||||
}
|
||||
|
||||
// 5. Import the tasks.
|
||||
// 5.2. Import the tasks
|
||||
for ret in dump_reader.tasks()? {
|
||||
let (task, file) = ret?;
|
||||
index_scheduler_dump.register_dumped_task(task, file)?;
|
||||
|
||||
@@ -743,15 +743,21 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
|
||||
type Error = anyhow::Error;
|
||||
|
||||
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
|
||||
let thread_pool = ThreadPoolNoAbortBuilder::new()
|
||||
.thread_name(|index| format!("indexing-thread:{index}"))
|
||||
let rayon_thread_pool = ThreadPoolNoAbortBuilder::new()
|
||||
.thread_name(|index| format!("rayon-{index}"))
|
||||
.num_threads(*other.max_indexing_threads)
|
||||
.build()?;
|
||||
|
||||
let thread_pool = Some(scoped_thread_pool::ThreadPool::new(
|
||||
NonZeroUsize::new(*other.max_indexing_threads).unwrap_or(NonZeroUsize::new(1).unwrap()),
|
||||
"index".to_string(),
|
||||
));
|
||||
|
||||
Ok(Self {
|
||||
log_every_n: Some(DEFAULT_LOG_EVERY_N),
|
||||
max_memory: other.max_indexing_memory.map(|b| b.as_u64() as usize),
|
||||
thread_pool: Some(thread_pool),
|
||||
rayon_thread_pool: Some(rayon_thread_pool),
|
||||
thread_pool,
|
||||
max_positions_per_attributes: None,
|
||||
skip_index_budget: other.skip_index_budget,
|
||||
..Default::default()
|
||||
|
||||
@@ -51,6 +51,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
edit_documents_by_function: Some(false),
|
||||
contains_filter: Some(false),
|
||||
network: Some(false),
|
||||
get_task_documents_route: Some(false),
|
||||
})),
|
||||
(status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
|
||||
{
|
||||
@@ -91,6 +92,8 @@ pub struct RuntimeTogglableFeatures {
|
||||
pub contains_filter: Option<bool>,
|
||||
#[deserr(default)]
|
||||
pub network: Option<bool>,
|
||||
#[deserr(default)]
|
||||
pub get_task_documents_route: Option<bool>,
|
||||
}
|
||||
|
||||
impl From<meilisearch_types::features::RuntimeTogglableFeatures> for RuntimeTogglableFeatures {
|
||||
@@ -101,6 +104,7 @@ impl From<meilisearch_types::features::RuntimeTogglableFeatures> for RuntimeTogg
|
||||
edit_documents_by_function,
|
||||
contains_filter,
|
||||
network,
|
||||
get_task_documents_route,
|
||||
} = value;
|
||||
|
||||
Self {
|
||||
@@ -109,6 +113,7 @@ impl From<meilisearch_types::features::RuntimeTogglableFeatures> for RuntimeTogg
|
||||
edit_documents_by_function: Some(edit_documents_by_function),
|
||||
contains_filter: Some(contains_filter),
|
||||
network: Some(network),
|
||||
get_task_documents_route: Some(get_task_documents_route),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -120,6 +125,7 @@ pub struct PatchExperimentalFeatureAnalytics {
|
||||
edit_documents_by_function: bool,
|
||||
contains_filter: bool,
|
||||
network: bool,
|
||||
get_task_documents_route: bool,
|
||||
}
|
||||
|
||||
impl Aggregate for PatchExperimentalFeatureAnalytics {
|
||||
@@ -134,6 +140,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics {
|
||||
edit_documents_by_function: new.edit_documents_by_function,
|
||||
contains_filter: new.contains_filter,
|
||||
network: new.network,
|
||||
get_task_documents_route: new.get_task_documents_route,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -157,6 +164,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics {
|
||||
edit_documents_by_function: Some(false),
|
||||
contains_filter: Some(false),
|
||||
network: Some(false),
|
||||
get_task_documents_route: Some(false),
|
||||
})),
|
||||
(status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
|
||||
{
|
||||
@@ -190,6 +198,10 @@ async fn patch_features(
|
||||
.unwrap_or(old_features.edit_documents_by_function),
|
||||
contains_filter: new_features.0.contains_filter.unwrap_or(old_features.contains_filter),
|
||||
network: new_features.0.network.unwrap_or(old_features.network),
|
||||
get_task_documents_route: new_features
|
||||
.0
|
||||
.get_task_documents_route
|
||||
.unwrap_or(old_features.get_task_documents_route),
|
||||
};
|
||||
|
||||
// explicitly destructure for analytics rather than using the `Serialize` implementation, because
|
||||
@@ -201,6 +213,7 @@ async fn patch_features(
|
||||
edit_documents_by_function,
|
||||
contains_filter,
|
||||
network,
|
||||
get_task_documents_route,
|
||||
} = new_features;
|
||||
|
||||
analytics.publish(
|
||||
@@ -210,6 +223,7 @@ async fn patch_features(
|
||||
edit_documents_by_function,
|
||||
contains_filter,
|
||||
network,
|
||||
get_task_documents_route,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
|
||||
@@ -496,6 +496,12 @@ pub struct IndexStats {
|
||||
pub number_of_documents: u64,
|
||||
/// Whether or not the index is currently ingesting document
|
||||
pub is_indexing: bool,
|
||||
/// Number of embeddings in the index
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub number_of_embeddings: Option<u64>,
|
||||
/// Number of embedded documents in the index
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub number_of_embedded_documents: Option<u64>,
|
||||
/// Association of every field name with the number of times it occurs in the documents.
|
||||
#[schema(value_type = HashMap<String, u64>)]
|
||||
pub field_distribution: FieldDistribution,
|
||||
@@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
|
||||
IndexStats {
|
||||
number_of_documents: stats.inner_stats.number_of_documents,
|
||||
is_indexing: stats.is_indexing,
|
||||
number_of_embeddings: stats.inner_stats.number_of_embeddings,
|
||||
number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
|
||||
field_distribution: stats.inner_stats.field_distribution,
|
||||
}
|
||||
}
|
||||
@@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
|
||||
(status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
|
||||
{
|
||||
"numberOfDocuments": 10,
|
||||
"numberOfEmbeddings": 10,
|
||||
"numberOfEmbeddedDocuments": 10,
|
||||
"isIndexing": true,
|
||||
"fieldDistribution": {
|
||||
"genre": 10,
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::io::ErrorKind;
|
||||
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use deserr::actix_web::AwebQueryParameter;
|
||||
@@ -16,6 +18,7 @@ use serde::Serialize;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::macros::format_description;
|
||||
use time::{Date, Duration, OffsetDateTime, Time};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::task;
|
||||
use utoipa::{IntoParams, OpenApi, ToSchema};
|
||||
|
||||
@@ -44,7 +47,11 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
.route(web::delete().to(SeqHandler(delete_tasks))),
|
||||
)
|
||||
.service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks))))
|
||||
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task))));
|
||||
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task))))
|
||||
.service(
|
||||
web::resource("/{task_id}/documents")
|
||||
.route(web::get().to(SeqHandler(get_task_documents_file))),
|
||||
);
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserr, IntoParams)]
|
||||
@@ -639,6 +646,76 @@ async fn get_task(
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a task's documents.
|
||||
///
|
||||
/// Get a [task's documents file](https://www.meilisearch.com/docs/learn/async/asynchronous_operations).
|
||||
#[utoipa::path(
|
||||
get,
|
||||
path = "/{taskUid}/documents",
|
||||
tag = "Tasks",
|
||||
security(("Bearer" = ["tasks.get", "tasks.*", "*"])),
|
||||
params(("taskUid", format = UInt32, example = 0, description = "The task identifier", nullable = false)),
|
||||
responses(
|
||||
(status = 200, description = "The content of the task update", body = serde_json::Value, content_type = "application/x-ndjson"),
|
||||
(status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!(
|
||||
{
|
||||
"message": "The Authorization header is missing. It must use the bearer authorization method.",
|
||||
"code": "missing_authorization_header",
|
||||
"type": "auth",
|
||||
"link": "https://docs.meilisearch.com/errors#missing_authorization_header"
|
||||
}
|
||||
)),
|
||||
(status = 404, description = "The task uid does not exists", body = ResponseError, content_type = "application/json", example = json!(
|
||||
{
|
||||
"message": "Task :taskUid not found.",
|
||||
"code": "task_not_found",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors/#task_not_found"
|
||||
}
|
||||
))
|
||||
)
|
||||
)]
|
||||
async fn get_task_documents_file(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
|
||||
task_uid: web::Path<String>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
index_scheduler.features().check_get_task_documents_route()?;
|
||||
let task_uid_string = task_uid.into_inner();
|
||||
|
||||
let task_uid: TaskId = match task_uid_string.parse() {
|
||||
Ok(id) => id,
|
||||
Err(_e) => {
|
||||
return Err(index_scheduler::Error::InvalidTaskUid { task_uid: task_uid_string }.into())
|
||||
}
|
||||
};
|
||||
|
||||
let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
|
||||
let filters = index_scheduler.filters();
|
||||
let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(&query, filters)?;
|
||||
|
||||
if let Some(task) = tasks.first() {
|
||||
match task.content_uuid() {
|
||||
Some(uuid) => {
|
||||
let mut tfile = match index_scheduler.queue.update_file(uuid) {
|
||||
Ok(file) => tokio::fs::File::from_std(file),
|
||||
Err(file_store::Error::IoError(e)) if e.kind() == ErrorKind::NotFound => {
|
||||
return Err(index_scheduler::Error::TaskFileNotFound(task_uid).into())
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
// Yes, that's awful to put everything in memory when we could have streamed it from
|
||||
// disk but it's really (really) complex to do with the current state of async Rust.
|
||||
let mut content = String::new();
|
||||
tfile.read_to_string(&mut content).await?;
|
||||
Ok(HttpResponse::Ok().content_type("application/x-ndjson").body(content))
|
||||
}
|
||||
None => Err(index_scheduler::Error::TaskFileNotFound(task_uid).into()),
|
||||
}
|
||||
} else {
|
||||
Err(index_scheduler::Error::TaskNotFound(task_uid).into())
|
||||
}
|
||||
}
|
||||
|
||||
pub enum DeserializeDateOption {
|
||||
Before,
|
||||
After,
|
||||
|
||||
@@ -95,12 +95,16 @@ pub async fn perform_federated_search(
|
||||
facet_order,
|
||||
} = search_by_index;
|
||||
|
||||
let before_waiting_remote_results = std::time::Instant::now();
|
||||
|
||||
// 2.3. Wait for proxy search requests to complete
|
||||
let (mut remote_results, remote_errors) = remote_search.finish().await;
|
||||
|
||||
let after_waiting_remote_results = std::time::Instant::now();
|
||||
|
||||
// 3. merge hits and metadata across indexes and hosts
|
||||
// 3.1. merge metadata
|
||||
let (estimated_total_hits, degraded, used_negative_operator, facets) =
|
||||
let (estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration) =
|
||||
merge_metadata(&mut results_by_index, &remote_results);
|
||||
|
||||
// 3.2. merge hits
|
||||
@@ -122,9 +126,15 @@ pub async fn perform_federated_search(
|
||||
let (facet_distribution, facet_stats, facets_by_index) =
|
||||
facet_order.merge(federation.merge_facets, remote_results, facets);
|
||||
|
||||
let after_merge = std::time::Instant::now();
|
||||
|
||||
let local_duration = (before_waiting_remote_results - before_search)
|
||||
+ (after_merge - after_waiting_remote_results);
|
||||
let max_duration = Duration::max(local_duration, max_remote_duration);
|
||||
|
||||
Ok(FederatedSearchResult {
|
||||
hits: merged_hits,
|
||||
processing_time_ms: before_search.elapsed().as_millis(),
|
||||
processing_time_ms: max_duration.as_millis(),
|
||||
hits_info: HitsInfo::OffsetLimit {
|
||||
limit: federation.limit,
|
||||
offset: federation.offset,
|
||||
@@ -370,11 +380,12 @@ struct SearchResultByIndex {
|
||||
fn merge_metadata(
|
||||
results_by_index: &mut Vec<SearchResultByIndex>,
|
||||
remote_results: &Vec<FederatedSearchResult>,
|
||||
) -> (usize, bool, bool, FederatedFacets) {
|
||||
) -> (usize, bool, bool, FederatedFacets, Duration) {
|
||||
let mut estimated_total_hits = 0;
|
||||
let mut degraded = false;
|
||||
let mut used_negative_operator = false;
|
||||
let mut facets: FederatedFacets = FederatedFacets::default();
|
||||
let mut max_remote_duration = Duration::ZERO;
|
||||
for SearchResultByIndex {
|
||||
index,
|
||||
hits: _,
|
||||
@@ -395,7 +406,7 @@ fn merge_metadata(
|
||||
}
|
||||
for FederatedSearchResult {
|
||||
hits: _,
|
||||
processing_time_ms: _,
|
||||
processing_time_ms,
|
||||
hits_info,
|
||||
semantic_hit_count: _,
|
||||
facet_distribution: _,
|
||||
@@ -406,6 +417,8 @@ fn merge_metadata(
|
||||
remote_errors: _,
|
||||
} in remote_results
|
||||
{
|
||||
let this_remote_duration = Duration::from_millis(*processing_time_ms as u64);
|
||||
max_remote_duration = Duration::max(this_remote_duration, max_remote_duration);
|
||||
estimated_total_hits += match hits_info {
|
||||
HitsInfo::Pagination { total_hits: estimated_total_hits, .. }
|
||||
| HitsInfo::OffsetLimit { estimated_total_hits, .. } => estimated_total_hits,
|
||||
@@ -415,7 +428,7 @@ fn merge_metadata(
|
||||
degraded |= degraded_for_host;
|
||||
used_negative_operator |= host_used_negative_operator;
|
||||
}
|
||||
(estimated_total_hits, degraded, used_negative_operator, facets)
|
||||
(estimated_total_hits, degraded, used_negative_operator, facets, max_remote_duration)
|
||||
}
|
||||
|
||||
type LocalQueriesByIndex = BTreeMap<String, Vec<QueryByIndex>>;
|
||||
|
||||
Binary file not shown.
@@ -163,6 +163,10 @@ impl Server<Owned> {
|
||||
self.service.get("/tasks").await
|
||||
}
|
||||
|
||||
pub async fn batches(&self) -> (Value, StatusCode) {
|
||||
self.service.get("/batches").await
|
||||
}
|
||||
|
||||
pub async fn set_features(&self, value: Value) -> (Value, StatusCode) {
|
||||
self.service.patch("/experimental-features", value).await
|
||||
}
|
||||
|
||||
@@ -1803,6 +1803,275 @@ async fn add_documents_with_geo_field() {
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
|
||||
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": "1"
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": {
|
||||
"lat": 1,
|
||||
"lng": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": {
|
||||
"lat": "1",
|
||||
"lng": "1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"offset": 0,
|
||||
"limit": 20,
|
||||
"total": 4
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
// we are expecting docs 4 and 3 first as they have geo
|
||||
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }),
|
||||
@r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": {
|
||||
"lat": "1",
|
||||
"lng": "1"
|
||||
},
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": {
|
||||
"lat": 1,
|
||||
"lng": 1
|
||||
},
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "1"
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null
|
||||
}
|
||||
],
|
||||
"query": "",
|
||||
"processingTimeMs": "[time]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 4
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn update_documents_with_geo_field() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
index.update_settings(json!({"sortableAttributes": ["_geo"]})).await;
|
||||
|
||||
let documents = json!([
|
||||
{
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null,
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": { "lat": 1, "lng": 1 },
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": { "lat": "1", "lng": "1" },
|
||||
},
|
||||
]);
|
||||
|
||||
let (task, _status_code) = index.add_documents(documents, None).await;
|
||||
let response = index.wait_task(task.uid()).await;
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": 1,
|
||||
"batchUid": 1,
|
||||
"indexUid": "doggo",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 4,
|
||||
"indexedDocuments": 4
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
// we are expecting docs 4 and 3 first as they have geo
|
||||
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }),
|
||||
@r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": {
|
||||
"lat": "1",
|
||||
"lng": "1"
|
||||
},
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": {
|
||||
"lat": 1,
|
||||
"lng": 1
|
||||
},
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "1"
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null
|
||||
}
|
||||
],
|
||||
"query": "",
|
||||
"processingTimeMs": "[time]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 4
|
||||
}
|
||||
"###);
|
||||
|
||||
let updated_documents = json!([{
|
||||
"id": "3",
|
||||
"doggo": "kefir",
|
||||
}]);
|
||||
let (task, _status_code) = index.update_documents(updated_documents, None).await;
|
||||
let response = index.wait_task(task.uid()).await;
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": 2,
|
||||
"batchUid": 2,
|
||||
"indexUid": "doggo",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
let (response, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
|
||||
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": "1"
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": {
|
||||
"lat": 1,
|
||||
"lng": 1
|
||||
},
|
||||
"doggo": "kefir"
|
||||
},
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": {
|
||||
"lat": "1",
|
||||
"lng": "1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"offset": 0,
|
||||
"limit": 20,
|
||||
"total": 4
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
// the search response should not have changed: we are expecting docs 4 and 3 first as they have geo
|
||||
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }),
|
||||
@r###"
|
||||
{
|
||||
"hits": [
|
||||
{
|
||||
"id": "4",
|
||||
"_geo": {
|
||||
"lat": "1",
|
||||
"lng": "1"
|
||||
},
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "3",
|
||||
"_geo": {
|
||||
"lat": 1,
|
||||
"lng": 1
|
||||
},
|
||||
"doggo": "kefir",
|
||||
"_geoDistance": 5522018
|
||||
},
|
||||
{
|
||||
"id": "1"
|
||||
},
|
||||
{
|
||||
"id": "2",
|
||||
"_geo": null
|
||||
}
|
||||
],
|
||||
"query": "",
|
||||
"processingTimeMs": "[time]",
|
||||
"limit": 20,
|
||||
"offset": 0,
|
||||
"estimatedTotalHits": 4
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
||||
@@ -161,6 +161,8 @@ async fn delete_document_by_filter() {
|
||||
{
|
||||
"numberOfDocuments": 4,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"color": 3,
|
||||
"id": 4
|
||||
@@ -208,6 +210,8 @@ async fn delete_document_by_filter() {
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"color": 1,
|
||||
"id": 2
|
||||
@@ -274,6 +278,8 @@ async fn delete_document_by_filter() {
|
||||
{
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"color": 1,
|
||||
"id": 1
|
||||
|
||||
@@ -22,6 +22,7 @@ pub enum GetDump {
|
||||
TestV5,
|
||||
|
||||
TestV6WithExperimental,
|
||||
TestV6WithBatchesAndEnqueuedTasks,
|
||||
}
|
||||
|
||||
impl GetDump {
|
||||
@@ -74,6 +75,10 @@ impl GetDump {
|
||||
"tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump"
|
||||
)
|
||||
.into(),
|
||||
GetDump::TestV6WithBatchesAndEnqueuedTasks => {
|
||||
exist_relative_path!("tests/assets/v6_v1.13.0_batches_and_enqueued_tasks.dump")
|
||||
.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,9 +27,24 @@ async fn import_dump_v1_movie_raw() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -173,6 +188,8 @@ async fn import_dump_v1_movie_with_settings() {
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
@@ -333,9 +350,24 @@ async fn import_dump_v1_rubygems_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"description": 53,
|
||||
"id": 53,
|
||||
"name": 53,
|
||||
"summary": 53,
|
||||
"total_downloads": 53,
|
||||
"version": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -483,9 +515,24 @@ async fn import_dump_v2_movie_raw() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -623,9 +670,24 @@ async fn import_dump_v2_movie_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -773,9 +835,24 @@ async fn import_dump_v2_rubygems_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"description": 53,
|
||||
"id": 53,
|
||||
"name": 53,
|
||||
"summary": 53,
|
||||
"total_downloads": 53,
|
||||
"version": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -920,9 +997,24 @@ async fn import_dump_v3_movie_raw() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1060,9 +1152,24 @@ async fn import_dump_v3_movie_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1210,9 +1317,24 @@ async fn import_dump_v3_rubygems_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"description": 53,
|
||||
"id": 53,
|
||||
"name": 53,
|
||||
"summary": 53,
|
||||
"total_downloads": 53,
|
||||
"version": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1357,9 +1479,24 @@ async fn import_dump_v4_movie_raw() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1497,9 +1634,24 @@ async fn import_dump_v4_movie_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"genres": 53,
|
||||
"id": 53,
|
||||
"overview": 53,
|
||||
"poster": 53,
|
||||
"release_date": 53,
|
||||
"title": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1647,9 +1799,24 @@ async fn import_dump_v4_rubygems_with_settings() {
|
||||
|
||||
let (stats, code) = index.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(
|
||||
stats,
|
||||
json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }})
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 53,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"description": 53,
|
||||
"id": 53,
|
||||
"name": 53,
|
||||
"summary": 53,
|
||||
"total_downloads": 53,
|
||||
"version": 53
|
||||
}
|
||||
}
|
||||
"###
|
||||
);
|
||||
|
||||
let (settings, code) = index.settings().await;
|
||||
@@ -1798,33 +1965,35 @@ async fn import_dump_v5() {
|
||||
server.wait_task(task["uid"].as_u64().unwrap()).await;
|
||||
}
|
||||
|
||||
let expected_stats = json!({
|
||||
"numberOfDocuments": 10,
|
||||
"isIndexing": false,
|
||||
"fieldDistribution": {
|
||||
"cast": 10,
|
||||
"director": 10,
|
||||
"genres": 10,
|
||||
"id": 10,
|
||||
"overview": 10,
|
||||
"popularity": 10,
|
||||
"poster_path": 10,
|
||||
"producer": 10,
|
||||
"production_companies": 10,
|
||||
"release_date": 10,
|
||||
"tagline": 10,
|
||||
"title": 10,
|
||||
"vote_average": 10,
|
||||
"vote_count": 10
|
||||
}
|
||||
});
|
||||
|
||||
let index1 = server.index("test");
|
||||
let index2 = server.index("test2");
|
||||
|
||||
let (stats, code) = index1.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(stats, expected_stats);
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 10,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"cast": 10,
|
||||
"director": 10,
|
||||
"genres": 10,
|
||||
"id": 10,
|
||||
"overview": 10,
|
||||
"popularity": 10,
|
||||
"poster_path": 10,
|
||||
"producer": 10,
|
||||
"production_companies": 10,
|
||||
"release_date": 10,
|
||||
"tagline": 10,
|
||||
"title": 10,
|
||||
"vote_average": 10,
|
||||
"vote_count": 10
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
let (docs, code) = index2.get_all_documents(GetAllDocumentsOptions::default()).await;
|
||||
snapshot!(code, @"200 OK");
|
||||
@@ -1835,7 +2004,32 @@ async fn import_dump_v5() {
|
||||
|
||||
let (stats, code) = index2.stats().await;
|
||||
snapshot!(code, @"200 OK");
|
||||
assert_eq!(stats, expected_stats);
|
||||
snapshot!(
|
||||
json_string!(stats),
|
||||
@r###"
|
||||
{
|
||||
"numberOfDocuments": 10,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"cast": 10,
|
||||
"director": 10,
|
||||
"genres": 10,
|
||||
"id": 10,
|
||||
"overview": 10,
|
||||
"popularity": 10,
|
||||
"poster_path": 10,
|
||||
"producer": 10,
|
||||
"production_companies": 10,
|
||||
"release_date": 10,
|
||||
"tagline": 10,
|
||||
"title": 10,
|
||||
"vote_average": 10,
|
||||
"vote_count": 10
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
let (keys, code) = server.list_api_keys("").await;
|
||||
snapshot!(code, @"200 OK");
|
||||
@@ -1909,7 +2103,8 @@ async fn import_dump_v6_containing_experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -1993,6 +2188,63 @@ async fn import_dump_v6_containing_experimental_features() {
|
||||
.await;
|
||||
}
|
||||
|
||||
// Imports the `TestV6WithBatchesAndEnqueuedTasks` dump and checks what the server
// exposes afterwards: tasks, batches, indexes, experimental features and documents.
#[actix_rt::test]
|
||||
async fn import_dump_v6_containing_batches_and_enqueued_tasks() {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
|
||||
let options = Opt {
|
||||
import_dump: Some(GetDump::TestV6WithBatchesAndEnqueuedTasks.path()),
|
||||
..default_settings(temp.path())
|
||||
};
|
||||
let mut server = Server::new_auth_with_options(options, temp).await;
|
||||
server.use_api_key("MASTER_KEY");
|
||||
// Task 2 must have succeeded before the queue is snapshotted.
server.wait_task(2).await.succeeded();
|
||||
let (tasks, _) = server.tasks().await;
|
||||
// Only the timing fields of `results[1]` are redacted before comparing with the "tasks" snapshot.
snapshot!(json_string!(tasks, { ".results[1].startedAt" => "[date]", ".results[1].finishedAt" => "[date]", ".results[1].duration" => "[date]" }), name: "tasks");
|
||||
let (batches, _) = server.batches().await;
|
||||
// Only the timing fields of `results[0]` are redacted before comparing with the "batches" snapshot.
snapshot!(json_string!(batches, { ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]" }), name: "batches");
|
||||
|
||||
let (indexes, code) = server.list_indexes(None, None).await;
|
||||
assert_eq!(code, 200, "{indexes}");
|
||||
|
||||
// Exactly one index is expected: `kefir`, with `id` as its primary key.
assert_eq!(indexes["results"].as_array().unwrap().len(), 1);
|
||||
assert_eq!(indexes["results"][0]["uid"], json!("kefir"));
|
||||
assert_eq!(indexes["results"][0]["primaryKey"], json!("id"));
|
||||
|
||||
// Every experimental feature is expected to be reported as disabled.
let (response, code) = server.get_features().await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
// Both documents (ids 1 and 2) are expected in the `kefir` index.
let index = server.index("kefir");
|
||||
let (documents, _) = index.get_all_documents_raw("").await;
|
||||
snapshot!(documents, @r#"
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": 1,
|
||||
"dog": "kefir"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"dog": "intel"
|
||||
}
|
||||
],
|
||||
"offset": 0,
|
||||
"limit": 20,
|
||||
"total": 2
|
||||
}
|
||||
"#);
|
||||
}
|
||||
|
||||
// In this test we must generate the dump ourselves to ensure the
|
||||
// `user provided` vectors are well set
|
||||
#[actix_rt::test]
|
||||
@@ -2071,7 +2323,8 @@ async fn generate_and_import_dump_containing_vectors() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
|
||||
@@ -0,0 +1,78 @@
|
||||
---
|
||||
source: crates/meilisearch/tests/dumps/mod.rs
|
||||
snapshot_kind: text
|
||||
---
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"uid": 2,
|
||||
"progress": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"stats": {
|
||||
"totalNbTasks": 1,
|
||||
"status": {
|
||||
"succeeded": 1
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 1
|
||||
},
|
||||
"indexUids": {
|
||||
"kefir": 1
|
||||
}
|
||||
},
|
||||
"duration": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
},
|
||||
{
|
||||
"uid": 1,
|
||||
"progress": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"stats": {
|
||||
"totalNbTasks": 1,
|
||||
"status": {
|
||||
"succeeded": 1
|
||||
},
|
||||
"types": {
|
||||
"documentAdditionOrUpdate": 1
|
||||
},
|
||||
"indexUids": {
|
||||
"kefir": 1
|
||||
}
|
||||
},
|
||||
"duration": "PT0.144827890S",
|
||||
"startedAt": "2025-02-04T10:15:21.275640274Z",
|
||||
"finishedAt": "2025-02-04T10:15:21.420468164Z"
|
||||
},
|
||||
{
|
||||
"uid": 0,
|
||||
"progress": null,
|
||||
"details": {},
|
||||
"stats": {
|
||||
"totalNbTasks": 1,
|
||||
"status": {
|
||||
"succeeded": 1
|
||||
},
|
||||
"types": {
|
||||
"indexCreation": 1
|
||||
},
|
||||
"indexUids": {
|
||||
"kefir": 1
|
||||
}
|
||||
},
|
||||
"duration": "PT0.032902186S",
|
||||
"startedAt": "2025-02-04T10:14:43.559526162Z",
|
||||
"finishedAt": "2025-02-04T10:14:43.592428348Z"
|
||||
}
|
||||
],
|
||||
"total": 3,
|
||||
"limit": 20,
|
||||
"from": 2,
|
||||
"next": null
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
---
|
||||
source: crates/meilisearch/tests/dumps/mod.rs
|
||||
snapshot_kind: text
|
||||
---
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"uid": 3,
|
||||
"batchUid": null,
|
||||
"indexUid": null,
|
||||
"status": "succeeded",
|
||||
"type": "dumpCreation",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"dumpUid": null
|
||||
},
|
||||
"error": null,
|
||||
"duration": "PT0.000629059S",
|
||||
"enqueuedAt": "2025-02-04T10:22:31.318175268Z",
|
||||
"startedAt": "2025-02-04T10:22:31.331701375Z",
|
||||
"finishedAt": "2025-02-04T10:22:31.332330434Z"
|
||||
},
|
||||
{
|
||||
"uid": 2,
|
||||
"batchUid": 2,
|
||||
"indexUid": "kefir",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[date]",
|
||||
"enqueuedAt": "2025-02-04T10:15:49.212484063Z",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
},
|
||||
{
|
||||
"uid": 1,
|
||||
"batchUid": null,
|
||||
"indexUid": "kefir",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "PT0.144827890S",
|
||||
"enqueuedAt": "2025-02-04T10:15:21.258630973Z",
|
||||
"startedAt": "2025-02-04T10:15:21.275640274Z",
|
||||
"finishedAt": "2025-02-04T10:15:21.420468164Z"
|
||||
},
|
||||
{
|
||||
"uid": 0,
|
||||
"batchUid": null,
|
||||
"indexUid": "kefir",
|
||||
"status": "succeeded",
|
||||
"type": "indexCreation",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"primaryKey": null
|
||||
},
|
||||
"error": null,
|
||||
"duration": "PT0.032902186S",
|
||||
"enqueuedAt": "2025-02-04T10:14:43.550379968Z",
|
||||
"startedAt": "2025-02-04T10:14:43.559526162Z",
|
||||
"finishedAt": "2025-02-04T10:14:43.592428348Z"
|
||||
}
|
||||
],
|
||||
"total": 4,
|
||||
"limit": 20,
|
||||
"from": 3,
|
||||
"next": null
|
||||
}
|
||||
@@ -22,7 +22,8 @@ async fn experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -35,7 +36,8 @@ async fn experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -48,7 +50,8 @@ async fn experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -62,7 +65,8 @@ async fn experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -76,7 +80,8 @@ async fn experimental_features() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
}
|
||||
@@ -97,7 +102,8 @@ async fn experimental_feature_metrics() {
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false,
|
||||
"network": false
|
||||
"network": false,
|
||||
"getTaskDocumentsRoute": false
|
||||
}
|
||||
"###);
|
||||
|
||||
@@ -152,7 +158,7 @@ async fn errors() {
|
||||
meili_snap::snapshot!(code, @"400 Bad Request");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`",
|
||||
"message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`",
|
||||
"code": "bad_request",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#bad_request"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
@@ -74,3 +75,253 @@ async fn stats() {
|
||||
assert_eq!(response["indexes"]["test"]["fieldDistribution"]["name"], 1);
|
||||
assert_eq!(response["indexes"]["test"]["fieldDistribution"]["age"], 1);
|
||||
}
|
||||
|
||||
// Checks that the `numberOfEmbeddings` and `numberOfEmbeddedDocuments` stats follow
// the vectors that are added to and removed from documents through document updates.
#[actix_rt::test]
|
||||
async fn add_remove_embeddings() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
|
||||
// Declare two user-provided embedders of 3 dimensions each.
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": {
|
||||
"manual": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
"handcrafted": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
|
||||
},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
server.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
// 2 embedded documents for 5 embeddings in total
// (kefir: 1 manual + 1 handcrafted, echo: 1 manual + 2 handcrafted)
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }},
|
||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 5,
|
||||
"numberOfEmbeddedDocuments": 2,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// 2 embedded documents for 3 embeddings in total
// (echo's `handcrafted` vectors are set to null)
let documents = json!([
|
||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.update_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 3,
|
||||
"numberOfEmbeddedDocuments": 2,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// 2 embedded documents for 2 embeddings in total
// (kefir's `manual` vector is set to null)
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": null, "handcrafted": [0, 0, 0] }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.update_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 2,
|
||||
"numberOfEmbeddedDocuments": 2,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// 1 embedded document for 2 embeddings in total
// (kefir gets both vectors back, echo has both set to null)
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }},
|
||||
{"id": 1, "name": "echo", "_vectors": { "manual": null, "handcrafted": null }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.update_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 2,
|
||||
"numberOfEmbeddedDocuments": 1,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
// Checks that deleting a document that carries vectors lowers both the
// `numberOfEmbeddings` and the `numberOfEmbeddedDocuments` stats.
#[actix_rt::test]
|
||||
async fn add_remove_embedded_documents() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
|
||||
// Declare two user-provided embedders of 3 dimensions each.
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": {
|
||||
"manual": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
"handcrafted": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
|
||||
},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
server.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
// 2 embedded documents for 5 embeddings in total
// (kefir: 1 manual + 1 handcrafted, echo: 1 manual + 2 handcrafted)
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }},
|
||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 5,
|
||||
"numberOfEmbeddedDocuments": 2,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// delete one embedded document (id 0), remaining 1 embedded document for 3 embeddings in total
let (response, code) = index.delete_document(0).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 3,
|
||||
"numberOfEmbeddedDocuments": 1,
|
||||
"fieldDistribution": {
|
||||
"id": 1,
|
||||
"name": 1
|
||||
}
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
// Checks that vectors sent before any embedder is configured are not counted in the
// stats, and that they are counted once the matching embedders are added to the settings.
#[actix_rt::test]
|
||||
async fn update_embedder_settings() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
|
||||
// 2 embedded documents for 3 embeddings in total
|
||||
// but no embedders are added in the settings yet so we expect 0 embedded documents for 0 embeddings in total
|
||||
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }},
|
||||
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }},
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
index.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// add embedders to the settings
|
||||
// 2 embedded documents for 3 embeddings in total
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": {
|
||||
"manual": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
"handcrafted": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
},
|
||||
|
||||
},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
server.wait_task(response.uid()).await.succeeded();
|
||||
|
||||
// The stats are read again without sending any new document.
let (stats, _code) = index.stats().await;
|
||||
snapshot!(json_string!(stats), @r###"
|
||||
{
|
||||
"numberOfDocuments": 2,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 3,
|
||||
"numberOfEmbeddedDocuments": 2,
|
||||
"fieldDistribution": {
|
||||
"id": 2,
|
||||
"name": 2
|
||||
}
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
@@ -135,6 +135,8 @@ async fn check_the_index_scheduler(server: &Server) {
|
||||
"kefir": {
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"age": 1,
|
||||
"description": 1,
|
||||
@@ -215,6 +217,8 @@ async fn check_the_index_scheduler(server: &Server) {
|
||||
"kefir": {
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"age": 1,
|
||||
"description": 1,
|
||||
@@ -228,10 +232,12 @@ async fn check_the_index_scheduler(server: &Server) {
|
||||
"###);
|
||||
let index = server.index("kefir");
|
||||
let (stats, _) = index.stats().await;
|
||||
snapshot!(stats, @r#"
|
||||
snapshot!(stats, @r###"
|
||||
{
|
||||
"numberOfDocuments": 1,
|
||||
"isIndexing": false,
|
||||
"numberOfEmbeddings": 0,
|
||||
"numberOfEmbeddedDocuments": 0,
|
||||
"fieldDistribution": {
|
||||
"age": 1,
|
||||
"description": 1,
|
||||
@@ -240,7 +246,7 @@ async fn check_the_index_scheduler(server: &Server) {
|
||||
"surname": 1
|
||||
}
|
||||
}
|
||||
"#);
|
||||
"###);
|
||||
|
||||
// Delete all the tasks of a specific batch
|
||||
let (task, _) = server.delete_tasks("batchUids=10").await;
|
||||
|
||||
@@ -1,22 +1,26 @@
|
||||
use std::fs::{read_dir, read_to_string, remove_file, File};
|
||||
use std::io::BufWriter;
|
||||
use std::io::{BufWriter, Write as _};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use clap::{Parser, Subcommand};
|
||||
use clap::{Parser, Subcommand, ValueEnum};
|
||||
use dump::{DumpWriter, IndexMetadata};
|
||||
use file_store::FileStore;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::heed::types::{SerdeJson, Str};
|
||||
use meilisearch_types::batches::Batch;
|
||||
use meilisearch_types::heed::types::{Bytes, SerdeJson, Str};
|
||||
use meilisearch_types::heed::{
|
||||
CompactionOption, Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
|
||||
};
|
||||
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
||||
use meilisearch_types::tasks::{Status, Task};
|
||||
use meilisearch_types::versioning::{get_version, parse_version};
|
||||
use meilisearch_types::Index;
|
||||
use serde_json::Value::Object;
|
||||
use time::macros::format_description;
|
||||
use time::OffsetDateTime;
|
||||
use upgrade::OfflineUpgrade;
|
||||
@@ -68,6 +72,24 @@ enum Command {
|
||||
skip_enqueued_tasks: bool,
|
||||
},
|
||||
|
||||
/// Exports the documents of an index in NDJSON format from a Meilisearch index to stdout.
|
||||
///
|
||||
/// This command can be executed on a running Meilisearch database. However, please note that
|
||||
/// it will maintain a read-only transaction for the duration of the extraction process.
|
||||
ExportDocuments {
|
||||
/// The index name to export the documents from.
|
||||
#[arg(long)]
|
||||
index_name: String,
|
||||
|
||||
/// Do not export vectors with the documents.
|
||||
#[arg(long)]
|
||||
ignore_vectors: bool,
|
||||
|
||||
/// The number of documents to skip.
|
||||
#[arg(long)]
|
||||
offset: Option<usize>,
|
||||
},
|
||||
|
||||
/// Attempts to upgrade from one major version to the next without a dump.
|
||||
///
|
||||
/// Make sure to run this command when Meilisearch is not running!
|
||||
@@ -102,6 +124,25 @@ enum Command {
|
||||
/// the compaction operation can start. Once the compaction is done, the big index is replaced
|
||||
/// by the compacted one and the mutable transaction is released.
|
||||
CompactIndex { index_name: String },
|
||||
|
||||
/// Uses the hair dryer to make the dedicated pages hot in cache
|
||||
///
|
||||
/// To make the index faster we must make sure it is hot in the DB cache: that's the curse of
|
||||
/// memory-mapping but also its strength. This command is designed to make a specific part of
|
||||
/// the index hot in cache.
|
||||
HairDryer {
|
||||
#[arg(long, value_delimiter = ',')]
|
||||
index_name: Vec<String>,
|
||||
|
||||
#[arg(long, value_delimiter = ',')]
|
||||
index_part: Vec<IndexPart>,
|
||||
},
|
||||
}
|
||||
|
||||
/// A part of an index that the `HairDryer` command can be asked to make hot in cache.
#[derive(Clone, ValueEnum)]
|
||||
enum IndexPart {
|
||||
/// Will make the arroy index hot.
|
||||
Arroy,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
@@ -114,11 +155,17 @@ fn main() -> anyhow::Result<()> {
|
||||
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
|
||||
export_a_dump(db_path, dump_dir, skip_enqueued_tasks, detected_version)
|
||||
}
|
||||
Command::ExportDocuments { index_name, ignore_vectors, offset } => {
|
||||
export_documents(db_path, index_name, ignore_vectors, offset)
|
||||
}
|
||||
Command::OfflineUpgrade { target_version } => {
|
||||
let target_version = parse_version(&target_version).context("While parsing `--target-version`. Make sure `--target-version` is in the format MAJOR.MINOR.PATCH")?;
|
||||
OfflineUpgrade { db_path, current_version: detected_version, target_version }.upgrade()
|
||||
}
|
||||
Command::CompactIndex { index_name } => compact_index(db_path, &index_name),
|
||||
Command::HairDryer { index_name, index_part } => {
|
||||
hair_dryer(db_path, &index_name, &index_part)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,70 +302,86 @@ fn export_a_dump(
|
||||
|
||||
eprintln!("Successfully dumped {count} keys!");
|
||||
|
||||
eprintln!("Dumping the queue");
|
||||
let rtxn = env.read_txn()?;
|
||||
let all_tasks: Database<BEU32, SerdeJson<Task>> =
|
||||
try_opening_database(&env, &rtxn, "all-tasks")?;
|
||||
let all_batches: Database<BEU32, SerdeJson<Batch>> =
|
||||
try_opening_database(&env, &rtxn, "all-batches")?;
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &rtxn, "index-mapping")?;
|
||||
|
||||
if skip_enqueued_tasks {
|
||||
eprintln!("Skip dumping the enqueued tasks...");
|
||||
} else {
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
let mut count = 0;
|
||||
for ret in all_tasks.iter(&rtxn)? {
|
||||
let (_, t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
eprintln!("Dumping the tasks");
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
let mut count_tasks = 0;
|
||||
let mut count_enqueued_tasks = 0;
|
||||
for ret in all_tasks.iter(&rtxn)? {
|
||||
let (_, t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
|
||||
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
|
||||
if status == Status::Enqueued && skip_enqueued_tasks {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file_uuid) = content_file {
|
||||
if status == Status::Enqueued {
|
||||
let content_file = file_store.get_update(content_file_uuid)?;
|
||||
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
|
||||
|
||||
if (detected_version.0, detected_version.1, detected_version.2) < (1, 12, 0) {
|
||||
eprintln!("Dumping the enqueued tasks reading them in obkv format...");
|
||||
let reader =
|
||||
DocumentsBatchReader::from_reader(content_file).with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
let (mut cursor, documents_batch_index) =
|
||||
reader.into_cursor_and_fields_index();
|
||||
while let Some(doc) = cursor.next_document().with_context(|| {
|
||||
format!("While iterating on content file {:?}", content_file_uuid)
|
||||
})? {
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
|
||||
}
|
||||
} else {
|
||||
eprintln!(
|
||||
"Dumping the enqueued tasks reading them in JSON stream format..."
|
||||
);
|
||||
for document in
|
||||
serde_json::de::Deserializer::from_reader(content_file).into_iter()
|
||||
{
|
||||
let document = document.with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
dump_content_file.push_document(&document)?;
|
||||
}
|
||||
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file_uuid) = content_file {
|
||||
if status == Status::Enqueued {
|
||||
let content_file = file_store.get_update(content_file_uuid)?;
|
||||
|
||||
if (detected_version.0, detected_version.1, detected_version.2) < (1, 12, 0) {
|
||||
eprintln!("Dumping the enqueued tasks reading them in obkv format...");
|
||||
let reader =
|
||||
DocumentsBatchReader::from_reader(content_file).with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
while let Some(doc) = cursor.next_document().with_context(|| {
|
||||
format!("While iterating on content file {:?}", content_file_uuid)
|
||||
})? {
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
|
||||
}
|
||||
} else {
|
||||
eprintln!("Dumping the enqueued tasks reading them in JSON stream format...");
|
||||
for document in
|
||||
serde_json::de::Deserializer::from_reader(content_file).into_iter()
|
||||
{
|
||||
let document = document.with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
dump_content_file.push_document(&document)?;
|
||||
}
|
||||
|
||||
dump_content_file.flush()?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
dump_content_file.flush()?;
|
||||
count_enqueued_tasks += 1;
|
||||
}
|
||||
}
|
||||
dump_tasks.flush()?;
|
||||
|
||||
eprintln!("Successfully dumped {count} enqueued tasks!");
|
||||
count_tasks += 1;
|
||||
}
|
||||
dump_tasks.flush()?;
|
||||
eprintln!(
|
||||
"Successfully dumped {count_tasks} tasks including {count_enqueued_tasks} enqueued tasks!"
|
||||
);
|
||||
|
||||
// 4. dump the batches
|
||||
eprintln!("Dumping the batches");
|
||||
let mut dump_batches = dump.create_batches_queue()?;
|
||||
let mut count = 0;
|
||||
|
||||
for ret in all_batches.iter(&rtxn)? {
|
||||
let (_, b) = ret?;
|
||||
dump_batches.push_batch(&b)?;
|
||||
count += 1;
|
||||
}
|
||||
dump_batches.flush()?;
|
||||
eprintln!("Successfully dumped {count} batches!");
|
||||
|
||||
// 5. Dump the indexes
|
||||
eprintln!("Dumping the indexes...");
|
||||
|
||||
// 4. Dump the indexes
|
||||
let mut count = 0;
|
||||
for result in index_mapping.iter(&rtxn)? {
|
||||
let (uid, uuid) = result?;
|
||||
@@ -339,14 +402,14 @@ fn export_a_dump(
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
|
||||
// 4.1. Dump the documents
|
||||
// 5.1. Dump the documents
|
||||
for ret in index.all_documents(&rtxn)? {
|
||||
let (_id, doc) = ret?;
|
||||
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
index_dumper.push_document(&document)?;
|
||||
}
|
||||
|
||||
// 4.2. Dump the settings
|
||||
// 5.2. Dump the settings
|
||||
let settings = meilisearch_types::settings::settings(
|
||||
&index,
|
||||
&rtxn,
|
||||
@@ -443,3 +506,170 @@ fn compact_index(db_path: PathBuf, index_name: &str) -> anyhow::Result<()> {
|
||||
|
||||
bail!("Target index {index_name} not found!")
|
||||
}
|
||||
|
||||
fn export_documents(
|
||||
db_path: PathBuf,
|
||||
index_name: String,
|
||||
ignore_vectors: bool,
|
||||
offset: Option<usize>,
|
||||
) -> anyhow::Result<()> {
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
let rtxn = env.read_txn()?;
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &rtxn, "index-mapping")?;
|
||||
|
||||
for result in index_mapping.iter(&rtxn)? {
|
||||
let (uid, uuid) = result?;
|
||||
if uid == index_name {
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
let index =
|
||||
Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| {
|
||||
format!("While trying to open the index at path {:?}", index_path.display())
|
||||
})?;
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
let embedding_configs = index.embedding_configs(&rtxn)?;
|
||||
|
||||
if let Some(offset) = offset {
|
||||
eprintln!("Skipping {offset} documents");
|
||||
}
|
||||
|
||||
let mut stdout = BufWriter::new(std::io::stdout());
|
||||
let all_documents = index.documents_ids(&rtxn)?.into_iter().skip(offset.unwrap_or(0));
|
||||
for (i, ret) in index.iter_documents(&rtxn, all_documents)?.enumerate() {
|
||||
let (id, doc) = ret?;
|
||||
let mut document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
|
||||
if i % 10_000 == 0 {
|
||||
eprintln!("Starting the {}th document", i + offset.unwrap_or(0));
|
||||
}
|
||||
|
||||
if !ignore_vectors {
|
||||
'inject_vectors: {
|
||||
let embeddings = index.embeddings(&rtxn, id)?;
|
||||
|
||||
if embeddings.is_empty() {
|
||||
break 'inject_vectors;
|
||||
}
|
||||
|
||||
let vectors = document
|
||||
.entry(RESERVED_VECTORS_FIELD_NAME)
|
||||
.or_insert(Object(Default::default()));
|
||||
|
||||
let Object(vectors) = vectors else {
|
||||
return Err(meilisearch_types::milli::Error::UserError(
|
||||
meilisearch_types::milli::UserError::InvalidVectorsMapType {
|
||||
document_id: {
|
||||
if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(&rtxn, std::iter::once(id))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={id}")
|
||||
}
|
||||
},
|
||||
value: vectors.clone(),
|
||||
},
|
||||
)
|
||||
.into());
|
||||
};
|
||||
|
||||
for (embedder_name, embeddings) in embeddings {
|
||||
let user_provided = embedding_configs
|
||||
.iter()
|
||||
.find(|conf| conf.name == embedder_name)
|
||||
.is_some_and(|conf| conf.user_provided.contains(id));
|
||||
|
||||
let embeddings = ExplicitVectors {
|
||||
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
||||
embeddings,
|
||||
)),
|
||||
regenerate: !user_provided,
|
||||
};
|
||||
vectors
|
||||
.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut stdout, &document)?;
|
||||
}
|
||||
|
||||
stdout.flush()?;
|
||||
} else {
|
||||
eprintln!("Found index {uid} but it's not the right index...");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn hair_dryer(
|
||||
db_path: PathBuf,
|
||||
index_names: &[String],
|
||||
index_parts: &[IndexPart],
|
||||
) -> anyhow::Result<()> {
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
eprintln!("Trying to get a read transaction on the index scheduler...");
|
||||
|
||||
let rtxn = env.read_txn()?;
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &rtxn, "index-mapping")?;
|
||||
|
||||
for result in index_mapping.iter(&rtxn)? {
|
||||
let (uid, uuid) = result?;
|
||||
if index_names.iter().any(|i| i == uid) {
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
let index =
|
||||
Index::new(EnvOpenOptions::new(), &index_path, false).with_context(|| {
|
||||
format!("While trying to open the index at path {:?}", index_path.display())
|
||||
})?;
|
||||
|
||||
eprintln!("Trying to get a read transaction on the {uid} index...");
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
for part in index_parts {
|
||||
match part {
|
||||
IndexPart::Arroy => {
|
||||
let mut count = 0;
|
||||
let total = index.vector_arroy.len(&rtxn)?;
|
||||
eprintln!("Hair drying arroy for {uid}...");
|
||||
for (i, result) in index
|
||||
.vector_arroy
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.iter(&rtxn)?
|
||||
.enumerate()
|
||||
{
|
||||
let (key, value) = result?;
|
||||
|
||||
// All of this just to avoid compiler optimizations 🤞
|
||||
// We must read all the bytes to make the pages hot in cache.
|
||||
// <https://doc.rust-lang.org/std/hint/fn.black_box.html>
|
||||
count += std::hint::black_box(key.iter().fold(0, |acc, _| acc + 1));
|
||||
count += std::hint::black_box(value.iter().fold(0, |acc, _| acc + 1));
|
||||
|
||||
if i % 10_000 == 0 {
|
||||
let perc = (i as f64) / (total as f64) * 100.0;
|
||||
eprintln!("Visited {i}/{total} ({perc:.2}%) keys")
|
||||
}
|
||||
}
|
||||
eprintln!("Done hair drying a total of at least {count} bytes.");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
eprintln!("Found index {uid} but it's not the right index...");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,120 +1,121 @@
|
||||
[package]
|
||||
name = "milli"
|
||||
edition = "2021"
|
||||
name = "milli"
|
||||
publish = false
|
||||
|
||||
version.workspace = true
|
||||
authors.workspace = true
|
||||
description.workspace = true
|
||||
homepage.workspace = true
|
||||
readme.workspace = true
|
||||
version.workspace = true
|
||||
# edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
big_s = "1.0.2"
|
||||
bimap = { version = "0.6.3", features = ["serde"] }
|
||||
bimap = {version = "0.6.3", features = ["serde"]}
|
||||
bincode = "1.3.3"
|
||||
bstr = "1.11.3"
|
||||
bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] }
|
||||
bytemuck = {version = "1.21.0", features = ["extern_crate_alloc"]}
|
||||
byteorder = "1.5.0"
|
||||
charabia = { version = "0.9.2", default-features = false }
|
||||
charabia = {version = "0.9.2", default-features = false}
|
||||
concat-arrays = "0.1.2"
|
||||
convert_case = "0.6.0"
|
||||
crossbeam-channel = "0.5.14"
|
||||
deserr = "0.6.3"
|
||||
either = { version = "1.13.0", features = ["serde"] }
|
||||
flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
either = {version = "1.13.0", features = ["serde"]}
|
||||
flatten-serde-json = {path = "../flatten-serde-json"}
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.5.0", default-features = false, features = ["rayon", "tempfile"] }
|
||||
heed = { version = "0.20.5", default-features = false, features = [
|
||||
"serde-json",
|
||||
"serde-bincode",
|
||||
"read-txn-no-tls",
|
||||
] }
|
||||
indexmap = { version = "2.7.0", features = ["serde"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
grenad = {version = "0.5.0", default-features = false, features = ["rayon", "tempfile"]}
|
||||
heed = {version = "0.20.5", default-features = false, features = [
|
||||
"serde-json",
|
||||
"serde-bincode",
|
||||
"read-txn-no-tls",
|
||||
]}
|
||||
indexmap = {version = "2.7.0", features = ["serde"]}
|
||||
json-depth-checker = {path = "../json-depth-checker"}
|
||||
levenshtein_automata = {version = "0.2.1", features = ["fst_automaton"]}
|
||||
memchr = "2.7.4"
|
||||
memmap2 = "0.9.5"
|
||||
obkv = "0.3.0"
|
||||
once_cell = "1.20.2"
|
||||
ordered-float = "4.6.0"
|
||||
rayon = "1.10.0"
|
||||
roaring = { version = "0.10.10", features = ["serde"] }
|
||||
rstar = { version = "0.12.2", features = ["serde"] }
|
||||
serde = { version = "1.0.217", features = ["derive"] }
|
||||
serde_json = { version = "1.0.135", features = ["preserve_order", "raw_value"] }
|
||||
roaring = {version = "0.10.10", features = ["serde"]}
|
||||
rstar = {version = "0.12.2", features = ["serde"]}
|
||||
serde = {version = "1.0.217", features = ["derive"]}
|
||||
serde_json = {version = "1.0.135", features = ["preserve_order", "raw_value"]}
|
||||
slice-group-by = "0.3.1"
|
||||
smallstr = { version = "0.3.0", features = ["serde"] }
|
||||
smallstr = {version = "0.3.0", features = ["serde"]}
|
||||
smallvec = "1.13.2"
|
||||
smartstring = "1.0.1"
|
||||
tempfile = "3.15.0"
|
||||
thiserror = "2.0.9"
|
||||
time = { version = "0.3.37", features = [
|
||||
"serde-well-known",
|
||||
"formatting",
|
||||
"parsing",
|
||||
"macros",
|
||||
] }
|
||||
uuid = { version = "1.11.0", features = ["v4"] }
|
||||
time = {version = "0.3.37", features = [
|
||||
"serde-well-known",
|
||||
"formatting",
|
||||
"parsing",
|
||||
"macros",
|
||||
]}
|
||||
uuid = {version = "1.11.0", features = ["v4"]}
|
||||
|
||||
filter-parser = { path = "../filter-parser" }
|
||||
filter-parser = {path = "../filter-parser"}
|
||||
scoped_thread_pool = {path = "/home/dureuill/dev/scoped_thread_pool"}
|
||||
|
||||
# documents words self-join
|
||||
itertools = "0.14.0"
|
||||
|
||||
csv = "1.3.1"
|
||||
candle-core = { version = "0.8.2" }
|
||||
candle-transformers = { version = "0.8.2" }
|
||||
candle-nn = { version = "0.8.2" }
|
||||
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
|
||||
"onig",
|
||||
] }
|
||||
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
|
||||
"online",
|
||||
] }
|
||||
tiktoken-rs = "0.6.0"
|
||||
liquid = "0.26.9"
|
||||
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [
|
||||
"serde",
|
||||
"no_module",
|
||||
"no_custom_syntax",
|
||||
"no_time",
|
||||
"sync",
|
||||
] }
|
||||
allocator-api2 = "0.2.21"
|
||||
arroy = "0.5.0"
|
||||
rand = "0.8.5"
|
||||
tracing = "0.1.41"
|
||||
ureq = { version = "2.12.1", features = ["json"] }
|
||||
url = "2.5.4"
|
||||
rayon-par-bridge = "0.1.0"
|
||||
hashbrown = "0.15.2"
|
||||
bbqueue = {git = "https://github.com/meilisearch/bbqueue"}
|
||||
bumpalo = "3.16.0"
|
||||
bumparaw-collections = "0.1.4"
|
||||
thread_local = "1.1.8"
|
||||
allocator-api2 = "0.2.21"
|
||||
rustc-hash = "2.1.0"
|
||||
uell = "0.1.0"
|
||||
candle-core = {version = "0.8.2"}
|
||||
candle-nn = {version = "0.8.2"}
|
||||
candle-transformers = {version = "0.8.2"}
|
||||
csv = "1.3.1"
|
||||
enum-iterator = "2.1.0"
|
||||
bbqueue = { git = "https://github.com/meilisearch/bbqueue" }
|
||||
flume = { version = "0.11.1", default-features = false }
|
||||
utoipa = { version = "5.3.1", features = ["non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"] }
|
||||
flume = {version = "0.11.1", default-features = false}
|
||||
hashbrown = "0.15.2"
|
||||
hf-hub = {git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
|
||||
"online",
|
||||
]}
|
||||
liquid = "0.26.9"
|
||||
rand = "0.8.5"
|
||||
rayon-par-bridge = "0.1.0"
|
||||
rhai = {git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [
|
||||
"serde",
|
||||
"no_module",
|
||||
"no_custom_syntax",
|
||||
"no_time",
|
||||
"sync",
|
||||
]}
|
||||
rustc-hash = "2.1.0"
|
||||
thread_local = "1.1.8"
|
||||
tiktoken-rs = "0.6.0"
|
||||
tokenizers = {git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
|
||||
"onig",
|
||||
]}
|
||||
tracing = "0.1.41"
|
||||
uell = "0.1.0"
|
||||
ureq = {version = "2.12.1", features = ["json"]}
|
||||
url = "2.5.4"
|
||||
utoipa = {version = "5.3.1", features = ["non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"]}
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.43", default-features = false }
|
||||
mimalloc = {version = "0.1.43", default-features = false}
|
||||
# fixed version due to format breakages in v1.40
|
||||
insta = "=1.39.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
meili-snap = {path = "../meili-snap"}
|
||||
rand = {version = "0.8.5", features = ["small_rng"]}
|
||||
|
||||
[features]
|
||||
all-tokenizations = [
|
||||
"charabia/default",
|
||||
"charabia/default",
|
||||
]
|
||||
|
||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||
|
||||
@@ -515,3 +515,68 @@ fn conditionally_lookup_for_error_message() {
|
||||
assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
|
||||
}
|
||||
}
|
||||
|
||||
impl Error {
|
||||
pub fn from_scoped_thread_pool_error(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<Self>,
|
||||
thread_id: usize,
|
||||
error: scoped_thread_pool::Error<Self>,
|
||||
) -> Self {
|
||||
match error {
|
||||
scoped_thread_pool::Error::Err(error) => error,
|
||||
scoped_thread_pool::Error::Panic(payload)
|
||||
| scoped_thread_pool::Error::ThreadExited(Some(payload)) => {
|
||||
let msg = match payload.downcast_ref::<&'static str>() {
|
||||
Some(s) => *s,
|
||||
None => match payload.downcast_ref::<String>() {
|
||||
Some(s) => &s[..],
|
||||
None => "Box<dyn Any>",
|
||||
},
|
||||
};
|
||||
tracing::error!(
|
||||
thread_name = thread_pool.thread_name(thread_id),
|
||||
"Thread panicked with {msg}"
|
||||
);
|
||||
Error::InternalError(InternalError::PanicInThreadPool(PanicCatched))
|
||||
}
|
||||
scoped_thread_pool::Error::ThreadExited(None) => {
|
||||
Error::InternalError(InternalError::PanicInThreadPool(PanicCatched))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_scoped_thread_pool_errors(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<Self>,
|
||||
value: scoped_thread_pool::Errors<Error>,
|
||||
) -> Self {
|
||||
// iterate all errors, keeping the "max" one
|
||||
// such that AbortedIndexing < regular error < panic
|
||||
let mut max = None;
|
||||
for (thread_id, error) in value.0 {
|
||||
max = match (max, error) {
|
||||
(None, error) => Some((thread_id, error)),
|
||||
(max @ Some((_, scoped_thread_pool::Error::Panic(_))), _) => max,
|
||||
(_, new @ scoped_thread_pool::Error::Panic(_)) => Some((thread_id, new)),
|
||||
(max @ Some((_, scoped_thread_pool::Error::ThreadExited(Some(_)))), _) => max,
|
||||
(_, new @ scoped_thread_pool::Error::ThreadExited(Some(_))) => {
|
||||
Some((thread_id, new))
|
||||
}
|
||||
(max @ Some((_, scoped_thread_pool::Error::ThreadExited(None))), _) => max,
|
||||
(_, new @ scoped_thread_pool::Error::ThreadExited(None)) => Some((thread_id, new)),
|
||||
(
|
||||
Some((
|
||||
_,
|
||||
scoped_thread_pool::Error::Err(Error::InternalError(
|
||||
InternalError::AbortedIndexation,
|
||||
)),
|
||||
)),
|
||||
new,
|
||||
) => Some((thread_id, new)),
|
||||
(max @ Some((_, scoped_thread_pool::Error::Err(_))), _) => max,
|
||||
};
|
||||
}
|
||||
// Errors never have an empty list
|
||||
let (thread_id, error) = max.unwrap();
|
||||
Self::from_scoped_thread_pool_error(thread_pool, thread_id, error)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
|
||||
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
|
||||
use crate::order_by_map::OrderByMap;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
|
||||
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
|
||||
use crate::{
|
||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
|
||||
@@ -1731,6 +1731,18 @@ impl Index {
|
||||
let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
|
||||
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
|
||||
}
|
||||
|
||||
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
|
||||
let mut stats = ArroyStats::default();
|
||||
let embedding_configs = self.embedding_configs(rtxn)?;
|
||||
for config in embedding_configs {
|
||||
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
|
||||
let reader =
|
||||
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
|
||||
reader.aggregate_stats(rtxn, &mut stats)?;
|
||||
}
|
||||
Ok(stats)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
@@ -1776,6 +1788,7 @@ pub(crate) mod tests {
|
||||
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
||||
use crate::progress::Progress;
|
||||
use crate::update::new::indexer;
|
||||
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use crate::update::settings::InnerIndexSettings;
|
||||
use crate::update::{
|
||||
self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings,
|
||||
@@ -1825,7 +1838,7 @@ pub(crate) mod tests {
|
||||
) -> Result<(), crate::error::Error> {
|
||||
let local_pool;
|
||||
let indexer_config = &self.indexer_config;
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||
@@ -1833,6 +1846,11 @@ pub(crate) mod tests {
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
|
||||
};
|
||||
|
||||
let rtxn = self.inner.read_txn()?;
|
||||
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
@@ -1852,29 +1870,28 @@ pub(crate) mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)?;
|
||||
|
||||
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
|
||||
return Err(error.into());
|
||||
}
|
||||
|
||||
pool.install(|| {
|
||||
indexer::index(
|
||||
wtxn,
|
||||
&self.inner,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
primary_key,
|
||||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&Progress::default(),
|
||||
)
|
||||
})
|
||||
.unwrap()?;
|
||||
|
||||
indexer::index(
|
||||
wtxn,
|
||||
&self.inner,
|
||||
thread_pool,
|
||||
&pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
primary_key,
|
||||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&Progress::default(),
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1913,7 +1930,7 @@ pub(crate) mod tests {
|
||||
) -> Result<(), crate::error::Error> {
|
||||
let local_pool;
|
||||
let indexer_config = &self.indexer_config;
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||
@@ -1921,6 +1938,11 @@ pub(crate) mod tests {
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
|
||||
};
|
||||
|
||||
let rtxn = self.inner.read_txn()?;
|
||||
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
@@ -1943,28 +1965,28 @@ pub(crate) mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)?;
|
||||
|
||||
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
|
||||
return Err(error.into());
|
||||
}
|
||||
|
||||
pool.install(|| {
|
||||
indexer::index(
|
||||
wtxn,
|
||||
&self.inner,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
primary_key,
|
||||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&Progress::default(),
|
||||
)
|
||||
})
|
||||
.unwrap()?;
|
||||
indexer::index(
|
||||
wtxn,
|
||||
&self.inner,
|
||||
thread_pool,
|
||||
&pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
primary_key,
|
||||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&Progress::default(),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1993,7 +2015,7 @@ pub(crate) mod tests {
|
||||
|
||||
let local_pool;
|
||||
let indexer_config = &index.indexer_config;
|
||||
let pool = match &indexer_config.thread_pool {
|
||||
let pool = match &indexer_config.rayon_thread_pool {
|
||||
Some(pool) => pool,
|
||||
None => {
|
||||
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
|
||||
@@ -2001,6 +2023,11 @@ pub(crate) mod tests {
|
||||
}
|
||||
};
|
||||
|
||||
let thread_pool = match &indexer_config.thread_pool {
|
||||
Some(thread_pool) => thread_pool,
|
||||
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
|
||||
};
|
||||
|
||||
let rtxn = index.inner.read_txn().unwrap();
|
||||
let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
@@ -2024,6 +2051,8 @@ pub(crate) mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -2034,7 +2063,8 @@ pub(crate) mod tests {
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
thread_pool,
|
||||
&pool,
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
|
||||
@@ -7,6 +7,7 @@ use maplit::{btreemap, hashset};
|
||||
|
||||
use crate::progress::Progress;
|
||||
use crate::update::new::indexer;
|
||||
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{db_snap, Criterion, Index};
|
||||
@@ -65,6 +66,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
// index documents
|
||||
indexer.add_documents(&payload).unwrap();
|
||||
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let (document_changes, operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
@@ -75,6 +79,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -85,6 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -28,7 +28,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
|
||||
@@ -23,7 +23,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
|
||||
@@ -55,7 +55,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
facet_search: bool,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
@@ -145,7 +145,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
|
||||
@@ -44,7 +44,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
|
||||
@@ -26,7 +26,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
|
||||
@@ -35,7 +35,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
|
||||
@@ -39,7 +39,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
|
||||
@@ -24,7 +24,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let max_memory = indexer.max_memory_by_rayon_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
|
||||
@@ -119,7 +119,11 @@ impl GrenadParameters {
|
||||
///
|
||||
/// This should be called inside of a rayon thread pool,
|
||||
/// otherwise, it will take the global number of threads.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
pub fn max_memory_by_thread(&self, thread_count: usize) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| (max_memory / thread_count))
|
||||
}
|
||||
|
||||
pub fn max_memory_by_rayon_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| (max_memory / rayon::current_num_threads()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -227,7 +227,7 @@ where
|
||||
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
|
||||
|
||||
let backup_pool;
|
||||
let pool = match self.indexer_config.thread_pool {
|
||||
let pool = match self.indexer_config.rayon_thread_pool {
|
||||
Some(ref pool) => pool,
|
||||
None => {
|
||||
// We initialize a backup pool with the default
|
||||
@@ -770,6 +770,7 @@ mod tests {
|
||||
use crate::progress::Progress;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::new::indexer;
|
||||
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use crate::update::Setting;
|
||||
use crate::{db_snap, Filter, Search, UserError};
|
||||
|
||||
@@ -1967,6 +1968,8 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string()),
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -2115,6 +2118,9 @@ mod tests {
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
indexer.add_documents(&documents).unwrap();
|
||||
indexer.delete_documents(&["2"]);
|
||||
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let (document_changes, _operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
&indexer_alloc,
|
||||
@@ -2124,12 +2130,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2177,6 +2186,9 @@ mod tests {
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
|
||||
let (document_changes, _operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
&indexer_alloc,
|
||||
@@ -2186,12 +2198,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2229,6 +2244,8 @@ mod tests {
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
|
||||
indexer.add_documents(&documents).unwrap();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
|
||||
let (document_changes, _operation_stats, primary_key) = indexer
|
||||
.into_changes(
|
||||
@@ -2239,12 +2256,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2291,12 +2311,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2327,6 +2350,8 @@ mod tests {
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
|
||||
indexer.delete_documents(&["1", "2"]);
|
||||
|
||||
@@ -2345,12 +2370,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2382,6 +2410,8 @@ mod tests {
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
|
||||
|
||||
indexer.delete_documents(&["1", "2", "1", "2"]);
|
||||
@@ -2404,12 +2434,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2440,6 +2473,8 @@ mod tests {
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
|
||||
|
||||
let documents = documents!([
|
||||
@@ -2456,12 +2491,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2508,12 +2546,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2683,6 +2724,8 @@ mod tests {
|
||||
|
||||
let indexer_alloc = Bump::new();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
|
||||
// OP
|
||||
@@ -2702,12 +2745,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2761,12 +2807,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
@@ -2817,12 +2866,15 @@ mod tests {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index.inner,
|
||||
&thread_pool,
|
||||
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
indexer_config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -11,7 +11,8 @@ pub struct IndexerConfig {
|
||||
pub max_memory: Option<usize>,
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub thread_pool: Option<ThreadPoolNoAbort>,
|
||||
pub rayon_thread_pool: Option<ThreadPoolNoAbort>,
|
||||
pub thread_pool: Option<scoped_thread_pool::ThreadPool<crate::Error>>,
|
||||
pub max_positions_per_attributes: Option<u32>,
|
||||
pub skip_index_budget: bool,
|
||||
}
|
||||
@@ -36,6 +37,7 @@ impl Default for IndexerConfig {
|
||||
max_memory: None,
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
rayon_thread_pool: None,
|
||||
thread_pool: None,
|
||||
max_positions_per_attributes: None,
|
||||
skip_index_budget: false,
|
||||
|
||||
@@ -51,12 +51,13 @@ const MAX_FRAME_HEADER_SIZE: usize = 9;
|
||||
/// when new stuff is available in any BBQueue buffer but we send
|
||||
/// a message in this queue only if it is empty to avoid filling
|
||||
/// the channel *and* the BBQueue.
|
||||
pub fn extractor_writer_bbqueue(
|
||||
bbbuffers: &mut Vec<BBBuffer>,
|
||||
pub fn extractor_writer_bbqueue<'a>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
bbbuffers: &'a mut Vec<BBBuffer>,
|
||||
total_bbbuffer_capacity: usize,
|
||||
channel_capacity: usize,
|
||||
) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) {
|
||||
let current_num_threads = rayon::current_num_threads();
|
||||
) -> (ExtractorBbqueueSender<'a>, WriterBbqueueReceiver<'a>) {
|
||||
let current_num_threads = thread_pool.thread_count();
|
||||
let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap();
|
||||
bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity));
|
||||
|
||||
@@ -66,12 +67,18 @@ pub fn extractor_writer_bbqueue(
|
||||
let max_grant = capacity.saturating_div(2).checked_sub(MAX_FRAME_HEADER_SIZE).unwrap();
|
||||
|
||||
let producers = ThreadLocal::with_capacity(bbbuffers.len());
|
||||
let consumers = rayon::broadcast(|bi| {
|
||||
let bbqueue = &bbbuffers[bi.index()];
|
||||
let (producer, consumer) = bbqueue.try_split_framed().unwrap();
|
||||
producers.get_or(|| FullySend(RefCell::new(producer)));
|
||||
consumer
|
||||
});
|
||||
let consumers = ThreadLocal::with_capacity(bbbuffers.len());
|
||||
thread_pool
|
||||
.broadcast(|thread_index| {
|
||||
let bbqueue: &BBBuffer = &bbbuffers[thread_index];
|
||||
let (producer, consumer) = bbqueue.try_split_framed().unwrap();
|
||||
producers.get_or(|| FullySend(RefCell::new(producer)));
|
||||
consumers.get_or(|| FullySend(consumer));
|
||||
Ok(())
|
||||
})
|
||||
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))
|
||||
.unwrap();
|
||||
let consumers: Vec<_> = consumers.into_iter().map(|consumer| consumer.0).collect();
|
||||
|
||||
let sent_messages_attempts = Arc::new(AtomicUsize::new(0));
|
||||
let blocking_sent_messages_attempts = Arc::new(AtomicUsize::new(0));
|
||||
@@ -963,28 +970,70 @@ impl GeoSender<'_, '_> {
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
|
||||
pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> {
|
||||
let database = Database::Main;
|
||||
let value_length = bitmap.serialized_size();
|
||||
let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes();
|
||||
let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
|
||||
InternalError::StorePut {
|
||||
database_name: database.database_name(),
|
||||
key: key.into(),
|
||||
value_length,
|
||||
error: MdbError::BadValSize.into(),
|
||||
}
|
||||
})?;
|
||||
/// Sends the geo-faceted documents ids `bitmap` by running a `GeoWriter`
/// workload on `thread_pool`.
///
/// The workload is built from `bitmap` and a copy of this sender (`*self`).
///
/// # Errors
///
/// Errors reported by the thread pool are converted into a single
/// `crate::Error` with `crate::Error::from_scoped_thread_pool_errors`.
pub fn set_geo_faceted(
|
||||
&self,
|
||||
bitmap: &RoaringBitmap,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> crate::Result<()> {
|
||||
let writer = GeoWriter { bitmap, channel: *self };
|
||||
thread_pool
|
||||
.execute(&writer)
|
||||
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))
|
||||
}
|
||||
}
|
||||
|
||||
self.0.write_key_value_with(
|
||||
/// Thread-pool workload that writes the geo-faceted documents ids bitmap
/// under `GEO_FACETED_DOCUMENTS_IDS_KEY` in `Database::Main`.
///
/// Its `run_task` only performs the write for thread index 0 and task index 0.
struct GeoWriter<'a, 'b> {
|
||||
/// Bitmap whose serialized form is written as the value.
bitmap: &'a RoaringBitmap,
|
||||
/// Sender whose inner channel (`channel.0`) receives the key/value write.
channel: GeoSender<'a, 'b>,
|
||||
}
|
||||
impl<'a, 'b> scoped_thread_pool::Workload<'static> for GeoWriter<'a, 'b> {
|
||||
type Context = ();
|
||||
|
||||
type Error = crate::Error;
|
||||
|
||||
fn context(
|
||||
&self,
|
||||
_thread_count: usize,
|
||||
_thread_index: usize,
|
||||
) -> Result<Self::Context, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_task(
|
||||
&self,
|
||||
_thread_count: usize,
|
||||
thread_index: usize,
|
||||
task_index: usize,
|
||||
_context: &mut Self::Context,
|
||||
) -> Option<Result<(), Self::Error>> {
|
||||
if thread_index != 0 || task_index != 0 {
|
||||
return None;
|
||||
}
|
||||
let database = Database::Main;
|
||||
let value_length = self.bitmap.serialized_size();
|
||||
let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes();
|
||||
let key_length = match key.len().try_into().ok().and_then(NonZeroU16::new) {
|
||||
Some(key_length) => key_length,
|
||||
None => {
|
||||
return Some(Err(InternalError::StorePut {
|
||||
database_name: database.database_name(),
|
||||
key: key.into(),
|
||||
value_length,
|
||||
error: MdbError::BadValSize.into(),
|
||||
}
|
||||
.into()))
|
||||
}
|
||||
};
|
||||
|
||||
Some(self.channel.0.write_key_value_with(
|
||||
database,
|
||||
key_length,
|
||||
value_length,
|
||||
|key_buffer, value_buffer| {
|
||||
key_buffer.copy_from_slice(key);
|
||||
bitmap.serialize_into(value_buffer)?;
|
||||
self.bitmap.serialize_into(value_buffer)?;
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -144,7 +144,7 @@ impl<'doc> Update<'doc> {
|
||||
)?)
|
||||
}
|
||||
|
||||
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
pub fn only_changed_fields(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
DocumentFromVersions::new(&self.new)
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ impl<'doc> Update<'doc> {
|
||||
let mut cached_current = None;
|
||||
let mut updated_selected_field_count = 0;
|
||||
|
||||
for entry in self.updated().iter_top_level_fields() {
|
||||
for entry in self.only_changed_fields().iter_top_level_fields() {
|
||||
let (key, updated_value) = entry?;
|
||||
|
||||
if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
|
||||
@@ -241,7 +241,7 @@ impl<'doc> Update<'doc> {
|
||||
Ok(has_deleted_fields)
|
||||
}
|
||||
|
||||
pub fn updated_vectors(
|
||||
pub fn only_changed_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
|
||||
@@ -38,7 +38,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b>
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory_by_thread(),
|
||||
self.grenad_parameters.max_memory_by_thread(self.buckets),
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
@@ -388,6 +388,7 @@ fn truncate_str(s: &str) -> &str {
|
||||
impl FacetedDocidsExtractor {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
@@ -412,10 +413,11 @@ impl FacetedDocidsExtractor {
|
||||
let extractor = FacetedExtractorData {
|
||||
attributes_to_extract: &attributes_to_extract,
|
||||
grenad_parameters: indexing_context.grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
buckets: thread_pool.thread_count(),
|
||||
sender,
|
||||
};
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
|
||||
@@ -21,6 +21,7 @@ use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result};
|
||||
|
||||
/// State held by the geo extractor.
pub struct GeoExtractor {
|
||||
/// Grenad parameters; queried through `max_memory_by_thread` to obtain
/// the extractor's `max_memory`.
grenad_parameters: GrenadParameters,
|
||||
/// Thread count passed to `grenad_parameters.max_memory_by_thread`.
thread_count: usize,
|
||||
}
|
||||
|
||||
impl GeoExtractor {
|
||||
@@ -28,11 +29,12 @@ impl GeoExtractor {
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
grenad_parameters: GrenadParameters,
|
||||
thread_count: usize,
|
||||
) -> Result<Option<Self>> {
|
||||
let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
|
||||
let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
|
||||
if is_sortable || is_filterable {
|
||||
Ok(Some(GeoExtractor { grenad_parameters }))
|
||||
Ok(Some(GeoExtractor { grenad_parameters, thread_count }))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
@@ -157,7 +159,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
|
||||
) -> Result<()> {
|
||||
let rtxn = &context.rtxn;
|
||||
let index = context.index;
|
||||
let max_memory = self.grenad_parameters.max_memory_by_thread();
|
||||
let max_memory = self.grenad_parameters.max_memory_by_thread(self.thread_count);
|
||||
let db_fields_ids_map = context.db_fields_ids_map;
|
||||
let mut data_ref = context.data.borrow_mut_or_yield();
|
||||
|
||||
@@ -199,7 +201,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
|
||||
.transpose()?;
|
||||
|
||||
let updated_geo = update
|
||||
.updated()
|
||||
.merged(rtxn, index, db_fields_ids_map)?
|
||||
.geo_field()?
|
||||
.map(|geo| extract_geo_coordinates(external_id, geo))
|
||||
.transpose()?;
|
||||
|
||||
@@ -5,7 +5,6 @@ mod geo;
|
||||
mod searchable;
|
||||
mod vectors;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use cache::{
|
||||
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
|
||||
};
|
||||
@@ -15,22 +14,6 @@ pub use geo::*;
|
||||
pub use searchable::*;
|
||||
pub use vectors::EmbeddingExtractor;
|
||||
|
||||
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
|
||||
use super::steps::IndexingStep;
|
||||
use super::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::Result;
|
||||
|
||||
pub trait DocidsExtractor {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync;
|
||||
}
|
||||
|
||||
/// TODO move in permissive json pointer
|
||||
pub mod perm_json_p {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory_by_thread(),
|
||||
self.grenad_parameters.max_memory_by_thread(self.buckets),
|
||||
extractor_alloc,
|
||||
))))
|
||||
}
|
||||
@@ -240,6 +240,7 @@ pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
@@ -288,10 +289,11 @@ impl WordDocidsExtractors {
|
||||
let extractor = WordDocidsExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters: indexing_context.grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
buckets: thread_pool.thread_count(),
|
||||
};
|
||||
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
|
||||
@@ -2,29 +2,62 @@ use std::cell::RefCell;
|
||||
use std::collections::VecDeque;
|
||||
use std::rc::Rc;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::indexer::document_changes::DocumentChangeContext;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::ref_cell_ext::RefCellExt as _;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractor;
|
||||
impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractor<'a> {
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory_by_thread(self.buckets),
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
self.extract_document_change(context, change)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Extractor state for word-pair-proximity docids.
pub struct WordPairProximityDocidsExtractor<'a> {
|
||||
/// Document tokenizer used by `extract_document_change`.
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
/// Grenad parameters; `max_memory_by_thread(self.buckets)` sizes the
/// `BalancedCaches` created in `init_data`.
grenad_parameters: &'a GrenadParameters,
|
||||
/// Number of buckets given to `BalancedCaches::new_in`; `run_extraction`
/// sets it to the thread pool's thread count.
buckets: usize,
|
||||
}
|
||||
|
||||
impl<'a> WordPairProximityDocidsExtractor<'a> {
|
||||
fn attributes_to_extract<'b>(
|
||||
rtxn: &'b RoTxn,
|
||||
index: &'b Index,
|
||||
) -> Result<Option<Vec<&'b str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
fn attributes_to_skip<'b>(_rtxn: &'b RoTxn, _index: &'b Index) -> Result<Vec<&'b str>> {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
@@ -32,10 +65,11 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
// and to store the docids of the documents that have a number of words in a given field
|
||||
// equal to or less than MAX_COUNTED_WORDS.
|
||||
fn extract_document_change(
|
||||
&self,
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let document_tokenizer = self.tokenizer;
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let index = context.index;
|
||||
@@ -129,6 +163,70 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Runs the word-pair-proximity extraction over `document_changes` on
/// `thread_pool` and returns the resulting caches.
///
/// Builds a tokenizer from the index's stop words, allowed separators and
/// dictionary, wraps it in a `DocumentTokenizer` together with the attributes
/// to extract/skip and the localized attributes rules, then calls `extract`
/// with an extractor whose `buckets` is the pool's thread count. The values
/// stored in the per-thread `datastore` are unwrapped from their `RefCell`
/// and collected into the returned `Vec`.
///
/// # Errors
///
/// Propagates any error from opening the read transaction, from reading the
/// settings on the index, or from `extract`.
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||
// Re-borrow the owned strings as `&str` so they can be handed to `tokenizer_builder`.
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||
let localized_attributes_rules =
|
||||
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let extractor_data: WordPairProximityDocidsExtractor = WordPairProximityDocidsExtractor {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters: indexing_context.grenad_parameters,
|
||||
buckets: thread_pool.thread_count(),
|
||||
};
|
||||
|
||||
// Filled by `extract`; consumed below to build the returned caches.
let datastore = ThreadLocal::new();
|
||||
|
||||
// Inner scope: the tracing span guard is dropped before `datastore` is consumed.
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&extractor_data,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
step,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_key<'a>(
|
||||
|
||||
@@ -1,146 +1,5 @@
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod tokenize_document;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
|
||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
use heed::RoTxn;
|
||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
|
||||
use super::cache::BalancedCaches;
|
||||
use super::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
|
||||
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
grenad_parameters: &'a GrenadParameters,
|
||||
buckets: usize,
|
||||
_ex: PhantomData<EX>,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
||||
for SearchableExtractorData<'a, EX>
|
||||
{
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory_by_thread(),
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
EX::extract_document_change(context, self.tokenizer, change)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SearchableExtractor: Sized + Sync {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||
let localized_attributes_rules =
|
||||
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters: indexing_context.grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
_ex: PhantomData,
|
||||
};
|
||||
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor_data,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
step,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()>;
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
|
||||
-> Result<Option<Vec<&'a str>>>;
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||
}
|
||||
|
||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -99,7 +99,8 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
||||
context.db_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?;
|
||||
let new_vectors =
|
||||
update.only_changed_vectors(&context.doc_alloc, self.embedders)?;
|
||||
|
||||
if let Some(new_vectors) = &new_vectors {
|
||||
unused_vectors_distribution.append(new_vectors)?;
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
use std::cell::{Cell, RefCell};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
|
||||
use super::super::document_change::DocumentChange;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::progress::{AtomicDocumentStep, Progress};
|
||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
|
||||
use crate::update::GrenadParameters;
|
||||
@@ -114,7 +113,7 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
|
||||
>: Sync {
|
||||
type Item: Send;
|
||||
|
||||
fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>>;
|
||||
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]>;
|
||||
|
||||
fn len(&self) -> usize;
|
||||
|
||||
@@ -186,9 +185,10 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
const CHUNK_SIZE: usize = 100;
|
||||
pub const CHUNK_SIZE: usize = 100;
|
||||
|
||||
pub fn extract<
|
||||
struct Extract<
|
||||
'shared, // covariant lifetime for shared borrows
|
||||
'pl, // covariant lifetime of the underlying payload
|
||||
'extractor, // invariant lifetime of extractor_alloc
|
||||
'fid, // invariant lifetime of fields ids map
|
||||
@@ -196,31 +196,121 @@ pub fn extract<
|
||||
'data, // invariant on EX::Data lifetime of datastore
|
||||
'index, // covariant lifetime of the index
|
||||
EX,
|
||||
DC,
|
||||
MSP,
|
||||
> where
|
||||
DC: DocumentChanges<'pl>,
|
||||
EX: Extractor<'extractor>,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
document_changes: &'shared DC,
|
||||
extractor: &'shared EX,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
|
||||
datastore: &'data ThreadLocal<EX::Data>,
|
||||
step: Arc<AtomicU32>,
|
||||
_marker: PhantomData<&'pl ()>,
|
||||
}
|
||||
|
||||
/// `scoped_thread_pool::Workload` implementation that drives an `Extractor`
/// over the items of a `DocumentChanges`.
///
/// Each thread gets a `DocumentChangeContext` from `context`; `run_task` then
/// processes one slice of items per call.
impl<
|
||||
'doc,
|
||||
'extractor: 'doc, // invariant lifetime of extractor_alloc
|
||||
'shared,
|
||||
'pl, // covariant lifetime of the underlying payload
|
||||
'fid: 'doc, // invariant lifetime of fields ids map
|
||||
'indexer: 'doc, // covariant lifetime of objects that are borrowed during the entire indexing
|
||||
'data: 'doc, // invariant on EX::Data lifetime of datastore
|
||||
'index: 'doc + 'indexer, // covariant lifetime of the index
|
||||
EX,
|
||||
DC: DocumentChanges<'pl>,
|
||||
MSP,
|
||||
> scoped_thread_pool::Workload<'doc>
|
||||
for Extract<'shared, 'pl, 'extractor, 'fid, 'indexer, 'data, 'index, EX, DC, MSP>
|
||||
where
|
||||
EX: Extractor<'extractor>,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
type Context = DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, EX::Data>;
|
||||
|
||||
type Error = crate::Error;
|
||||
|
||||
/// Builds the `DocumentChangeContext` used by `run_task`.
///
/// The context is created from the index, fields ids maps, allocators and
/// datastore held by `self`; the extractor data is initialised through
/// `extractor.init_data`. The thread count and thread index are unused.
fn context(
|
||||
&self,
|
||||
_thread_count: usize,
|
||||
_thread_index: usize,
|
||||
) -> std::result::Result<
|
||||
DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, EX::Data>,
|
||||
Self::Error,
|
||||
> {
|
||||
// Copy the shared reference so the `move` closure below does not capture `self`.
let extractor = self.extractor;
|
||||
DocumentChangeContext::new(
|
||||
self.indexing_context.index,
|
||||
self.indexing_context.db_fields_ids_map,
|
||||
self.indexing_context.new_fields_ids_map,
|
||||
self.extractor_allocs,
|
||||
self.indexing_context.doc_allocs,
|
||||
self.datastore,
|
||||
self.indexing_context.fields_ids_map_store,
|
||||
move |index_alloc| extractor.init_data(index_alloc),
|
||||
)
|
||||
}
|
||||
|
||||
/// Processes the items assigned to (`thread_index`, `task_index`).
///
/// Returns `None` when `document_changes.items` yields no slice for that
/// pair (presumably signalling that there is no such task; confirm
/// against the `Workload` contract). Returns
/// `Some(Err(InternalError::AbortedIndexation))` when
/// `must_stop_processing` reports `true`. Otherwise returns the result of
/// `extractor.process` wrapped in `Some`.
fn run_task(
|
||||
&self,
|
||||
_thread_count: usize,
|
||||
thread_index: usize,
|
||||
task_index: usize,
|
||||
context: &mut Self::Context,
|
||||
) -> Option<std::result::Result<(), Self::Error>> {
|
||||
let items = self.document_changes.items(thread_index, task_index)?;
|
||||
if (self.indexing_context.must_stop_processing)() {
|
||||
return Some(Err(InternalError::AbortedIndexation.into()));
|
||||
}
|
||||
|
||||
// Clean up and reuse the document-specific allocator
|
||||
context.doc_alloc.reset();
|
||||
|
||||
// Lazily convert items into document changes; items that map to `Ok(None)` are skipped.
let changes = items.iter().filter_map(|item| {
|
||||
self.document_changes.item_to_document_change(context, item).transpose()
|
||||
});
|
||||
|
||||
let res = self.extractor.process(changes, context);
|
||||
// The counter advances by the slice length whether or not `process` succeeded.
self.step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
|
||||
|
||||
// send back the doc_alloc in the pool
|
||||
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
||||
|
||||
Some(res)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub fn extract<
|
||||
'pool, // invariant lifetime of the thread pool
|
||||
'pl, // covariant lifetime of the underlying payload
|
||||
'extractor, // invariant lifetime of extractor_alloc
|
||||
'fid, // invariant lifetime of fields ids map
|
||||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing
|
||||
'data, // invariant on EX::Data lifetime of datastore
|
||||
'index, // covariant lifetime of the index
|
||||
EX,
|
||||
DC,
|
||||
MSP,
|
||||
>(
|
||||
thread_pool: &'pool scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
document_changes: &DC,
|
||||
extractor: &EX,
|
||||
IndexingContext {
|
||||
index,
|
||||
db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
doc_allocs,
|
||||
fields_ids_map_store,
|
||||
must_stop_processing,
|
||||
progress,
|
||||
grenad_parameters: _,
|
||||
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
datastore: &'data ThreadLocal<EX::Data>,
|
||||
step: IndexingStep,
|
||||
) -> Result<()>
|
||||
where
|
||||
DC: DocumentChanges<'pl>,
|
||||
EX: Extractor<'extractor>,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
{
|
||||
tracing::trace!("We are resetting the extractor allocators");
|
||||
progress.update_progress(step);
|
||||
indexing_context.progress.update_progress(step);
|
||||
// Clean up and reuse the extractor allocs
|
||||
for extractor_alloc in extractor_allocs.iter_mut() {
|
||||
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
|
||||
@@ -229,45 +319,22 @@ where
|
||||
|
||||
let total_documents = document_changes.len() as u32;
|
||||
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
|
||||
progress.update_progress(progress_step);
|
||||
indexing_context.progress.update_progress(progress_step);
|
||||
|
||||
let pi = document_changes.iter(CHUNK_SIZE);
|
||||
pi.try_arc_for_each_try_init(
|
||||
|| {
|
||||
DocumentChangeContext::new(
|
||||
index,
|
||||
db_fields_ids_map,
|
||||
new_fields_ids_map,
|
||||
extractor_allocs,
|
||||
doc_allocs,
|
||||
datastore,
|
||||
fields_ids_map_store,
|
||||
move |index_alloc| extractor.init_data(index_alloc),
|
||||
)
|
||||
},
|
||||
|context, items| {
|
||||
if (must_stop_processing)() {
|
||||
return Err(Arc::new(InternalError::AbortedIndexation.into()));
|
||||
}
|
||||
let extract = Extract {
|
||||
document_changes,
|
||||
extractor,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
datastore,
|
||||
step,
|
||||
_marker: PhantomData,
|
||||
};
|
||||
thread_pool
|
||||
.execute(&extract)
|
||||
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))?;
|
||||
|
||||
// Clean up and reuse the document-specific allocator
|
||||
context.doc_alloc.reset();
|
||||
|
||||
let items = items.as_ref();
|
||||
let changes = items.iter().filter_map(|item| {
|
||||
document_changes.item_to_document_change(context, item).transpose()
|
||||
});
|
||||
|
||||
let res = extractor.process(changes, context).map_err(Arc::new);
|
||||
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
|
||||
|
||||
// send back the doc_alloc in the pool
|
||||
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
||||
|
||||
res
|
||||
},
|
||||
)?;
|
||||
step.store(total_documents, Ordering::Relaxed);
|
||||
extract.step.store(total_documents, Ordering::Relaxed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
use bumpalo::collections::CollectIn;
|
||||
use bumpalo::Bump;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rayon::slice::ParallelSlice as _;
|
||||
use roaring::RoaringBitmap;
|
||||
use scoped_thread_pool::PartitionChunks;
|
||||
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges};
|
||||
use crate::documents::PrimaryKey;
|
||||
@@ -28,31 +27,28 @@ impl DocumentDeletion {
|
||||
self,
|
||||
indexer_alloc: &'indexer Bump,
|
||||
primary_key: PrimaryKey<'indexer>,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
chunk_size: usize,
|
||||
) -> DocumentDeletionChanges<'indexer> {
|
||||
let to_delete: bumpalo::collections::Vec<_> =
|
||||
self.to_delete.into_iter().collect_in(indexer_alloc);
|
||||
|
||||
let to_delete = to_delete.into_bump_slice();
|
||||
|
||||
let to_delete = PartitionChunks::new(to_delete, chunk_size, thread_pool.thread_count());
|
||||
|
||||
DocumentDeletionChanges { to_delete, primary_key }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentDeletionChanges<'indexer> {
|
||||
to_delete: &'indexer [DocumentId],
|
||||
to_delete: scoped_thread_pool::PartitionChunks<'indexer, DocumentId>,
|
||||
primary_key: PrimaryKey<'indexer>,
|
||||
}
|
||||
|
||||
impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
||||
type Item = DocumentId;
|
||||
|
||||
fn iter(
|
||||
&self,
|
||||
chunk_size: usize,
|
||||
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||
self.to_delete.par_chunks(chunk_size)
|
||||
}
|
||||
|
||||
fn item_to_document_change<
|
||||
'doc, // lifetime of a single `process` call
|
||||
T: MostlySend,
|
||||
@@ -78,7 +74,11 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.to_delete.len()
|
||||
self.to_delete.slice().len()
|
||||
}
|
||||
|
||||
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
|
||||
self.to_delete.partition(thread_index, task_index)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,6 +86,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
|
||||
mod test {
|
||||
use std::cell::RefCell;
|
||||
use std::marker::PhantomData;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use bumpalo::Bump;
|
||||
@@ -94,7 +95,7 @@ mod test {
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::progress::Progress;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, Extractor, IndexingContext,
|
||||
extract, DocumentChangeContext, Extractor, IndexingContext, CHUNK_SIZE,
|
||||
};
|
||||
use crate::update::new::indexer::DocumentDeletion;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
@@ -135,6 +136,9 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::new(NonZeroUsize::new(1).unwrap(), "test".into());
|
||||
|
||||
let mut deletions = DocumentDeletion::new();
|
||||
deletions.delete_documents_by_docids(Vec::<u32>::new().into_iter().collect());
|
||||
let indexer = Bump::new();
|
||||
@@ -155,8 +159,12 @@ mod test {
|
||||
|
||||
let deletion_tracker = TrackDeletion(PhantomData);
|
||||
|
||||
let changes = deletions
|
||||
.into_changes(&indexer, crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 });
|
||||
let changes = deletions.into_changes(
|
||||
&indexer,
|
||||
crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 },
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
);
|
||||
|
||||
let context = IndexingContext {
|
||||
index: &index,
|
||||
@@ -173,6 +181,7 @@ mod test {
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
extract(
|
||||
&thread_pool,
|
||||
&changes,
|
||||
&deletion_tracker,
|
||||
context,
|
||||
|
||||
@@ -6,8 +6,8 @@ use bumparaw_collections::RawMap;
|
||||
use hashbrown::hash_map::Entry;
|
||||
use heed::RoTxn;
|
||||
use memmap2::Mmap;
|
||||
use rayon::slice::ParallelSlice;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use scoped_thread_pool::PartitionChunks;
|
||||
use serde_json::value::RawValue;
|
||||
use serde_json::Deserializer;
|
||||
|
||||
@@ -57,6 +57,8 @@ impl<'pl> DocumentOperation<'pl> {
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
must_stop_processing: &MSP,
|
||||
progress: Progress,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
chunk_size: usize,
|
||||
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
|
||||
where
|
||||
MSP: Fn() -> bool,
|
||||
@@ -130,6 +132,8 @@ impl<'pl> DocumentOperation<'pl> {
|
||||
docids_version_offsets.sort_unstable_by_key(|(_, po)| method.sort_key(&po.operations));
|
||||
|
||||
let docids_version_offsets = docids_version_offsets.into_bump_slice();
|
||||
let docids_version_offsets =
|
||||
PartitionChunks::new(docids_version_offsets, chunk_size, thread_pool.thread_count());
|
||||
Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key))
|
||||
}
|
||||
}
|
||||
@@ -353,13 +357,6 @@ fn merge_version_offsets<'s, 'pl>(
|
||||
impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
||||
type Item = (&'pl str, PayloadOperations<'pl>);
|
||||
|
||||
fn iter(
|
||||
&self,
|
||||
chunk_size: usize,
|
||||
) -> impl rayon::prelude::IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||
self.docids_version_offsets.par_chunks(chunk_size)
|
||||
}
|
||||
|
||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||
&'doc self,
|
||||
context: &'doc DocumentChangeContext<T>,
|
||||
@@ -379,12 +376,16 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.docids_version_offsets.len()
|
||||
self.docids_version_offsets.slice().len()
|
||||
}
|
||||
|
||||
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
|
||||
self.docids_version_offsets.partition(thread_index, task_index)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentOperationChanges<'pl> {
|
||||
docids_version_offsets: &'pl [(&'pl str, PayloadOperations<'pl>)],
|
||||
docids_version_offsets: PartitionChunks<'pl, (&'pl str, PayloadOperations<'pl>)>,
|
||||
}
|
||||
|
||||
pub enum Payload<'pl> {
|
||||
|
||||
@@ -22,6 +22,7 @@ use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<MSP>,
|
||||
indexer_span: Span,
|
||||
@@ -47,11 +48,12 @@ where
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
let document_sender = extractor_sender.documents();
|
||||
let document_extractor = DocumentsExtractor::new(document_sender, embedders);
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&document_extractor,
|
||||
indexing_context,
|
||||
@@ -84,6 +86,7 @@ where
|
||||
let _entered = span.enter();
|
||||
|
||||
FacetedDocidsExtractor::run_extraction(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
@@ -97,6 +100,7 @@ where
|
||||
let _entered = span.enter();
|
||||
|
||||
facet_field_ids_delta = merge_and_send_facet_docids(
|
||||
thread_pool,
|
||||
caches,
|
||||
FacetDatabases::new(index),
|
||||
index,
|
||||
@@ -118,6 +122,7 @@ where
|
||||
let _entered = span.enter();
|
||||
|
||||
WordDocidsExtractors::run_extraction(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
@@ -129,6 +134,7 @@ where
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
word_docids,
|
||||
index.word_docids.remap_types(),
|
||||
index,
|
||||
@@ -142,6 +148,7 @@ where
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
word_fid_docids,
|
||||
index.word_fid_docids.remap_types(),
|
||||
index,
|
||||
@@ -155,6 +162,7 @@ where
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
exact_word_docids,
|
||||
index.exact_word_docids.remap_types(),
|
||||
index,
|
||||
@@ -168,6 +176,7 @@ where
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
word_position_docids,
|
||||
index.word_position_docids.remap_types(),
|
||||
index,
|
||||
@@ -181,6 +190,7 @@ where
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
fid_word_count_docids,
|
||||
index.field_id_word_count_docids.remap_types(),
|
||||
index,
|
||||
@@ -198,7 +208,8 @@ where
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
<WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
|
||||
WordPairProximityDocidsExtractor::run_extraction(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
@@ -211,6 +222,7 @@ where
|
||||
let _entered = span.enter();
|
||||
|
||||
merge_and_send_docids(
|
||||
thread_pool,
|
||||
caches,
|
||||
index.word_pair_proximity_docids.remap_types(),
|
||||
index,
|
||||
@@ -232,12 +244,13 @@ where
|
||||
field_distribution,
|
||||
request_threads(),
|
||||
);
|
||||
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let mut datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
{
|
||||
let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
@@ -263,17 +276,23 @@ where
|
||||
}
|
||||
|
||||
'geo: {
|
||||
let Some(extractor) = GeoExtractor::new(&rtxn, index, *indexing_context.grenad_parameters)?
|
||||
let Some(extractor) = GeoExtractor::new(
|
||||
&rtxn,
|
||||
index,
|
||||
*indexing_context.grenad_parameters,
|
||||
thread_pool.thread_count(),
|
||||
)?
|
||||
else {
|
||||
break 'geo;
|
||||
};
|
||||
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
|
||||
let _entered = span.enter();
|
||||
|
||||
extract(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
@@ -289,6 +308,7 @@ where
|
||||
index,
|
||||
extractor_sender.geo(),
|
||||
&indexing_context.must_stop_processing,
|
||||
thread_pool,
|
||||
)?;
|
||||
}
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{Once, RwLock};
|
||||
use std::thread::{self, Builder};
|
||||
|
||||
use big_s::S;
|
||||
@@ -21,7 +21,6 @@ use crate::progress::Progress;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::vector::{ArroyWrapper, EmbeddingConfigs};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
||||
use std::sync::Once;
|
||||
|
||||
pub(crate) mod de;
|
||||
pub mod document_changes;
|
||||
@@ -45,6 +44,7 @@ static LOG_MEMORY_METRICS_ONCE: Once = Once::new();
|
||||
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &'index Index,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
pool: &ThreadPoolNoAbort,
|
||||
grenad_parameters: GrenadParameters,
|
||||
db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||
@@ -105,16 +105,15 @@ where
|
||||
);
|
||||
});
|
||||
|
||||
let (extractor_sender, writer_receiver) = pool
|
||||
.install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000))
|
||||
.unwrap();
|
||||
let (extractor_sender, writer_receiver) =
|
||||
extractor_writer_bbqueue(thread_pool, &mut bbbuffers, total_bbbuffer_capacity, 1000);
|
||||
|
||||
let metadata_builder = MetadataBuilder::from_index(index, wtxn)?;
|
||||
let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder);
|
||||
let new_fields_ids_map = RwLock::new(new_fields_ids_map);
|
||||
let fields_ids_map_store = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let mut extractor_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let doc_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let fields_ids_map_store = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
let mut extractor_allocs = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
let doc_allocs = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
|
||||
let indexing_context = IndexingContext {
|
||||
index,
|
||||
@@ -140,21 +139,19 @@ where
|
||||
let document_ids = &mut document_ids;
|
||||
let extractor_handle =
|
||||
Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
||||
pool.install(move || {
|
||||
extract::extract_all(
|
||||
document_changes,
|
||||
indexing_context,
|
||||
indexer_span,
|
||||
extractor_sender,
|
||||
embedders,
|
||||
&mut extractor_allocs,
|
||||
finished_extraction,
|
||||
field_distribution,
|
||||
index_embeddings,
|
||||
document_ids,
|
||||
)
|
||||
})
|
||||
.unwrap()
|
||||
extract::extract_all(
|
||||
thread_pool,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
indexer_span,
|
||||
extractor_sender,
|
||||
embedders,
|
||||
&mut extractor_allocs,
|
||||
finished_extraction,
|
||||
field_distribution,
|
||||
index_embeddings,
|
||||
document_ids,
|
||||
)
|
||||
})?;
|
||||
|
||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
|
||||
@@ -191,19 +188,23 @@ where
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
|
||||
|
||||
build_vectors(
|
||||
index,
|
||||
wtxn,
|
||||
index_embeddings,
|
||||
&mut arroy_writers,
|
||||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
pool.install(|| {
|
||||
build_vectors(
|
||||
index,
|
||||
wtxn,
|
||||
index_embeddings,
|
||||
&mut arroy_writers,
|
||||
&indexing_context.must_stop_processing,
|
||||
)
|
||||
})
|
||||
.unwrap()?;
|
||||
|
||||
post_processing::post_process(
|
||||
indexing_context,
|
||||
wtxn,
|
||||
global_fields_ids_map,
|
||||
facet_field_ids_delta,
|
||||
thread_pool,
|
||||
)?;
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::Finalizing);
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use std::ops::DerefMut;
|
||||
|
||||
use bumparaw_collections::RawMap;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use scoped_thread_pool::ThreadPool;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges};
|
||||
@@ -14,45 +14,34 @@ use crate::update::new::thread_local::MostlySend;
|
||||
use crate::update::new::{DocumentChange, Insertion};
|
||||
use crate::{Error, InternalError, Result, UserError};
|
||||
|
||||
pub struct PartialDump<I> {
|
||||
iter: I,
|
||||
}
|
||||
pub struct PartialDump;
|
||||
|
||||
impl<I> PartialDump<I> {
|
||||
pub fn new_from_jsonlines(iter: I) -> Self {
|
||||
PartialDump { iter }
|
||||
impl PartialDump {
|
||||
pub fn new_from_jsonlines() -> Self {
|
||||
PartialDump
|
||||
}
|
||||
|
||||
pub fn into_changes<'index>(
|
||||
self,
|
||||
concurrent_available_ids: &'index ConcurrentAvailableIds,
|
||||
primary_key: &'index PrimaryKey,
|
||||
) -> PartialDumpChanges<'index, I> {
|
||||
_thread_pool: &ThreadPool<crate::Error>,
|
||||
_chunk_size: usize,
|
||||
) -> PartialDumpChanges<'index> {
|
||||
// Note for future self:
|
||||
// - We recommend sending chunks of documents in this `PartialDumpIndexer`; we therefore need to create a custom take_while_size method (that doesn't drop items).
|
||||
PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key }
|
||||
PartialDumpChanges { concurrent_available_ids, primary_key }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PartialDumpChanges<'doc, I> {
|
||||
iter: I,
|
||||
pub struct PartialDumpChanges<'doc> {
|
||||
concurrent_available_ids: &'doc ConcurrentAvailableIds,
|
||||
primary_key: &'doc PrimaryKey<'doc>,
|
||||
}
|
||||
|
||||
impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter>
|
||||
where
|
||||
Iter: IndexedParallelIterator<Item = Box<RawValue>> + Clone + Sync + 'index,
|
||||
{
|
||||
impl<'index> DocumentChanges<'index> for PartialDumpChanges<'index> {
|
||||
type Item = Box<RawValue>;
|
||||
|
||||
fn iter(
|
||||
&self,
|
||||
chunk_size: usize,
|
||||
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||
self.iter.clone().chunks(chunk_size)
|
||||
}
|
||||
|
||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||
&'doc self,
|
||||
context: &'doc DocumentChangeContext<T>,
|
||||
@@ -85,6 +74,10 @@ where
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.iter.len()
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@ pub(super) fn post_process<MSP>(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
global_fields_ids_map: GlobalFieldsIdsMap<'_>,
|
||||
facet_field_ids_delta: FacetFieldIdsDelta,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
@@ -39,7 +40,13 @@ where
|
||||
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
||||
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
|
||||
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
|
||||
compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?;
|
||||
compute_prefix_database(
|
||||
index,
|
||||
wtxn,
|
||||
prefix_delta,
|
||||
indexing_context.grenad_parameters,
|
||||
thread_pool,
|
||||
)?;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
@@ -50,16 +57,38 @@ fn compute_prefix_database(
|
||||
wtxn: &mut RwTxn,
|
||||
prefix_delta: PrefixDelta,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
let PrefixDelta { modified, deleted } = prefix_delta;
|
||||
// Compute word prefix docids
|
||||
compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
|
||||
compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters, thread_pool)?;
|
||||
// Compute exact word prefix docids
|
||||
compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
|
||||
compute_exact_word_prefix_docids(
|
||||
wtxn,
|
||||
index,
|
||||
&modified,
|
||||
&deleted,
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)?;
|
||||
// Compute word prefix fid docids
|
||||
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
|
||||
compute_word_prefix_fid_docids(
|
||||
wtxn,
|
||||
index,
|
||||
&modified,
|
||||
&deleted,
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)?;
|
||||
// Compute word prefix position docids
|
||||
compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters)
|
||||
compute_word_prefix_position_docids(
|
||||
wtxn,
|
||||
index,
|
||||
&modified,
|
||||
&deleted,
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing")]
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use bumpalo::collections::CollectIn;
|
||||
use bumpalo::Bump;
|
||||
use bumparaw_collections::RawMap;
|
||||
use rayon::iter::IndexedParallelIterator;
|
||||
use rayon::slice::ParallelSlice as _;
|
||||
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
|
||||
use roaring::RoaringBitmap;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
use scoped_thread_pool::{PartitionChunks, ThreadPool};
|
||||
|
||||
use super::document_changes::DocumentChangeContext;
|
||||
use super::DocumentChanges;
|
||||
@@ -22,14 +23,12 @@ pub struct UpdateByFunction {
|
||||
code: String,
|
||||
}
|
||||
|
||||
pub struct UpdateByFunctionChanges<'doc> {
|
||||
primary_key: &'doc PrimaryKey<'doc>,
|
||||
pub struct UpdateByFunctionChanges<'index> {
|
||||
primary_key: &'index PrimaryKey<'index>,
|
||||
engine: Engine,
|
||||
ast: AST,
|
||||
context: Option<Dynamic>,
|
||||
// It is sad that the RoaringBitmap doesn't
|
||||
// implement IndexedParallelIterator
|
||||
documents: Vec<u32>,
|
||||
documents: PartitionChunks<'index, u32>,
|
||||
}
|
||||
|
||||
impl UpdateByFunction {
|
||||
@@ -40,6 +39,9 @@ impl UpdateByFunction {
|
||||
pub fn into_changes<'index>(
|
||||
self,
|
||||
primary_key: &'index PrimaryKey,
|
||||
allocator: &'index Bump,
|
||||
thread_pool: &ThreadPool<crate::Error>,
|
||||
chunk_size: usize,
|
||||
) -> Result<UpdateByFunctionChanges<'index>> {
|
||||
let Self { documents, context, code } = self;
|
||||
|
||||
@@ -64,26 +66,19 @@ impl UpdateByFunction {
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(UpdateByFunctionChanges {
|
||||
primary_key,
|
||||
engine,
|
||||
ast,
|
||||
context,
|
||||
documents: documents.into_iter().collect(),
|
||||
})
|
||||
let documents: bumpalo::collections::Vec<'_, _> =
|
||||
documents.into_iter().collect_in(allocator);
|
||||
let documents = documents.into_bump_slice();
|
||||
|
||||
let documents = PartitionChunks::new(documents, chunk_size, thread_pool.thread_count());
|
||||
|
||||
Ok(UpdateByFunctionChanges { primary_key, engine, ast, context, documents })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
||||
type Item = u32;
|
||||
|
||||
fn iter(
|
||||
&self,
|
||||
chunk_size: usize,
|
||||
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
|
||||
self.documents.as_slice().par_chunks(chunk_size)
|
||||
}
|
||||
|
||||
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
|
||||
&self,
|
||||
context: &'doc DocumentChangeContext<T>,
|
||||
@@ -185,7 +180,11 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.documents.len()
|
||||
self.documents.slice().len()
|
||||
}
|
||||
|
||||
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
|
||||
self.documents.partition(thread_index, task_index)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use std::cell::RefCell;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use hashbrown::HashMap;
|
||||
use heed::types::Bytes;
|
||||
use heed::{Database, RoTxn};
|
||||
use memmap2::Mmap;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::channel::*;
|
||||
@@ -22,6 +22,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>(
|
||||
index: &Index,
|
||||
geo_sender: GeoSender<'_, '_>,
|
||||
must_stop_processing: &MSP,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
@@ -57,13 +58,14 @@ where
|
||||
|
||||
let rtree_mmap = unsafe { Mmap::map(&file)? };
|
||||
geo_sender.set_rtree(rtree_mmap).unwrap();
|
||||
geo_sender.set_geo_faceted(&faceted)?;
|
||||
geo_sender.set_geo_faceted(&faceted, thread_pool)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
||||
pub fn merge_and_send_docids<'extractor, MSP, D>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||
database: Database<Bytes, Bytes>,
|
||||
index: &Index,
|
||||
@@ -74,7 +76,10 @@ where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
D: DatabaseType + Sync,
|
||||
{
|
||||
transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
|
||||
let frozen_caches = Mutex::new(transpose_and_freeze_caches(&mut caches)?);
|
||||
|
||||
match thread_pool.broadcast(|thread_index| {
|
||||
let frozen = std::mem::take(frozen_caches.lock().unwrap().get_mut(thread_index).unwrap());
|
||||
let rtxn = index.read_txn()?;
|
||||
if must_stop_processing() {
|
||||
return Err(InternalError::AbortedIndexation.into());
|
||||
@@ -92,12 +97,17 @@ where
|
||||
}
|
||||
Operation::Ignore => Ok(()),
|
||||
}
|
||||
})
|
||||
})
|
||||
})?;
|
||||
Ok(())
|
||||
}) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(errors) => Err(crate::Error::from_scoped_thread_pool_errors(thread_pool, errors)),
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
|
||||
pub fn merge_and_send_facet_docids<'extractor>(
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
mut caches: Vec<BalancedCaches<'extractor>>,
|
||||
database: FacetDatabases,
|
||||
index: &Index,
|
||||
@@ -108,9 +118,15 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
||||
let max_number_count = (index.facet_id_f64_docids.len(rtxn)? / 500) as usize;
|
||||
let max_string_count = max_string_count.clamp(1000, 100_000);
|
||||
let max_number_count = max_number_count.clamp(1000, 100_000);
|
||||
transpose_and_freeze_caches(&mut caches)?
|
||||
.into_par_iter()
|
||||
.map(|frozen| {
|
||||
let transposed_frozen_caches = Mutex::new(transpose_and_freeze_caches(&mut caches)?);
|
||||
let output = Mutex::new(FacetFieldIdsDelta::new(max_string_count, max_number_count));
|
||||
thread_pool
|
||||
.broadcast(|thread_index| {
|
||||
// TODO: we can probably spare the mutex here since it is guaranteed that each thread will access its own cell of the vec
|
||||
let frozen = std::mem::take(
|
||||
transposed_frozen_caches.lock().unwrap().get_mut(thread_index).unwrap(),
|
||||
);
|
||||
|
||||
let mut facet_field_ids_delta =
|
||||
FacetFieldIdsDelta::new(max_string_count, max_number_count);
|
||||
let rtxn = index.read_txn()?;
|
||||
@@ -130,13 +146,18 @@ pub fn merge_and_send_facet_docids<'extractor>(
|
||||
Operation::Ignore => Ok(()),
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(facet_field_ids_delta)
|
||||
{
|
||||
let mut common = output.lock().unwrap();
|
||||
*common = std::mem::replace(
|
||||
&mut *common,
|
||||
FacetFieldIdsDelta::new(max_string_count, max_number_count),
|
||||
)
|
||||
.merge(facet_field_ids_delta);
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
.reduce(
|
||||
|| Ok(FacetFieldIdsDelta::new(max_string_count, max_number_count)),
|
||||
|lhs, rhs| Ok(lhs?.merge(rhs?)),
|
||||
)
|
||||
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))?;
|
||||
Ok(output.into_inner().unwrap())
|
||||
}
|
||||
|
||||
pub struct FacetDatabases<'a> {
|
||||
|
||||
@@ -26,11 +26,13 @@ impl WordPrefixDocids {
|
||||
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> WordPrefixDocids {
|
||||
WordPrefixDocids {
|
||||
database,
|
||||
prefix_database,
|
||||
max_memory_by_thread: grenad_parameters.max_memory_by_thread(),
|
||||
max_memory_by_thread: grenad_parameters
|
||||
.max_memory_by_thread(thread_pool.thread_count()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -39,9 +41,10 @@ impl WordPrefixDocids {
|
||||
wtxn: &mut heed::RwTxn,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute, thread_pool)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
@@ -49,6 +52,7 @@ impl WordPrefixDocids {
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
prefixes: &BTreeSet<Prefix>,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
// We fetch the docids associated to the newly added word prefix fst only.
|
||||
// And collect the CboRoaringBitmaps pointers in a HashMap.
|
||||
@@ -56,7 +60,7 @@ impl WordPrefixDocids {
|
||||
|
||||
// We access this HashMap in parallel to compute the *union* of all
|
||||
// of them and *serialize* them into files. There is one file per CPU.
|
||||
let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let local_entries = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| {
|
||||
let refcell = local_entries.get_or(|| {
|
||||
let file = BufWriter::new(spooled_tempfile(
|
||||
@@ -162,11 +166,13 @@ impl WordPrefixIntegerDocids {
|
||||
database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> WordPrefixIntegerDocids {
|
||||
WordPrefixIntegerDocids {
|
||||
database,
|
||||
prefix_database,
|
||||
max_memory_by_thread: grenad_parameters.max_memory_by_thread(),
|
||||
max_memory_by_thread: grenad_parameters
|
||||
.max_memory_by_thread(thread_pool.thread_count()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,9 +181,10 @@ impl WordPrefixIntegerDocids {
|
||||
wtxn: &mut heed::RwTxn,
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
|
||||
self.recompute_modified_prefixes(wtxn, prefix_to_compute, thread_pool)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
@@ -185,6 +192,7 @@ impl WordPrefixIntegerDocids {
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
prefixes: &BTreeSet<Prefix>,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
// We fetch the docids associated to the newly added word prefix fst only.
|
||||
// And collect the CboRoaringBitmaps pointers in an HashMap.
|
||||
@@ -192,7 +200,7 @@ impl WordPrefixIntegerDocids {
|
||||
|
||||
// We access this HashMap in parallel to compute the *union* of all
|
||||
// of them and *serialize* them into files. There is one file by CPU.
|
||||
let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
let local_entries = ThreadLocal::with_capacity(thread_pool.thread_count());
|
||||
prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| {
|
||||
let refcell = local_entries.get_or(|| {
|
||||
let file = BufWriter::new(spooled_tempfile(
|
||||
@@ -312,13 +320,15 @@ pub fn compute_word_prefix_docids(
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
WordPrefixDocids::new(
|
||||
index.word_docids.remap_key_type(),
|
||||
index.word_prefix_docids.remap_key_type(),
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
@@ -328,13 +338,15 @@ pub fn compute_exact_word_prefix_docids(
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
WordPrefixDocids::new(
|
||||
index.exact_word_docids.remap_key_type(),
|
||||
index.exact_word_prefix_docids.remap_key_type(),
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
@@ -344,13 +356,15 @@ pub fn compute_word_prefix_fid_docids(
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
WordPrefixIntegerDocids::new(
|
||||
index.word_fid_docids.remap_key_type(),
|
||||
index.word_prefix_fid_docids.remap_key_type(),
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
|
||||
@@ -360,11 +374,13 @@ pub fn compute_word_prefix_position_docids(
|
||||
prefix_to_compute: &BTreeSet<Prefix>,
|
||||
prefix_to_delete: &BTreeSet<Prefix>,
|
||||
grenad_parameters: &GrenadParameters,
|
||||
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
|
||||
) -> Result<()> {
|
||||
WordPrefixIntegerDocids::new(
|
||||
index.word_position_docids.remap_key_type(),
|
||||
index.word_prefix_position_docids.remap_key_type(),
|
||||
grenad_parameters,
|
||||
thread_pool,
|
||||
)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete)
|
||||
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
mod v1_12;
|
||||
mod v1_13;
|
||||
|
||||
use heed::RwTxn;
|
||||
use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3};
|
||||
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
|
||||
use v1_13::V1_13_0_To_Current;
|
||||
|
||||
use crate::progress::{Progress, VariableNameStep};
|
||||
use crate::{Index, InternalError, Result};
|
||||
@@ -26,11 +28,13 @@ pub fn upgrade(
|
||||
progress: Progress,
|
||||
) -> Result<bool> {
|
||||
let from = index.get_version(wtxn)?.unwrap_or(db_version);
|
||||
let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()];
|
||||
let upgrade_functions: &[&dyn UpgradeIndex] =
|
||||
&[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()];
|
||||
|
||||
let start = match from {
|
||||
(1, 12, 0..=2) => 0,
|
||||
(1, 12, 3..) => 1,
|
||||
(1, 13, 0) => 2,
|
||||
// We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
|
||||
(1, 13, _) => return Ok(false),
|
||||
(major, minor, patch) => {
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
use heed::RwTxn;
|
||||
|
||||
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
|
||||
use super::UpgradeIndex;
|
||||
use crate::progress::Progress;
|
||||
use crate::{make_enum_progress, Index, Result};
|
||||
|
||||
use super::UpgradeIndex;
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub(super) struct V1_12_To_V1_12_3 {}
|
||||
|
||||
@@ -33,9 +31,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 {
|
||||
}
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub(super) struct V1_12_3_To_Current();
|
||||
pub(super) struct V1_12_3_To_V1_13_0 {}
|
||||
|
||||
impl UpgradeIndex for V1_12_3_To_Current {
|
||||
impl UpgradeIndex for V1_12_3_To_V1_13_0 {
|
||||
fn upgrade(
|
||||
&self,
|
||||
_wtxn: &mut RwTxn,
|
||||
@@ -43,14 +41,11 @@ impl UpgradeIndex for V1_12_3_To_Current {
|
||||
_original: (u32, u32, u32),
|
||||
_progress: Progress,
|
||||
) -> Result<bool> {
|
||||
Ok(false)
|
||||
// recompute the indexes stats
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn target_version(&self) -> (u32, u32, u32) {
|
||||
(
|
||||
VERSION_MAJOR.parse().unwrap(),
|
||||
VERSION_MINOR.parse().unwrap(),
|
||||
VERSION_PATCH.parse().unwrap(),
|
||||
)
|
||||
(1, 13, 0)
|
||||
}
|
||||
}
|
||||
|
||||
29
crates/milli/src/update/upgrade/v1_13.rs
Normal file
29
crates/milli/src/update/upgrade/v1_13.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use heed::RwTxn;
|
||||
|
||||
use super::UpgradeIndex;
|
||||
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
|
||||
use crate::progress::Progress;
|
||||
use crate::{Index, Result};
|
||||
|
||||
#[allow(non_camel_case_types)]
|
||||
pub(super) struct V1_13_0_To_Current();
|
||||
|
||||
impl UpgradeIndex for V1_13_0_To_Current {
|
||||
fn upgrade(
|
||||
&self,
|
||||
_wtxn: &mut RwTxn,
|
||||
_index: &Index,
|
||||
_original: (u32, u32, u32),
|
||||
_progress: Progress,
|
||||
) -> Result<bool> {
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
fn target_version(&self) -> (u32, u32, u32) {
|
||||
(
|
||||
VERSION_MAJOR.parse().unwrap(),
|
||||
VERSION_MINOR.parse().unwrap(),
|
||||
VERSION_PATCH.parse().unwrap(),
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -410,8 +410,43 @@ impl ArroyWrapper {
|
||||
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
|
||||
self.database.remap_data_type()
|
||||
}
|
||||
|
||||
pub fn aggregate_stats(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
stats: &mut ArroyStats,
|
||||
) -> Result<(), arroy::Error> {
|
||||
if self.quantized {
|
||||
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||
let reader = reader?;
|
||||
let documents = reader.item_ids();
|
||||
if documents.is_empty() {
|
||||
break;
|
||||
}
|
||||
stats.documents |= documents;
|
||||
stats.number_of_embeddings += documents.len();
|
||||
}
|
||||
} else {
|
||||
for reader in self.readers(rtxn, self.angular_db()) {
|
||||
let reader = reader?;
|
||||
let documents = reader.item_ids();
|
||||
if documents.is_empty() {
|
||||
break;
|
||||
}
|
||||
stats.documents |= documents;
|
||||
stats.number_of_embeddings += documents.len();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct ArroyStats {
|
||||
pub number_of_embeddings: u64,
|
||||
pub documents: RoaringBitmap,
|
||||
}
|
||||
/// One or multiple embeddings stored consecutively in a flat vector.
|
||||
pub struct Embeddings<F> {
|
||||
data: Vec<F>,
|
||||
|
||||
@@ -130,6 +130,7 @@ impl Embedder {
|
||||
let client = ureq::AgentBuilder::new()
|
||||
.max_idle_connections(REQUEST_PARALLELISM * 2)
|
||||
.max_idle_connections_per_host(REQUEST_PARALLELISM * 2)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.build();
|
||||
|
||||
let request = Request::new(options.request)?;
|
||||
|
||||
@@ -5,6 +5,7 @@ use maplit::hashset;
|
||||
use milli::documents::mmap_from_objects;
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::{FacetDistribution, Index, Object, OrderBy};
|
||||
@@ -36,6 +37,8 @@ fn test_facet_distribution_with_no_facet_values() {
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
|
||||
let doc1: Object = from_value(
|
||||
@@ -59,12 +62,15 @@ fn test_facet_distribution_with_no_facet_values() {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -9,6 +9,7 @@ use heed::EnvOpenOptions;
|
||||
use maplit::{btreemap, hashset};
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy};
|
||||
@@ -72,6 +73,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
|
||||
let mut file = tempfile::tempfile().unwrap();
|
||||
@@ -92,6 +95,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -102,6 +107,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -7,6 +7,7 @@ use itertools::Itertools;
|
||||
use maplit::hashset;
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
|
||||
@@ -288,6 +289,8 @@ fn criteria_ascdesc() {
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
|
||||
let mut file = tempfile::tempfile().unwrap();
|
||||
@@ -328,12 +331,15 @@ fn criteria_ascdesc() {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -5,6 +5,7 @@ use heed::EnvOpenOptions;
|
||||
use milli::documents::mmap_from_objects;
|
||||
use milli::progress::Progress;
|
||||
use milli::update::new::indexer;
|
||||
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
|
||||
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
|
||||
use milli::vector::EmbeddingConfigs;
|
||||
use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy};
|
||||
@@ -123,6 +124,8 @@ fn test_typo_disabled_on_word() {
|
||||
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let mut new_fields_ids_map = db_fields_ids_map.clone();
|
||||
let embedders = EmbeddingConfigs::default();
|
||||
let thread_pool =
|
||||
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
|
||||
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
|
||||
|
||||
indexer.add_documents(&documents).unwrap();
|
||||
@@ -137,12 +140,15 @@ fn test_typo_disabled_on_word() {
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
Progress::default(),
|
||||
&thread_pool,
|
||||
CHUNK_SIZE,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
indexer::index(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&thread_pool,
|
||||
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
|
||||
config.grenad_parameters(),
|
||||
&db_fields_ids_map,
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
"hackernews-modified-number-filters.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/01-modified-filters.ndjson",
|
||||
"sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802"
|
||||
"sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c"
|
||||
}
|
||||
},
|
||||
"precommands": [
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
"hackernews-modified-string-filters.ndjson": {
|
||||
"local_location": null,
|
||||
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/hackernews/modification/02-modified-filters.ndjson",
|
||||
"sha256": "b80c245ce1b1df80b9b38800f677f3bd11947ebc62716fb108269d50e796c35c"
|
||||
"sha256": "7272cbfd41110d32d7fe168424a0000f07589bfe40f664652b34f4f20aaf3802"
|
||||
}
|
||||
},
|
||||
"precommands": [
|
||||
|
||||
Reference in New Issue
Block a user