Compare commits


8 Commits

Author          SHA1        Message                              Date
Louis Dureuil   50268b930c  integrate thread_pool                2025-03-01 23:49:16 +01:00
Louis Dureuil   93ba4b924a  Use thread pool in process index op  2025-03-01 23:46:37 +01:00
Louis Dureuil   b7d5576347  benchmarks and fuzzers               2025-03-01 23:46:04 +01:00
Louis Dureuil   f67b246108  Change cargo toml                    2025-03-01 14:50:55 +01:00
Louis Dureuil   a1f60c61e8  Reasonable changes                   2025-02-26 22:16:31 +01:00
Louis Dureuil   de2fedaa9d  Use thread_pool broadcast            2025-02-26 22:12:19 +01:00
Louis Dureuil   89717ba0f1  error support                        2025-02-26 22:11:34 +01:00
Louis Dureuil   8d93de28b8  Add thread pool to cargo toml        2025-02-26 22:11:16 +01:00
82 changed files with 1281 additions and 1704 deletions

Cargo.lock (generated): 46 changed lines
View File

@@ -503,7 +503,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2
[[package]]
name = "benchmarks"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"bumpalo",
@@ -519,6 +519,7 @@ dependencies = [
"rand_chacha",
"reqwest",
"roaring",
"scoped_thread_pool",
"serde_json",
"tempfile",
]
@@ -694,7 +695,7 @@ dependencies = [
[[package]]
name = "build-info"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"time",
@@ -1671,7 +1672,7 @@ dependencies = [
[[package]]
name = "dump"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"big_s",
@@ -1873,7 +1874,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "file-store"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"tempfile",
"thiserror 2.0.9",
@@ -1895,7 +1896,7 @@ dependencies = [
[[package]]
name = "filter-parser"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"insta",
"nom",
@@ -1915,7 +1916,7 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"criterion",
"serde_json",
@@ -2054,7 +2055,7 @@ dependencies = [
[[package]]
name = "fuzzers"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"arbitrary",
"bumpalo",
@@ -2062,6 +2063,7 @@ dependencies = [
"either",
"fastrand",
"milli",
"scoped_thread_pool",
"serde",
"serde_json",
"tempfile",
@@ -2743,7 +2745,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
[[package]]
name = "index-scheduler"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -2768,6 +2770,7 @@ dependencies = [
"page_size",
"rayon",
"roaring",
"scoped_thread_pool",
"serde",
"serde_json",
"synchronoise",
@@ -2950,7 +2953,7 @@ dependencies = [
[[package]]
name = "json-depth-checker"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"criterion",
"serde_json",
@@ -3569,7 +3572,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "meili-snap"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"insta",
"md5",
@@ -3578,7 +3581,7 @@ dependencies = [
[[package]]
name = "meilisearch"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"actix-cors",
"actix-http",
@@ -3636,6 +3639,7 @@ dependencies = [
"rustls",
"rustls-pemfile",
"rustls-pki-types",
"scoped_thread_pool",
"segment",
"serde",
"serde_json",
@@ -3670,7 +3674,7 @@ dependencies = [
[[package]]
name = "meilisearch-auth"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"base64 0.22.1",
"enum-iterator",
@@ -3689,7 +3693,7 @@ dependencies = [
[[package]]
name = "meilisearch-types"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"actix-web",
"anyhow",
@@ -3723,7 +3727,7 @@ dependencies = [
[[package]]
name = "meilitool"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)",
@@ -3758,7 +3762,7 @@ dependencies = [
[[package]]
name = "milli"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"allocator-api2",
"arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -3814,6 +3818,7 @@ dependencies = [
"roaring",
"rstar",
"rustc-hash 2.1.0",
"scoped_thread_pool",
"serde",
"serde_json",
"slice-group-by",
@@ -4270,7 +4275,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permissive-json-pointer"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"big_s",
"serde_json",
@@ -5088,6 +5093,13 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "scoped_thread_pool"
version = "0.1.0"
dependencies = [
"crossbeam-channel",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@@ -6847,7 +6859,7 @@ dependencies = [
[[package]]
name = "xtask"
version = "1.13.2"
version = "1.13.0"
dependencies = [
"anyhow",
"build-info",

View File

@@ -22,7 +22,7 @@ members = [
]
[workspace.package]
version = "1.13.2"
version = "1.13.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",

View File

@@ -17,6 +17,7 @@ csv = "1.3.1"
memmap2 = "0.9.5"
milli = { path = "../milli" }
mimalloc = { version = "0.1.43", default-features = false }
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
serde_json = { version = "1.0.135", features = ["preserve_order"] }
tempfile = "3.15.0"

File diff suppressed because it is too large.

View File

@@ -2,6 +2,7 @@
use std::fs::{create_dir_all, remove_dir_all, File};
use std::io::{self, BufReader, BufWriter, Read};
use std::num::NonZeroUsize;
use std::path::Path;
use std::str::FromStr as _;
@@ -9,9 +10,11 @@ use anyhow::Context;
use bumpalo::Bump;
use criterion::BenchmarkId;
use memmap2::Mmap;
use milli::heed::EnvOpenOptions;
use milli::documents::PrimaryKey;
use milli::heed::{EnvOpenOptions, RwTxn};
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
@@ -96,28 +99,59 @@ pub fn base_setup(conf: &Conf) -> Index {
let mut wtxn = index.write_txn().unwrap();
let rtxn = index.read_txn().unwrap();
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut new_fields_ids_map = db_fields_ids_map.clone();
let new_fields_ids_map = db_fields_ids_map.clone();
let documents = documents_from(conf.dataset, conf.dataset_format);
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
indexer.add_documents(&documents).unwrap();
index_documents(
indexer,
&index,
&rtxn,
new_fields_ids_map,
&mut wtxn,
config,
db_fields_ids_map,
);
wtxn.commit().unwrap();
drop(rtxn);
index
}
pub fn index_documents(
indexer: indexer::DocumentOperation,
index: &Index,
rtxn: &milli::heed::RoTxn,
mut new_fields_ids_map: milli::FieldsIdsMap,
wtxn: &mut RwTxn,
config: IndexerConfig,
db_fields_ids_map: milli::FieldsIdsMap,
) {
let indexer_alloc = Bump::new();
let thread_count =
std::thread::available_parallelism().unwrap_or(NonZeroUsize::new(1).unwrap());
let thread_pool = scoped_thread_pool::ThreadPool::new(thread_count, "index".into());
let (document_changes, _operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
&index,
&rtxn,
index,
rtxn,
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index,
wtxn,
index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,
@@ -129,11 +163,38 @@ pub fn base_setup(conf: &Conf) -> Index {
&Progress::default(),
)
.unwrap();
}
wtxn.commit().unwrap();
drop(rtxn);
index
pub fn index_delete_documents(
indexer: indexer::DocumentDeletion,
primary_key: PrimaryKey,
wtxn: &mut RwTxn,
index: &Index,
config: &IndexerConfig,
db_fields_ids_map: milli::FieldsIdsMap,
new_fields_ids_map: milli::FieldsIdsMap,
) {
let indexer_alloc = Bump::new();
let thread_count =
std::thread::available_parallelism().unwrap_or(NonZeroUsize::new(1).unwrap());
let thread_pool = scoped_thread_pool::ThreadPool::new(thread_count, "index".into());
let document_changes =
indexer.into_changes(&indexer_alloc, primary_key, &thread_pool, CHUNK_SIZE);
indexer::index(
wtxn,
index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
Some(primary_key),
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
)
.unwrap();
}
pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
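
Both helpers above build their pool the same way. A minimal sketch of that construction pattern, assuming the local scoped_thread_pool crate vendored by this branch (the helper name below is hypothetical, and the Error type parameter follows the ThreadPool<Error> signature visible in milli's error.rs later in this compare):

use std::num::NonZeroUsize;

// Sketch only: gathers the two constructors used across this compare.
fn make_index_pool() -> scoped_thread_pool::ThreadPool<milli::Error> {
    // Explicit sizing, as in index_documents and index_delete_documents above:
    let threads = std::thread::available_parallelism()
        .unwrap_or(NonZeroUsize::new(1).unwrap());
    scoped_thread_pool::ThreadPool::new(threads, "index".into())
    // Shorthand used by the fuzzers and the scheduler:
    // scoped_thread_pool::ThreadPool::with_available_parallelism("index".into())
}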

View File

@@ -17,6 +17,7 @@ clap = { version = "4.5.24", features = ["derive"] }
either = "1.13.0"
fastrand = "2.3.0"
milli = { path = "../milli" }
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
serde = { version = "1.0.217", features = ["derive"] }
serde_json = { version = "1.0.135", features = ["preserve_order"] }
tempfile = "3.15.0"

View File

@@ -12,6 +12,7 @@ use milli::documents::mmap_from_objects;
use milli::heed::EnvOpenOptions;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig};
use milli::vector::EmbeddingConfigs;
use milli::Index;
@@ -121,6 +122,11 @@ fn main() {
}
}
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism(
"index".into(),
);
let (document_changes, _operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
@@ -130,12 +136,15 @@ fn main() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,

View File

@@ -28,6 +28,7 @@ memmap2 = "0.9.5"
page_size = "0.6.0"
rayon = "1.10.0"
roaring = { version = "0.10.10", features = ["serde"] }
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
serde = { version = "1.0.217", features = ["derive"] }
serde_json = { version = "1.0.135", features = ["preserve_order"] }
synchronoise = "1.0.1"

View File

@@ -6,7 +6,6 @@ use std::{fs, thread};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
use meilisearch_types::milli;
use meilisearch_types::milli::database_stats::DatabaseStats;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::{FieldDistribution, Index};
use serde::{Deserialize, Serialize};
@@ -99,13 +98,8 @@ pub enum IndexStatus {
/// The statistics that can be computed from an `Index` object.
#[derive(Serialize, Deserialize, Debug)]
pub struct IndexStats {
/// Stats of the documents database.
#[serde(default)]
pub documents_database_stats: DatabaseStats,
#[serde(default, skip_serializing)]
pub number_of_documents: Option<u64>,
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
@@ -144,10 +138,9 @@ impl IndexStats {
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
let arroy_stats = index.arroy_stats(rtxn)?;
Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?,
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
number_of_embedded_documents: Some(arroy_stats.documents.len()),
documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(),
number_of_documents: None,
database_size: index.on_disk_size()?,
used_database_size: index.used_size()?,
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),

View File

@@ -365,8 +365,7 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String {
let stats = mapper.stats_of(rtxn, &name).unwrap();
s.push_str(&format!(
"{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n",
stats.documents_database_stats.number_of_entries(),
stats.field_distribution
stats.number_of_documents, stats.field_distribution
));
}

View File

@@ -3,6 +3,7 @@ use bumpalo::Bump;
use meilisearch_types::heed::RwTxn;
use meilisearch_types::milli::documents::PrimaryKey;
use meilisearch_types::milli::progress::Progress;
use meilisearch_types::milli::update::new::indexer::document_changes::CHUNK_SIZE;
use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction};
use meilisearch_types::milli::update::DocumentAdditionResult;
use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder};
@@ -112,17 +113,24 @@ impl IndexScheduler {
let local_pool;
let indexer_config = self.index_mapper.indexer_config();
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|i| format!("indexing-thread-{i}"))
.thread_name(|i| format!("rayon-{i}"))
.build()
.unwrap();
&local_pool
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => {
&scoped_thread_pool::ThreadPool::with_available_parallelism("index".into())
}
};
progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges);
let (document_changes, operation_stats, primary_key) = indexer
.into_changes(
@@ -133,6 +141,8 @@ impl IndexScheduler {
&mut new_fields_ids_map,
&|| must_stop_processing.get(),
progress.clone(),
thread_pool,
CHUNK_SIZE,
)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
@@ -173,6 +183,7 @@ impl IndexScheduler {
indexer::index(
index_wtxn,
index,
thread_pool,
pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -261,7 +272,7 @@ impl IndexScheduler {
if task.error.is_none() {
let local_pool;
let indexer_config = self.index_mapper.indexer_config();
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new()
@@ -272,16 +283,19 @@ impl IndexScheduler {
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => &scoped_thread_pool::ThreadPool::with_available_parallelism(
"index".into(),
),
};
let candidates_count = candidates.len();
progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges);
let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone());
let document_changes = pool
.install(|| {
indexer
.into_changes(&primary_key)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))
})
.unwrap()?;
let document_changes = indexer
.into_changes(&primary_key, &indexer_alloc, thread_pool, CHUNK_SIZE)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
let embedders = index
.embedding_configs(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
@@ -291,6 +305,7 @@ impl IndexScheduler {
indexer::index(
index_wtxn,
index,
thread_pool,
pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -421,7 +436,7 @@ impl IndexScheduler {
if !tasks.iter().all(|res| res.error.is_some()) {
let local_pool;
let indexer_config = self.index_mapper.indexer_config();
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new()
@@ -432,11 +447,19 @@ impl IndexScheduler {
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => &scoped_thread_pool::ThreadPool::with_available_parallelism(
"index".into(),
),
};
progress.update_progress(DocumentDeletionProgress::DeleteDocuments);
let mut indexer = indexer::DocumentDeletion::new();
let candidates_count = to_delete.len();
indexer.delete_documents_by_docids(to_delete);
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
let document_changes =
indexer.into_changes(&indexer_alloc, primary_key, thread_pool, CHUNK_SIZE);
let embedders = index
.embedding_configs(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
@@ -446,6 +469,7 @@ impl IndexScheduler {
indexer::index(
index_wtxn,
index,
thread_pool,
pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -57,7 +58,7 @@ girafo: { number_of_documents: 0, field_distribution: {} }
[timestamp] [4,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, }
2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
3 {uid: 3, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
----------------------------------------------------------------------
### Status:
enqueued [0,]

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
----------------------------------------------------------------------
### Status:

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
----------------------------------------------------------------------
### Status:
@@ -37,7 +38,7 @@ catto [1,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
----------------------------------------------------------------------
@@ -40,7 +41,7 @@ doggo [2,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]

View File

@@ -1,12 +1,13 @@
---
source: crates/index-scheduler/src/scheduler/test_failure.rs
snapshot_kind: text
---
### Autobatching Enabled = true
### Processing batch None:
[]
----------------------------------------------------------------------
### All Tasks:
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 13, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }}
1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
3 {uid: 3, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }}
@@ -43,7 +44,7 @@ doggo [2,3,]
[timestamp] [0,]
----------------------------------------------------------------------
### All Batches:
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.13.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, }
----------------------------------------------------------------------
### Batch to tasks mapping:
0 [0,]

View File

@@ -910,11 +910,7 @@ fn create_and_list_index() {
[
"kefir",
{
"documents_database_stats": {
"numberOfEntries": 0,
"totalKeySize": 0,
"totalValueSize": 0
},
"number_of_documents": 0,
"database_size": "[bytes]",
"number_of_embeddings": 0,
"number_of_embedded_documents": 0,

View File

@@ -24,11 +24,10 @@ pub fn upgrade_index_scheduler(
let current_minor = to.1;
let current_patch = to.2;
let upgrade_functions: &[&dyn UpgradeIndexScheduler] = &[&ToCurrentNoOp {}];
let upgrade_functions: &[&dyn UpgradeIndexScheduler] = &[&V1_12_ToCurrent {}];
let start = match from {
(1, 12, _) => 0,
(1, 13, _) => 0,
(major, minor, patch) => {
if major > current_major
|| (major == current_major && minor > current_minor)
@@ -47,19 +46,20 @@ pub fn upgrade_index_scheduler(
}
};
let mut current_version = from;
info!("Upgrading the task queue");
let mut local_from = from;
for upgrade in upgrade_functions[start..].iter() {
let target = upgrade.target_version();
info!(
"Upgrading from v{}.{}.{} to v{}.{}.{}",
local_from.0, local_from.1, local_from.2, target.0, target.1, target.2
from.0, from.1, from.2, current_version.0, current_version.1, current_version.2
);
let mut wtxn = env.write_txn()?;
upgrade.upgrade(env, &mut wtxn, local_from)?;
upgrade.upgrade(env, &mut wtxn, from)?;
versioning.set_version(&mut wtxn, target)?;
wtxn.commit()?;
local_from = target;
current_version = target;
}
let mut wtxn = env.write_txn()?;
@@ -86,9 +86,9 @@ pub fn upgrade_index_scheduler(
}
#[allow(non_camel_case_types)]
struct ToCurrentNoOp {}
struct V1_12_ToCurrent {}
impl UpgradeIndexScheduler for ToCurrentNoOp {
impl UpgradeIndexScheduler for V1_12_ToCurrent {
fn upgrade(
&self,
_env: &Env,

View File

@@ -115,6 +115,7 @@ utoipa = { version = "5.3.1", features = [
"openapi_extensions",
] }
utoipa-scalar = { version = "0.3.0", optional = true, features = ["actix-web"] }
scoped_thread_pool = { version = "0.1.0", path = "../../../../../../../dev/scoped_thread_pool" }
[dev-dependencies]
actix-rt = "2.10.0"
@@ -169,5 +170,5 @@ german = ["meilisearch-types/german"]
turkish = ["meilisearch-types/turkish"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.17/build.zip"
sha1 = "29e92ce25f306208a9c86f013279c736bdc1e034"
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.16/build.zip"
sha1 = "68f83438a114aabbe76bc9fe480071e741996662"

View File

@@ -364,7 +364,7 @@ fn check_version(
let (bin_major, bin_minor, bin_patch) = binary_version;
let (db_major, db_minor, db_patch) = get_version(&opt.db_path)?;
if db_major != bin_major || db_minor != bin_minor || db_patch != bin_patch {
if db_major != bin_major || db_minor != bin_minor || db_patch > bin_patch {
if opt.experimental_dumpless_upgrade {
update_version_file_for_dumpless_upgrade(
opt,

View File

@@ -743,15 +743,21 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("indexing-thread:{index}"))
let rayon_thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("rayon-{index}"))
.num_threads(*other.max_indexing_threads)
.build()?;
let thread_pool = Some(scoped_thread_pool::ThreadPool::new(
NonZeroUsize::new(*other.max_indexing_threads).unwrap_or(NonZeroUsize::new(1).unwrap()),
"index".to_string(),
));
Ok(Self {
log_every_n: Some(DEFAULT_LOG_EVERY_N),
max_memory: other.max_indexing_memory.map(|b| b.as_u64() as usize),
thread_pool: Some(thread_pool),
rayon_thread_pool: Some(rayon_thread_pool),
thread_pool,
max_positions_per_attributes: None,
skip_index_budget: other.skip_index_budget,
..Default::default()

View File

@@ -494,10 +494,6 @@ pub async fn delete_index(
pub struct IndexStats {
/// Number of documents in the index
pub number_of_documents: u64,
/// Size of the documents database, in bytes.
pub raw_document_db_size: u64,
/// Average size of a document in the documents database.
pub avg_document_size: u64,
/// Whether or not the index is currently ingesting document
pub is_indexing: bool,
/// Number of embeddings in the index
@@ -514,12 +510,7 @@ pub struct IndexStats {
impl From<index_scheduler::IndexStats> for IndexStats {
fn from(stats: index_scheduler::IndexStats) -> Self {
IndexStats {
number_of_documents: stats
.inner_stats
.number_of_documents
.unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()),
raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
number_of_documents: stats.inner_stats.number_of_documents,
is_indexing: stats.is_indexing,
number_of_embeddings: stats.inner_stats.number_of_embeddings,
number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
@@ -541,8 +532,6 @@ impl From<index_scheduler::IndexStats> for IndexStats {
(status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
{
"numberOfDocuments": 10,
"rawDocumentDbSize": 10,
"avgDocumentSize": 10,
"numberOfEmbeddings": 10,
"numberOfEmbeddedDocuments": 10,
"isIndexing": true,

View File

@@ -392,9 +392,6 @@ pub struct Stats {
"indexes": {
"movies": {
"numberOfDocuments": 10,
"rawDocumentDbSize": 100,
"maxDocumentSize": 16,
"avgDocumentSize": 10,
"isIndexing": true,
"fieldDistribution": {
"genre": 10,

View File

@@ -160,8 +160,6 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 4,
"rawDocumentDbSize": 42,
"avgDocumentSize": 10,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -211,8 +209,6 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 16,
"avgDocumentSize": 8,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -281,8 +277,6 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 12,
"avgDocumentSize": 12,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -32,8 +32,6 @@ async fn import_dump_v1_movie_raw() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -189,8 +187,6 @@ async fn import_dump_v1_movie_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -359,8 +355,6 @@ async fn import_dump_v1_rubygems_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -526,8 +520,6 @@ async fn import_dump_v2_movie_raw() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -683,8 +675,6 @@ async fn import_dump_v2_movie_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -850,8 +840,6 @@ async fn import_dump_v2_rubygems_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1014,8 +1002,6 @@ async fn import_dump_v3_movie_raw() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1171,8 +1157,6 @@ async fn import_dump_v3_movie_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1338,8 +1322,6 @@ async fn import_dump_v3_rubygems_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1502,8 +1484,6 @@ async fn import_dump_v4_movie_raw() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1659,8 +1639,6 @@ async fn import_dump_v4_movie_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1826,8 +1804,6 @@ async fn import_dump_v4_rubygems_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 8606,
"avgDocumentSize": 162,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -1997,8 +1973,6 @@ async fn import_dump_v5() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 10,
"rawDocumentDbSize": 6782,
"avgDocumentSize": 678,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -2035,8 +2009,6 @@ async fn import_dump_v5() {
@r###"
{
"numberOfDocuments": 10,
"rawDocumentDbSize": 6782,
"avgDocumentSize": 678,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -128,40 +128,6 @@ async fn search_with_stop_word() {
.await;
}
#[actix_rt::test]
async fn search_with_typo_settings() {
// related to https://github.com/meilisearch/meilisearch/issues/5240
let server = Server::new().await;
let index = server.index("test");
let (_, code) = index
.update_settings(json!({"typoTolerance": { "disableOnAttributes": ["title", "id"]}}))
.await;
meili_snap::snapshot!(code, @"202 Accepted");
let documents = DOCUMENTS.clone();
let (task, _status_code) = index.add_documents(documents, None).await;
index.wait_task(task.uid()).await.succeeded();
index
.search(json!({"q": "287947" }), |response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Shazam!",
"id": "287947",
"color": [
"green",
"blue"
]
}
]
"###);
})
.await;
}
#[actix_rt::test]
async fn phrase_search_with_stop_word() {
// related to https://github.com/meilisearch/meilisearch/issues/3521

View File

@@ -113,8 +113,6 @@ async fn add_remove_embeddings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 5,
"numberOfEmbeddedDocuments": 2,
@@ -138,8 +136,6 @@ async fn add_remove_embeddings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 2,
@@ -163,8 +159,6 @@ async fn add_remove_embeddings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 2,
"numberOfEmbeddedDocuments": 2,
@@ -189,8 +183,6 @@ async fn add_remove_embeddings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 2,
"numberOfEmbeddedDocuments": 1,
@@ -239,8 +231,6 @@ async fn add_remove_embedded_documents() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 27,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 5,
"numberOfEmbeddedDocuments": 2,
@@ -260,8 +250,6 @@ async fn add_remove_embedded_documents() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 13,
"avgDocumentSize": 13,
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 1,
@@ -293,8 +281,6 @@ async fn update_embedder_settings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 108,
"avgDocumentSize": 54,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -329,8 +315,6 @@ async fn update_embedder_settings() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 108,
"avgDocumentSize": 54,
"isIndexing": false,
"numberOfEmbeddings": 3,
"numberOfEmbeddedDocuments": 2,

View File

@@ -43,7 +43,7 @@ async fn version_too_old() {
std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap();
let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.13.2");
snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.13.0");
}
#[actix_rt::test]
@@ -58,7 +58,7 @@ async fn version_requires_downgrade() {
std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap();
let options = Opt { experimental_dumpless_upgrade: true, ..default_settings };
let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err();
snapshot!(err, @"Database version 1.13.3 is higher than the Meilisearch version 1.13.2. Downgrade is not supported");
snapshot!(err, @"Database version 1.13.1 is higher than the Meilisearch version 1.13.0. Downgrade is not supported");
}
#[actix_rt::test]

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -1,5 +1,6 @@
---
source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
snapshot_kind: text
---
{
"results": [
@@ -8,7 +9,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"progress": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"stats": {
"totalNbTasks": 1,

View File

@@ -1,5 +1,6 @@
---
source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
snapshot_kind: text
---
{
"results": [
@@ -12,7 +13,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
"canceledBy": null,
"details": {
"upgradeFrom": "v1.12.0",
"upgradeTo": "v1.13.2"
"upgradeTo": "v1.13.0"
},
"error": null,
"duration": "[duration]",

View File

@@ -134,8 +134,6 @@ async fn check_the_index_scheduler(server: &Server) {
"indexes": {
"kefir": {
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -218,8 +216,6 @@ async fn check_the_index_scheduler(server: &Server) {
"indexes": {
"kefir": {
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@@ -239,8 +235,6 @@ async fn check_the_index_scheduler(server: &Server) {
snapshot!(stats, @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 109,
"avgDocumentSize": 109,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@@ -1,120 +1,121 @@
[package]
name = "milli"
edition = "2021"
name = "milli"
publish = false
version.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
readme.workspace = true
version.workspace = true
# edition.workspace = true
license.workspace = true
[dependencies]
big_s = "1.0.2"
bimap = { version = "0.6.3", features = ["serde"] }
bimap = {version = "0.6.3", features = ["serde"]}
bincode = "1.3.3"
bstr = "1.11.3"
bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] }
bytemuck = {version = "1.21.0", features = ["extern_crate_alloc"]}
byteorder = "1.5.0"
charabia = { version = "0.9.2", default-features = false }
charabia = {version = "0.9.2", default-features = false}
concat-arrays = "0.1.2"
convert_case = "0.6.0"
crossbeam-channel = "0.5.14"
deserr = "0.6.3"
either = { version = "1.13.0", features = ["serde"] }
flatten-serde-json = { path = "../flatten-serde-json" }
either = {version = "1.13.0", features = ["serde"]}
flatten-serde-json = {path = "../flatten-serde-json"}
fst = "0.4.7"
fxhash = "0.2.1"
geoutils = "0.5.1"
grenad = { version = "0.5.0", default-features = false, features = ["rayon", "tempfile"] }
heed = { version = "0.20.5", default-features = false, features = [
"serde-json",
"serde-bincode",
"read-txn-no-tls",
] }
indexmap = { version = "2.7.0", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
grenad = {version = "0.5.0", default-features = false, features = ["rayon", "tempfile"]}
heed = {version = "0.20.5", default-features = false, features = [
"serde-json",
"serde-bincode",
"read-txn-no-tls",
]}
indexmap = {version = "2.7.0", features = ["serde"]}
json-depth-checker = {path = "../json-depth-checker"}
levenshtein_automata = {version = "0.2.1", features = ["fst_automaton"]}
memchr = "2.7.4"
memmap2 = "0.9.5"
obkv = "0.3.0"
once_cell = "1.20.2"
ordered-float = "4.6.0"
rayon = "1.10.0"
roaring = { version = "0.10.10", features = ["serde"] }
rstar = { version = "0.12.2", features = ["serde"] }
serde = { version = "1.0.217", features = ["derive"] }
serde_json = { version = "1.0.135", features = ["preserve_order", "raw_value"] }
roaring = {version = "0.10.10", features = ["serde"]}
rstar = {version = "0.12.2", features = ["serde"]}
serde = {version = "1.0.217", features = ["derive"]}
serde_json = {version = "1.0.135", features = ["preserve_order", "raw_value"]}
slice-group-by = "0.3.1"
smallstr = { version = "0.3.0", features = ["serde"] }
smallstr = {version = "0.3.0", features = ["serde"]}
smallvec = "1.13.2"
smartstring = "1.0.1"
tempfile = "3.15.0"
thiserror = "2.0.9"
time = { version = "0.3.37", features = [
"serde-well-known",
"formatting",
"parsing",
"macros",
] }
uuid = { version = "1.11.0", features = ["v4"] }
time = {version = "0.3.37", features = [
"serde-well-known",
"formatting",
"parsing",
"macros",
]}
uuid = {version = "1.11.0", features = ["v4"]}
filter-parser = { path = "../filter-parser" }
filter-parser = {path = "../filter-parser"}
scoped_thread_pool = {path = "/home/dureuill/dev/scoped_thread_pool"}
# documents words self-join
itertools = "0.14.0"
csv = "1.3.1"
candle-core = { version = "0.8.2" }
candle-transformers = { version = "0.8.2" }
candle-nn = { version = "0.8.2" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
"onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
"online",
] }
tiktoken-rs = "0.6.0"
liquid = "0.26.9"
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [
"serde",
"no_module",
"no_custom_syntax",
"no_time",
"sync",
] }
allocator-api2 = "0.2.21"
arroy = "0.5.0"
rand = "0.8.5"
tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] }
url = "2.5.4"
rayon-par-bridge = "0.1.0"
hashbrown = "0.15.2"
bbqueue = {git = "https://github.com/meilisearch/bbqueue"}
bumpalo = "3.16.0"
bumparaw-collections = "0.1.4"
thread_local = "1.1.8"
allocator-api2 = "0.2.21"
rustc-hash = "2.1.0"
uell = "0.1.0"
candle-core = {version = "0.8.2"}
candle-nn = {version = "0.8.2"}
candle-transformers = {version = "0.8.2"}
csv = "1.3.1"
enum-iterator = "2.1.0"
bbqueue = { git = "https://github.com/meilisearch/bbqueue" }
flume = { version = "0.11.1", default-features = false }
utoipa = { version = "5.3.1", features = ["non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"] }
flume = {version = "0.11.1", default-features = false}
hashbrown = "0.15.2"
hf-hub = {git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
"online",
]}
liquid = "0.26.9"
rand = "0.8.5"
rayon-par-bridge = "0.1.0"
rhai = {git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = [
"serde",
"no_module",
"no_custom_syntax",
"no_time",
"sync",
]}
rustc-hash = "2.1.0"
thread_local = "1.1.8"
tiktoken-rs = "0.6.0"
tokenizers = {git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
"onig",
]}
tracing = "0.1.41"
uell = "0.1.0"
ureq = {version = "2.12.1", features = ["json"]}
url = "2.5.4"
utoipa = {version = "5.3.1", features = ["non_strict_integers", "preserve_order", "uuid", "time", "openapi_extensions"]}
[dev-dependencies]
mimalloc = { version = "0.1.43", default-features = false }
mimalloc = {version = "0.1.43", default-features = false}
# fixed version due to format breakages in v1.40
insta = "=1.39.0"
maplit = "1.0.2"
md5 = "0.7.0"
meili-snap = { path = "../meili-snap" }
rand = { version = "0.8.5", features = ["small_rng"] }
meili-snap = {path = "../meili-snap"}
rand = {version = "0.8.5", features = ["small_rng"]}
[features]
all-tokenizations = [
"charabia/default",
"charabia/default",
]
# Use POSIX semaphores instead of SysV semaphores in LMDB

View File

@@ -1,96 +0,0 @@
use heed::types::Bytes;
use heed::Database;
use heed::RoTxn;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
/// The stats of a database.
pub struct DatabaseStats {
/// The number of entries in the database.
number_of_entries: u64,
/// The total size of the keys in the database.
total_key_size: u64,
/// The total size of the values in the database.
total_value_size: u64,
}
impl DatabaseStats {
/// Returns the stats of the database.
///
/// This function iterates over the whole database and computes the stats.
/// It is not efficient and should be cached somewhere.
pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> {
let mut database_stats =
Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 };
let mut iter = database.iter(rtxn)?;
while let Some((key, value)) = iter.next().transpose()? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
database_stats.total_key_size += key_size;
database_stats.total_value_size += value_size;
}
database_stats.number_of_entries = database.len(rtxn)?;
Ok(database_stats)
}
/// Recomputes the stats of the database and returns the new stats.
///
/// This function is used to update the stats of the database when some keys are modified.
/// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states.
pub(crate) fn recompute<I, K>(
mut stats: Self,
database: Database<Bytes, Bytes>,
before_rtxn: &RoTxn<'_>,
after_rtxn: &RoTxn<'_>,
modified_keys: I,
) -> heed::Result<Self>
where
I: IntoIterator<Item = K>,
K: AsRef<[u8]>,
{
for key in modified_keys {
let key = key.as_ref();
if let Some(value) = database.get(after_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_add(key_size);
stats.total_value_size = stats.total_value_size.saturating_add(value_size);
}
if let Some(value) = database.get(before_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_sub(key_size);
stats.total_value_size = stats.total_value_size.saturating_sub(value_size);
}
}
stats.number_of_entries = database.len(after_rtxn)?;
Ok(stats)
}
pub fn average_key_size(&self) -> u64 {
self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
}
pub fn average_value_size(&self) -> u64 {
self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
}
pub fn number_of_entries(&self) -> u64 {
self.number_of_entries
}
pub fn total_key_size(&self) -> u64 {
self.total_key_size
}
pub fn total_value_size(&self) -> u64 {
self.total_value_size
}
}

View File

@@ -515,3 +515,68 @@ fn conditionally_lookup_for_error_message() {
assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
}
}
impl Error {
pub fn from_scoped_thread_pool_error(
thread_pool: &scoped_thread_pool::ThreadPool<Self>,
thread_id: usize,
error: scoped_thread_pool::Error<Self>,
) -> Self {
match error {
scoped_thread_pool::Error::Err(error) => error,
scoped_thread_pool::Error::Panic(payload)
| scoped_thread_pool::Error::ThreadExited(Some(payload)) => {
let msg = match payload.downcast_ref::<&'static str>() {
Some(s) => *s,
None => match payload.downcast_ref::<String>() {
Some(s) => &s[..],
None => "Box<dyn Any>",
},
};
tracing::error!(
thread_name = thread_pool.thread_name(thread_id),
"Thread panicked with {msg}"
);
Error::InternalError(InternalError::PanicInThreadPool(PanicCatched))
}
scoped_thread_pool::Error::ThreadExited(None) => {
Error::InternalError(InternalError::PanicInThreadPool(PanicCatched))
}
}
}
pub fn from_scoped_thread_pool_errors(
thread_pool: &scoped_thread_pool::ThreadPool<Self>,
value: scoped_thread_pool::Errors<Error>,
) -> Self {
// iterate all errors, keeping the "max" one
// such that AbortedIndexing < regular error < panic
let mut max = None;
for (thread_id, error) in value.0 {
max = match (max, error) {
(None, error) => Some((thread_id, error)),
(max @ Some((_, scoped_thread_pool::Error::Panic(_))), _) => max,
(_, new @ scoped_thread_pool::Error::Panic(_)) => Some((thread_id, new)),
(max @ Some((_, scoped_thread_pool::Error::ThreadExited(Some(_)))), _) => max,
(_, new @ scoped_thread_pool::Error::ThreadExited(Some(_))) => {
Some((thread_id, new))
}
(max @ Some((_, scoped_thread_pool::Error::ThreadExited(None))), _) => max,
(_, new @ scoped_thread_pool::Error::ThreadExited(None)) => Some((thread_id, new)),
(
Some((
_,
scoped_thread_pool::Error::Err(Error::InternalError(
InternalError::AbortedIndexation,
)),
)),
new,
) => Some((thread_id, new)),
(max @ Some((_, scoped_thread_pool::Error::Err(_))), _) => max,
};
}
// Errors never have an empty list
let (thread_id, error) = max.unwrap();
Self::from_scoped_thread_pool_error(thread_pool, thread_id, error)
}
}
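
The aggregation above boils down to a severity ordering across worker reports. A minimal standalone sketch of that ordering, written as if inside milli's error module and assuming only the three scoped_thread_pool::Error variants that appear in this diff (the severity function itself is hypothetical, not part of the change):

// Higher value = more severe; from_scoped_thread_pool_errors keeps the most severe report.
fn severity(error: &scoped_thread_pool::Error<Error>) -> u8 {
    match error {
        // A deliberately aborted indexation is the least interesting failure…
        scoped_thread_pool::Error::Err(Error::InternalError(InternalError::AbortedIndexation)) => 0,
        // …then any regular indexing error…
        scoped_thread_pool::Error::Err(_) => 1,
        // …then a thread that exited, without or with a panic payload…
        scoped_thread_pool::Error::ThreadExited(None) => 2,
        scoped_thread_pool::Error::ThreadExited(Some(_)) => 3,
        // …and a panic outranks everything else.
        scoped_thread_pool::Error::Panic(_) => 4,
    }
}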

View File

@@ -11,7 +11,6 @@ use rstar::RTree;
use serde::{Deserialize, Serialize};
use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME};
use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
use crate::fields_ids_map::FieldsIdsMap;
@@ -75,7 +74,6 @@ pub mod main_key {
pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules";
pub const FACET_SEARCH: &str = "facet_search";
pub const PREFIX_SEARCH: &str = "prefix_search";
pub const DOCUMENTS_STATS: &str = "documents_stats";
}
pub mod db_name {
@@ -405,58 +403,6 @@ impl Index {
Ok(count.unwrap_or_default())
}
/// Updates the stats of the documents database based on the previous stats and the modified docids.
pub fn update_documents_stats(
&self,
wtxn: &mut RwTxn<'_>,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
let before_rtxn = self.read_txn()?;
let document_stats = match self.documents_stats(&before_rtxn)? {
Some(before_stats) => DatabaseStats::recompute(
before_stats,
self.documents.remap_types(),
&before_rtxn,
wtxn,
modified_docids.iter().map(|docid| docid.to_be_bytes()),
)?,
None => {
// This should never happen when there are already documents in the index, the documents stats should be present.
// If it happens, it means that the index was not properly initialized/upgraded.
debug_assert_eq!(
self.documents.len(&before_rtxn)?,
0,
"The documents stats should be present when there are documents in the index"
);
tracing::warn!("No documents stats found, creating new ones");
DatabaseStats::new(self.documents.remap_types(), &*wtxn)?
}
};
self.put_documents_stats(wtxn, document_stats)?;
Ok(())
}
/// Writes the stats of the documents database.
pub fn put_documents_stats(
&self,
wtxn: &mut RwTxn<'_>,
stats: DatabaseStats,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<DatabaseStats>>().put(
wtxn,
main_key::DOCUMENTS_STATS,
&stats,
)
}
/// Returns the stats of the documents database.
pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result<Option<DatabaseStats>> {
self.main
.remap_types::<Str, SerdeJson<DatabaseStats>>()
.get(rtxn, main_key::DOCUMENTS_STATS)
}
/* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id.
@@ -1842,6 +1788,7 @@ pub(crate) mod tests {
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
use crate::update::settings::InnerIndexSettings;
use crate::update::{
self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings,
@@ -1891,7 +1838,7 @@ pub(crate) mod tests {
) -> Result<(), crate::error::Error> {
let local_pool;
let indexer_config = &self.indexer_config;
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
@@ -1899,6 +1846,11 @@ pub(crate) mod tests {
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
};
let rtxn = self.inner.read_txn()?;
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
let mut new_fields_ids_map = db_fields_ids_map.clone();
@@ -1918,29 +1870,28 @@ pub(crate) mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
thread_pool,
CHUNK_SIZE,
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
return Err(error.into());
}
pool.install(|| {
indexer::index(
wtxn,
&self.inner,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false,
&Progress::default(),
)
})
.unwrap()?;
indexer::index(
wtxn,
&self.inner,
thread_pool,
&pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false,
&Progress::default(),
)?;
Ok(())
}
@@ -1979,7 +1930,7 @@ pub(crate) mod tests {
) -> Result<(), crate::error::Error> {
let local_pool;
let indexer_config = &self.indexer_config;
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
@@ -1987,6 +1938,11 @@ pub(crate) mod tests {
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
};
let rtxn = self.inner.read_txn()?;
let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
let mut new_fields_ids_map = db_fields_ids_map.clone();
@@ -2009,28 +1965,28 @@ pub(crate) mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
thread_pool,
CHUNK_SIZE,
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
return Err(error.into());
}
pool.install(|| {
indexer::index(
wtxn,
&self.inner,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false,
&Progress::default(),
)
})
.unwrap()?;
indexer::index(
wtxn,
&self.inner,
thread_pool,
&pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false,
&Progress::default(),
)?;
Ok(())
}
@@ -2059,7 +2015,7 @@ pub(crate) mod tests {
let local_pool;
let indexer_config = &index.indexer_config;
let pool = match &indexer_config.thread_pool {
let pool = match &indexer_config.rayon_thread_pool {
Some(pool) => pool,
None => {
local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
@@ -2067,6 +2023,11 @@ pub(crate) mod tests {
}
};
let thread_pool = match &indexer_config.thread_pool {
Some(thread_pool) => thread_pool,
None => &scoped_thread_pool::ThreadPool::with_available_parallelism("index".into()),
};
let rtxn = index.inner.read_txn().unwrap();
let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
let mut new_fields_ids_map = db_fields_ids_map.clone();
@@ -2090,6 +2051,8 @@ pub(crate) mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
thread_pool,
CHUNK_SIZE,
)
.unwrap();
@@ -2100,7 +2063,8 @@ pub(crate) mod tests {
indexer::index(
&mut wtxn,
&index.inner,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
thread_pool,
&pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,


@@ -10,7 +10,6 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod database_stats;
mod error;
mod external_documents_ids;
pub mod facet;


@@ -215,7 +215,7 @@ pub fn partially_initialized_term_from_word(
let mut zero_typo = None;
let mut prefix_of = BTreeSet::new();
if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() {
if fst.contains(word) {
zero_typo = Some(word_interned);
}


@@ -7,6 +7,7 @@ use maplit::{btreemap, hashset};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use crate::vector::EmbeddingConfigs;
use crate::{db_snap, Criterion, Index};
@@ -65,6 +66,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
// index documents
indexer.add_documents(&payload).unwrap();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let indexer_alloc = Bump::new();
let (document_changes, operation_stats, primary_key) = indexer
.into_changes(
@@ -75,6 +79,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
@@ -85,6 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,


@@ -28,7 +28,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
) -> Result<grenad::Reader<BufReader<File>>> {
let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let force_reindexing = settings_diff.reindex_searchable();
// initialize destination values.


@@ -23,7 +23,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut facet_number_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,


@@ -55,7 +55,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
localized_field_ids: &LocalizedFieldIds,
facet_search: bool,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,
@@ -145,7 +145,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable,


@@ -44,7 +44,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<ExtractedFacetValues> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter(
grenad::SortAlgorithm::Stable,


@@ -26,7 +26,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut fid_word_count_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,


@@ -35,7 +35,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut word_fid_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,


@@ -39,7 +39,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
.map(|_| {
create_sorter(


@@ -24,7 +24,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
let max_memory = indexer.max_memory_by_thread();
let max_memory = indexer.max_memory_by_rayon_thread();
let mut word_position_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable,


@@ -119,7 +119,11 @@ impl GrenadParameters {
///
/// This should be called from inside a rayon thread pool;
/// otherwise, it will use the global number of threads.
pub fn max_memory_by_thread(&self) -> Option<usize> {
pub fn max_memory_by_thread(&self, thread_count: usize) -> Option<usize> {
self.max_memory.map(|max_memory| (max_memory / thread_count))
}
pub fn max_memory_by_rayon_thread(&self) -> Option<usize> {
self.max_memory.map(|max_memory| (max_memory / rayon::current_num_threads()))
}
}
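The split is plain integer division of the global budget, either by an explicit thread count (scoped pool) or by rayon's current number of threads. A standalone sketch of the arithmetic (helper and test names are hypothetical):

    // Hypothetical helper mirroring the division performed by both methods above.
    fn per_thread_budget(max_memory: Option<usize>, thread_count: usize) -> Option<usize> {
        max_memory.map(|max| max / thread_count)
    }

    #[test]
    fn one_gib_across_eight_threads() {
        // A 1 GiB budget split across 8 indexing threads leaves 128 MiB per sorter.
        assert_eq!(per_thread_budget(Some(1 << 30), 8), Some(128 << 20));
    }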


@@ -227,7 +227,7 @@ where
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
let backup_pool;
let pool = match self.indexer_config.thread_pool {
let pool = match self.indexer_config.rayon_thread_pool {
Some(ref pool) => pool,
None => {
// We initialize a backup pool with the default
@@ -307,7 +307,6 @@ where
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
let mut modified_docids = RoaringBitmap::new();
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
@@ -368,7 +367,7 @@ where
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
@@ -468,10 +467,6 @@ where
Ok(())
}).map_err(InternalError::from)??;
if !settings_diff.settings_update_only {
// Update the stats of the documents database when there is a document update.
self.index.update_documents_stats(self.wtxn, modified_docids)?;
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@@ -775,6 +770,7 @@ mod tests {
use crate::progress::Progress;
use crate::search::TermsMatchingStrategy;
use crate::update::new::indexer;
use crate::update::new::indexer::document_changes::CHUNK_SIZE;
use crate::update::Setting;
use crate::{db_snap, Filter, Search, UserError};
@@ -1972,6 +1968,8 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string()),
CHUNK_SIZE,
)
.unwrap();
@@ -2120,6 +2118,9 @@ mod tests {
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
indexer.add_documents(&documents).unwrap();
indexer.delete_documents(&["2"]);
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let (document_changes, _operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
@@ -2129,12 +2130,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2182,6 +2186,9 @@ mod tests {
let indexer_alloc = Bump::new();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let (document_changes, _operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
@@ -2191,12 +2198,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2234,6 +2244,8 @@ mod tests {
let embedders = EmbeddingConfigs::default();
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
indexer.add_documents(&documents).unwrap();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let (document_changes, _operation_stats, primary_key) = indexer
.into_changes(
@@ -2244,12 +2256,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2296,12 +2311,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2332,6 +2350,8 @@ mod tests {
let indexer_alloc = Bump::new();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
indexer.delete_documents(&["1", "2"]);
@@ -2350,12 +2370,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2387,6 +2410,8 @@ mod tests {
let indexer_alloc = Bump::new();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
indexer.delete_documents(&["1", "2", "1", "2"]);
@@ -2409,12 +2434,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2445,6 +2473,8 @@ mod tests {
let indexer_alloc = Bump::new();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::UpdateDocuments);
let documents = documents!([
@@ -2461,12 +2491,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2513,12 +2546,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2688,6 +2724,8 @@ mod tests {
let indexer_alloc = Bump::new();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
// OP
@@ -2707,12 +2745,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2766,12 +2807,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,
@@ -2822,12 +2866,15 @@ mod tests {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index.inner,
&thread_pool,
&crate::ThreadPoolNoAbortBuilder::new().build().unwrap(),
indexer_config.grenad_parameters(),
&db_fields_ids_map,


@@ -129,7 +129,6 @@ pub(crate) fn write_typed_chunk_into_index(
index: &Index,
settings_diff: &InnerIndexSettingsDiff,
typed_chunks: Vec<TypedChunk>,
modified_docids: &mut RoaringBitmap,
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunks[0] {
@@ -215,7 +214,6 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Create,
});
docids.insert(docid);
modified_docids.insert(docid);
} else {
db.delete(wtxn, &docid)?;
operations.push(DocumentOperation {
@@ -224,7 +222,6 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Delete,
});
docids.remove(docid);
modified_docids.insert(docid);
}
}
let external_documents_docids = index.external_documents_ids();


@@ -11,7 +11,8 @@ pub struct IndexerConfig {
pub max_memory: Option<usize>,
pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>,
pub thread_pool: Option<ThreadPoolNoAbort>,
pub rayon_thread_pool: Option<ThreadPoolNoAbort>,
pub thread_pool: Option<scoped_thread_pool::ThreadPool<crate::Error>>,
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
}
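Both pools are optional and coexist during the transition: ThreadPoolNoAbort is still expected wherever the rayon-based code paths run, while the scoped pool drives the new Workload-based extractors. A construction sketch reusing the constructor calls that appear in the tests of this diff (the `..Default::default()` relies on the impl below):

    fn config_with_both_pools() -> IndexerConfig {
        IndexerConfig {
            // Legacy rayon pool, still consumed where a `ThreadPoolNoAbort` is expected.
            rayon_thread_pool: Some(crate::ThreadPoolNoAbortBuilder::new().build().unwrap()),
            // New scoped pool used by the `Workload`-based extraction pipeline.
            thread_pool: Some(scoped_thread_pool::ThreadPool::with_available_parallelism(
                "index".to_string(),
            )),
            ..Default::default()
        }
    }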
@@ -36,6 +37,7 @@ impl Default for IndexerConfig {
max_memory: None,
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
rayon_thread_pool: None,
thread_pool: None,
max_positions_per_attributes: None,
skip_index_budget: false,


@@ -51,12 +51,13 @@ const MAX_FRAME_HEADER_SIZE: usize = 9;
/// when new stuff is available in any BBQueue buffer but we send
/// a message in this queue only if it is empty to avoid filling
/// the channel *and* the BBQueue.
pub fn extractor_writer_bbqueue(
bbbuffers: &mut Vec<BBBuffer>,
pub fn extractor_writer_bbqueue<'a>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
bbbuffers: &'a mut Vec<BBBuffer>,
total_bbbuffer_capacity: usize,
channel_capacity: usize,
) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) {
let current_num_threads = rayon::current_num_threads();
) -> (ExtractorBbqueueSender<'a>, WriterBbqueueReceiver<'a>) {
let current_num_threads = thread_pool.thread_count();
let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap();
bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity));
@@ -66,12 +67,18 @@ pub fn extractor_writer_bbqueue(
let max_grant = capacity.saturating_div(2).checked_sub(MAX_FRAME_HEADER_SIZE).unwrap();
let producers = ThreadLocal::with_capacity(bbbuffers.len());
let consumers = rayon::broadcast(|bi| {
let bbqueue = &bbbuffers[bi.index()];
let (producer, consumer) = bbqueue.try_split_framed().unwrap();
producers.get_or(|| FullySend(RefCell::new(producer)));
consumer
});
let consumers = ThreadLocal::with_capacity(bbbuffers.len());
thread_pool
.broadcast(|thread_index| {
let bbqueue: &BBBuffer = &bbbuffers[thread_index];
let (producer, consumer) = bbqueue.try_split_framed().unwrap();
producers.get_or(|| FullySend(RefCell::new(producer)));
consumers.get_or(|| FullySend(consumer));
Ok(())
})
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))
.unwrap();
let consumers: Vec<_> = consumers.into_iter().map(|consumer| consumer.0).collect();
let sent_messages_attempts = Arc::new(AtomicUsize::new(0));
let blocking_sent_messages_attempts = Arc::new(AtomicUsize::new(0));
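`broadcast` replaces `rayon::broadcast` here: the closure runs once on every worker of the scoped pool, which is what makes it suitable for installing one bbqueue producer/consumer pair per thread. A stripped-down sketch of the same pattern, using only calls visible in this patch (`thread_count`, `broadcast`, `from_scoped_thread_pool_errors`); the function name is hypothetical:

    use std::sync::atomic::{AtomicUsize, Ordering};

    fn count_workers(thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>) -> usize {
        let ran = AtomicUsize::new(0);
        thread_pool
            .broadcast(|_thread_index| {
                // Executed once on each worker thread of the pool.
                ran.fetch_add(1, Ordering::Relaxed);
                Ok(())
            })
            .map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))
            .unwrap();
        // Every worker participated exactly once.
        ran.into_inner()
    }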
@@ -963,28 +970,70 @@ impl GeoSender<'_, '_> {
.map_err(|_| SendError(()))
}
pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> {
let database = Database::Main;
let value_length = bitmap.serialized_size();
let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes();
let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
InternalError::StorePut {
database_name: database.database_name(),
key: key.into(),
value_length,
error: MdbError::BadValSize.into(),
}
})?;
pub fn set_geo_faceted(
&self,
bitmap: &RoaringBitmap,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> crate::Result<()> {
let writer = GeoWriter { bitmap, channel: *self };
thread_pool
.execute(&writer)
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))
}
}
self.0.write_key_value_with(
struct GeoWriter<'a, 'b> {
bitmap: &'a RoaringBitmap,
channel: GeoSender<'a, 'b>,
}
impl<'a, 'b> scoped_thread_pool::Workload<'static> for GeoWriter<'a, 'b> {
type Context = ();
type Error = crate::Error;
fn context(
&self,
_thread_count: usize,
_thread_index: usize,
) -> Result<Self::Context, Self::Error> {
Ok(())
}
fn run_task(
&self,
_thread_count: usize,
thread_index: usize,
task_index: usize,
_context: &mut Self::Context,
) -> Option<Result<(), Self::Error>> {
if thread_index != 0 || task_index != 0 {
return None;
}
let database = Database::Main;
let value_length = self.bitmap.serialized_size();
let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes();
let key_length = match key.len().try_into().ok().and_then(NonZeroU16::new) {
Some(key_length) => key_length,
None => {
return Some(Err(InternalError::StorePut {
database_name: database.database_name(),
key: key.into(),
value_length,
error: MdbError::BadValSize.into(),
}
.into()))
}
};
Some(self.channel.0.write_key_value_with(
database,
key_length,
value_length,
|key_buffer, value_buffer| {
key_buffer.copy_from_slice(key);
bitmap.serialize_into(value_buffer)?;
self.bitmap.serialize_into(value_buffer)?;
Ok(())
},
)
))
}
}
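`Workload` is the scoped pool's unit of work: `context` builds one piece of per-thread state, and the pool then presumably keeps calling `run_task` with increasing `task_index` values until the workload answers `None`. `GeoWriter` exploits that contract by returning `None` everywhere except on thread 0, task 0, so the key is written exactly once. A minimal illustrative implementation under the same assumption (the `CountTo` type is hypothetical):

    // Each thread counts `self.0` tasks in its own context, then reports it is done.
    struct CountTo(usize);

    impl scoped_thread_pool::Workload<'static> for CountTo {
        type Context = usize; // per-thread tally
        type Error = crate::Error;

        fn context(
            &self,
            _thread_count: usize,
            _thread_index: usize,
        ) -> std::result::Result<Self::Context, Self::Error> {
            Ok(0)
        }

        fn run_task(
            &self,
            _thread_count: usize,
            _thread_index: usize,
            task_index: usize,
            context: &mut Self::Context,
        ) -> Option<std::result::Result<(), Self::Error>> {
            if task_index >= self.0 {
                return None; // this thread has no more tasks
            }
            *context += 1;
            Some(Ok(()))
        }
    }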


@@ -711,17 +711,15 @@ impl DelAddRoaringBitmap {
DelAddRoaringBitmap { del, add }
}
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) {
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
let DelAddRoaringBitmap { del, add } = self;
if let Some(del) = del {
*documents_ids -= del;
*modified_docids |= del;
}
if let Some(add) = add {
*documents_ids |= add;
*modified_docids |= add;
}
}
}
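The semantics of `apply_to` are: deletions are applied before additions, so a docid present in both `del` and `add` ends up in the final set. A worked example (assuming the `del`/`add` fields are reachable from the call site, as they are within the crate):

    #[test]
    fn apply_to_removes_then_adds() {
        use roaring::RoaringBitmap;

        let mut documents_ids: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
        let deladd = DelAddRoaringBitmap {
            del: Some([2u32, 3].into_iter().collect()),
            add: Some([3u32, 4].into_iter().collect()),
        };
        deladd.apply_to(&mut documents_ids);

        // 2 and 3 are removed first, then 3 and 4 are inserted back.
        let expected: RoaringBitmap = [1u32, 3, 4].into_iter().collect();
        assert_eq!(documents_ids, expected);
    }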


@@ -38,7 +38,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b>
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(RefCell::new(BalancedCaches::new_in(
self.buckets,
self.grenad_parameters.max_memory_by_thread(),
self.grenad_parameters.max_memory_by_thread(self.buckets),
extractor_alloc,
)))
}
@@ -388,6 +388,7 @@ fn truncate_str(s: &str) -> &str {
impl FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
@@ -412,10 +413,11 @@ impl FacetedDocidsExtractor {
let extractor = FacetedExtractorData {
attributes_to_extract: &attributes_to_extract,
grenad_parameters: indexing_context.grenad_parameters,
buckets: rayon::current_num_threads(),
buckets: thread_pool.thread_count(),
sender,
};
extract(
thread_pool,
document_changes,
&extractor,
indexing_context,


@@ -21,6 +21,7 @@ use crate::{lat_lng_to_xyz, DocumentId, GeoPoint, Index, InternalError, Result};
pub struct GeoExtractor {
grenad_parameters: GrenadParameters,
thread_count: usize,
}
impl GeoExtractor {
@@ -28,11 +29,12 @@ impl GeoExtractor {
rtxn: &RoTxn,
index: &Index,
grenad_parameters: GrenadParameters,
thread_count: usize,
) -> Result<Option<Self>> {
let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME);
if is_sortable || is_filterable {
Ok(Some(GeoExtractor { grenad_parameters }))
Ok(Some(GeoExtractor { grenad_parameters, thread_count }))
} else {
Ok(None)
}
@@ -157,7 +159,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
) -> Result<()> {
let rtxn = &context.rtxn;
let index = context.index;
let max_memory = self.grenad_parameters.max_memory_by_thread();
let max_memory = self.grenad_parameters.max_memory_by_thread(self.thread_count);
let db_fields_ids_map = context.db_fields_ids_map;
let mut data_ref = context.data.borrow_mut_or_yield();


@@ -5,7 +5,6 @@ mod geo;
mod searchable;
mod vectors;
use bumpalo::Bump;
pub use cache::{
merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap,
};
@@ -15,22 +14,6 @@ pub use geo::*;
pub use searchable::*;
pub use vectors::EmbeddingExtractor;
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
use super::steps::IndexingStep;
use super::thread_local::{FullySend, ThreadLocal};
use crate::Result;
pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync;
}
/// TODO move in permissive json pointer
pub mod perm_json_p {
use serde_json::{Map, Value};


@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
self.buckets,
self.grenad_parameters.max_memory_by_thread(),
self.grenad_parameters.max_memory_by_thread(self.buckets),
extractor_alloc,
))))
}
@@ -240,6 +240,7 @@ pub struct WordDocidsExtractors;
impl WordDocidsExtractors {
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
@@ -288,10 +289,11 @@ impl WordDocidsExtractors {
let extractor = WordDocidsExtractorData {
tokenizer: &document_tokenizer,
grenad_parameters: indexing_context.grenad_parameters,
buckets: rayon::current_num_threads(),
buckets: thread_pool.thread_count(),
};
extract(
thread_pool,
document_changes,
&extractor,
indexing_context,


@@ -2,29 +2,62 @@ use std::cell::RefCell;
use std::collections::VecDeque;
use std::rc::Rc;
use bumpalo::Bump;
use heed::RoTxn;
use super::tokenize_document::DocumentTokenizer;
use super::SearchableExtractor;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::new::document::Document;
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::indexer::document_changes::DocumentChangeContext;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
use crate::update::GrenadParameters;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
pub struct WordPairProximityDocidsExtractor;
impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractor<'a> {
type Data = RefCell<BalancedCaches<'extractor>>;
impl SearchableExtractor for WordPairProximityDocidsExtractor {
fn attributes_to_extract<'a>(
rtxn: &'a RoTxn,
index: &'a Index,
) -> Result<Option<Vec<&'a str>>> {
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(RefCell::new(BalancedCaches::new_in(
self.buckets,
self.grenad_parameters.max_memory_by_thread(self.buckets),
extractor_alloc,
)))
}
fn process<'doc>(
&self,
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
for change in changes {
let change = change?;
self.extract_document_change(context, change)?;
}
Ok(())
}
}
pub struct WordPairProximityDocidsExtractor<'a> {
tokenizer: &'a DocumentTokenizer<'a>,
grenad_parameters: &'a GrenadParameters,
buckets: usize,
}
impl<'a> WordPairProximityDocidsExtractor<'a> {
fn attributes_to_extract<'b>(
rtxn: &'b RoTxn,
index: &'b Index,
) -> Result<Option<Vec<&'b str>>> {
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
}
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
fn attributes_to_skip<'b>(_rtxn: &'b RoTxn, _index: &'b Index) -> Result<Vec<&'b str>> {
Ok(Vec::new())
}
@@ -32,10 +65,11 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
// and to store the docids of the documents that have a number of words in a given field
// equal to or under than MAX_COUNTED_WORDS.
fn extract_document_change(
&self,
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
document_tokenizer: &DocumentTokenizer,
document_change: DocumentChange,
) -> Result<()> {
let document_tokenizer = self.tokenizer;
let doc_alloc = &context.doc_alloc;
let index = context.index;
@@ -129,6 +163,70 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
}
Ok(())
}
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
{
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
let allowed_separators: Option<Vec<_>> =
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.build();
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
let localized_attributes_rules =
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tokenizer,
attribute_to_extract: attributes_to_extract.as_deref(),
attribute_to_skip: attributes_to_skip.as_slice(),
localized_attributes_rules: &localized_attributes_rules,
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
};
let extractor_data: WordPairProximityDocidsExtractor = WordPairProximityDocidsExtractor {
tokenizer: &document_tokenizer,
grenad_parameters: indexing_context.grenad_parameters,
buckets: thread_pool.thread_count(),
};
let datastore = ThreadLocal::new();
{
let span =
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
let _entered = span.enter();
extract(
thread_pool,
document_changes,
&extractor_data,
indexing_context,
extractor_allocs,
&datastore,
step,
)?;
}
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
}
}
fn build_key<'a>(


@@ -1,146 +1,5 @@
mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod tokenize_document;
use std::cell::RefCell;
use std::marker::PhantomData;
use bumpalo::Bump;
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use heed::RoTxn;
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
tokenizer: &'a DocumentTokenizer<'a>,
grenad_parameters: &'a GrenadParameters,
buckets: usize,
_ex: PhantomData<EX>,
}
impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
for SearchableExtractorData<'a, EX>
{
type Data = RefCell<BalancedCaches<'extractor>>;
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
Ok(RefCell::new(BalancedCaches::new_in(
self.buckets,
self.grenad_parameters.max_memory_by_thread(),
extractor_alloc,
)))
}
fn process<'doc>(
&self,
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
context: &DocumentChangeContext<Self::Data>,
) -> Result<()> {
for change in changes {
let change = change?;
EX::extract_document_change(context, self.tokenizer, change)?;
}
Ok(())
}
}
pub trait SearchableExtractor: Sized + Sync {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
{
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
let allowed_separators: Option<Vec<_>> =
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary = indexing_context.index.dictionary(&rtxn)?;
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder = tokenizer_builder(
stop_words.as_ref(),
allowed_separators.as_deref(),
dictionary.as_deref(),
);
let tokenizer = builder.build();
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
let localized_attributes_rules =
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
let document_tokenizer = DocumentTokenizer {
tokenizer: &tokenizer,
attribute_to_extract: attributes_to_extract.as_deref(),
attribute_to_skip: attributes_to_skip.as_slice(),
localized_attributes_rules: &localized_attributes_rules,
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
};
let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
tokenizer: &document_tokenizer,
grenad_parameters: indexing_context.grenad_parameters,
buckets: rayon::current_num_threads(),
_ex: PhantomData,
};
let datastore = ThreadLocal::new();
{
let span =
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
let _entered = span.enter();
extract(
document_changes,
&extractor_data,
indexing_context,
extractor_allocs,
&datastore,
step,
)?;
}
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
}
fn extract_document_change(
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
document_tokenizer: &DocumentTokenizer,
document_change: DocumentChange,
) -> Result<()>;
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
-> Result<Option<Vec<&'a str>>>;
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
}
impl<T: SearchableExtractor> DocidsExtractor for T {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
{
Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
}
}


@@ -1,15 +1,14 @@
use std::cell::{Cell, RefCell};
use std::sync::atomic::Ordering;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, RwLock};
use bumpalo::Bump;
use heed::RoTxn;
use rayon::iter::IndexedParallelIterator;
use super::super::document_change::DocumentChange;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::progress::{AtomicDocumentStep, Progress};
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::GrenadParameters;
@@ -114,7 +113,7 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
>: Sync {
type Item: Send;
fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>>;
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]>;
fn len(&self) -> usize;
@@ -186,9 +185,10 @@ where
}
}
const CHUNK_SIZE: usize = 100;
pub const CHUNK_SIZE: usize = 100;
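`CHUNK_SIZE` becomes public because callers now pass it to `into_changes`, which partitions the payload up front instead of chunking lazily through rayon. As a rough order of magnitude (assuming an even spread of chunks across workers):

    // 10_000 documents with CHUNK_SIZE = 100 gives 100 chunks,
    // i.e. about 13 tasks per worker on an 8-thread pool.
    const EXAMPLE_DOCS: usize = 10_000;
    const EXAMPLE_CHUNKS: usize = EXAMPLE_DOCS / CHUNK_SIZE; // 100
    const EXAMPLE_TASKS_PER_WORKER: usize = (EXAMPLE_CHUNKS + 7) / 8; // 13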
pub fn extract<
struct Extract<
'shared, // covariant lifetime for shared borrows
'pl, // covariant lifetime of the underlying payload
'extractor, // invariant lifetime of extractor_alloc
'fid, // invariant lifetime of fields ids map
@@ -196,31 +196,121 @@ pub fn extract<
'data, // invariant on EX::Data lifetime of datastore
'index, // covariant lifetime of the index
EX,
DC,
MSP,
> where
DC: DocumentChanges<'pl>,
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
{
document_changes: &'shared DC,
extractor: &'shared EX,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
datastore: &'data ThreadLocal<EX::Data>,
step: Arc<AtomicU32>,
_marker: PhantomData<&'pl ()>,
}
impl<
'doc,
'extractor: 'doc, // invariant lifetime of extractor_alloc
'shared,
'pl, // covariant lifetime of the underlying payload
'fid: 'doc, // invariant lifetime of fields ids map
'indexer: 'doc, // covariant lifetime of objects that are borrowed during the entire indexing
'data: 'doc, // invariant on EX::Data lifetime of datastore
'index: 'doc + 'indexer, // covariant lifetime of the index
EX,
DC: DocumentChanges<'pl>,
MSP,
> scoped_thread_pool::Workload<'doc>
for Extract<'shared, 'pl, 'extractor, 'fid, 'indexer, 'data, 'index, EX, DC, MSP>
where
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
{
type Context = DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, EX::Data>;
type Error = crate::Error;
fn context(
&self,
_thread_count: usize,
_thread_index: usize,
) -> std::result::Result<
DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, EX::Data>,
Self::Error,
> {
let extractor = self.extractor;
DocumentChangeContext::new(
self.indexing_context.index,
self.indexing_context.db_fields_ids_map,
self.indexing_context.new_fields_ids_map,
self.extractor_allocs,
self.indexing_context.doc_allocs,
self.datastore,
self.indexing_context.fields_ids_map_store,
move |index_alloc| extractor.init_data(index_alloc),
)
}
fn run_task(
&self,
_thread_count: usize,
thread_index: usize,
task_index: usize,
context: &mut Self::Context,
) -> Option<std::result::Result<(), Self::Error>> {
let items = self.document_changes.items(thread_index, task_index)?;
if (self.indexing_context.must_stop_processing)() {
return Some(Err(InternalError::AbortedIndexation.into()));
}
// Clean up and reuse the document-specific allocator
context.doc_alloc.reset();
let changes = items.iter().filter_map(|item| {
self.document_changes.item_to_document_change(context, item).transpose()
});
let res = self.extractor.process(changes, context);
self.step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
// Send the doc_alloc back to the pool
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
Some(res)
}
}
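Each worker builds one `DocumentChangeContext` through `context`, then pulls chunks by calling `items(thread_index, task_index)` until it gets `None`. The driver loop below is an assumption about what the pool does with a `Workload` (it is not code from the `scoped_thread_pool` crate), but it is the contract this implementation relies on:

    // Hypothetical per-worker driver loop, shown only to make the contract explicit.
    fn drive<'doc, W: scoped_thread_pool::Workload<'doc>>(
        workload: &W,
        thread_count: usize,
        thread_index: usize,
    ) -> std::result::Result<(), W::Error> {
        let mut context = workload.context(thread_count, thread_index)?;
        let mut task_index = 0;
        // `None` means this worker has drained its share of the work.
        while let Some(result) = workload.run_task(thread_count, thread_index, task_index, &mut context) {
            result?;
            task_index += 1;
        }
        Ok(())
    }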
pub fn extract<
'pool, // invariant lifetime of the thread pool
'pl, // covariant lifetime of the underlying payload
'extractor, // invariant lifetime of extractor_alloc
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing
'data, // invariant on EX::Data lifetime of datastore
'index, // covariant lifetime of the index
EX,
DC,
MSP,
>(
thread_pool: &'pool scoped_thread_pool::ThreadPool<crate::Error>,
document_changes: &DC,
extractor: &EX,
IndexingContext {
index,
db_fields_ids_map,
new_fields_ids_map,
doc_allocs,
fields_ids_map_store,
must_stop_processing,
progress,
grenad_parameters: _,
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
datastore: &'data ThreadLocal<EX::Data>,
step: IndexingStep,
) -> Result<()>
where
DC: DocumentChanges<'pl>,
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
{
tracing::trace!("We are resetting the extractor allocators");
progress.update_progress(step);
indexing_context.progress.update_progress(step);
// Clean up and reuse the extractor allocs
for extractor_alloc in extractor_allocs.iter_mut() {
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
@@ -229,45 +319,22 @@ where
let total_documents = document_changes.len() as u32;
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
progress.update_progress(progress_step);
indexing_context.progress.update_progress(progress_step);
let pi = document_changes.iter(CHUNK_SIZE);
pi.try_arc_for_each_try_init(
|| {
DocumentChangeContext::new(
index,
db_fields_ids_map,
new_fields_ids_map,
extractor_allocs,
doc_allocs,
datastore,
fields_ids_map_store,
move |index_alloc| extractor.init_data(index_alloc),
)
},
|context, items| {
if (must_stop_processing)() {
return Err(Arc::new(InternalError::AbortedIndexation.into()));
}
let extract = Extract {
document_changes,
extractor,
indexing_context,
extractor_allocs,
datastore,
step,
_marker: PhantomData,
};
thread_pool
.execute(&extract)
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))?;
// Clean up and reuse the document-specific allocator
context.doc_alloc.reset();
let items = items.as_ref();
let changes = items.iter().filter_map(|item| {
document_changes.item_to_document_change(context, item).transpose()
});
let res = extractor.process(changes, context).map_err(Arc::new);
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
// send back the doc_alloc in the pool
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
res
},
)?;
step.store(total_documents, Ordering::Relaxed);
extract.step.store(total_documents, Ordering::Relaxed);
Ok(())
}


@@ -1,8 +1,7 @@
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _;
use roaring::RoaringBitmap;
use scoped_thread_pool::PartitionChunks;
use super::document_changes::{DocumentChangeContext, DocumentChanges};
use crate::documents::PrimaryKey;
@@ -28,31 +27,28 @@ impl DocumentDeletion {
self,
indexer_alloc: &'indexer Bump,
primary_key: PrimaryKey<'indexer>,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
chunk_size: usize,
) -> DocumentDeletionChanges<'indexer> {
let to_delete: bumpalo::collections::Vec<_> =
self.to_delete.into_iter().collect_in(indexer_alloc);
let to_delete = to_delete.into_bump_slice();
let to_delete = PartitionChunks::new(to_delete, chunk_size, thread_pool.thread_count());
DocumentDeletionChanges { to_delete, primary_key }
}
}
pub struct DocumentDeletionChanges<'indexer> {
to_delete: &'indexer [DocumentId],
to_delete: scoped_thread_pool::PartitionChunks<'indexer, DocumentId>,
primary_key: PrimaryKey<'indexer>,
}
impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
type Item = DocumentId;
fn iter(
&self,
chunk_size: usize,
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
self.to_delete.par_chunks(chunk_size)
}
fn item_to_document_change<
'doc, // lifetime of a single `process` call
T: MostlySend,
@@ -78,7 +74,11 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
}
fn len(&self) -> usize {
self.to_delete.len()
self.to_delete.slice().len()
}
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
self.to_delete.partition(thread_index, task_index)
}
}
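`PartitionChunks` replaces the `par_chunks` iterator: `slice()` exposes the whole backing slice (hence `len`), while `partition(thread_index, task_index)` hands each worker its pre-assigned chunks one by one. A consumption sketch, under the assumption that a worker asks for increasing task indices until it gets `None` (the helper name is hypothetical):

    fn chunks_for_thread<'a>(
        to_delete: &'a scoped_thread_pool::PartitionChunks<'a, DocumentId>,
        thread_index: usize,
    ) -> Vec<&'a [DocumentId]> {
        let mut chunks = Vec::new();
        let mut task_index = 0;
        while let Some(chunk) = to_delete.partition(thread_index, task_index) {
            chunks.push(chunk);
            task_index += 1;
        }
        chunks
    }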
@@ -86,6 +86,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
mod test {
use std::cell::RefCell;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::sync::RwLock;
use bumpalo::Bump;
@@ -94,7 +95,7 @@ mod test {
use crate::index::tests::TempIndex;
use crate::progress::Progress;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, Extractor, IndexingContext,
extract, DocumentChangeContext, Extractor, IndexingContext, CHUNK_SIZE,
};
use crate::update::new::indexer::DocumentDeletion;
use crate::update::new::steps::IndexingStep;
@@ -135,6 +136,9 @@ mod test {
}
}
let thread_pool =
scoped_thread_pool::ThreadPool::new(NonZeroUsize::new(1).unwrap(), "test".into());
let mut deletions = DocumentDeletion::new();
deletions.delete_documents_by_docids(Vec::<u32>::new().into_iter().collect());
let indexer = Bump::new();
@@ -155,8 +159,12 @@ mod test {
let deletion_tracker = TrackDeletion(PhantomData);
let changes = deletions
.into_changes(&indexer, crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 });
let changes = deletions.into_changes(
&indexer,
crate::documents::PrimaryKey::Flat { name: "id", field_id: 0 },
&thread_pool,
CHUNK_SIZE,
);
let context = IndexingContext {
index: &index,
@@ -173,6 +181,7 @@ mod test {
let datastore = ThreadLocal::new();
extract(
&thread_pool,
&changes,
&deletion_tracker,
context,


@@ -6,8 +6,8 @@ use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry;
use heed::RoTxn;
use memmap2::Mmap;
use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use scoped_thread_pool::PartitionChunks;
use serde_json::value::RawValue;
use serde_json::Deserializer;
@@ -57,6 +57,8 @@ impl<'pl> DocumentOperation<'pl> {
new_fields_ids_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
progress: Progress,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
chunk_size: usize,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
@@ -130,6 +132,8 @@ impl<'pl> DocumentOperation<'pl> {
docids_version_offsets.sort_unstable_by_key(|(_, po)| method.sort_key(&po.operations));
let docids_version_offsets = docids_version_offsets.into_bump_slice();
let docids_version_offsets =
PartitionChunks::new(docids_version_offsets, chunk_size, thread_pool.thread_count());
Ok((DocumentOperationChanges { docids_version_offsets }, operations_stats, primary_key))
}
}
@@ -353,13 +357,6 @@ fn merge_version_offsets<'s, 'pl>(
impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
type Item = (&'pl str, PayloadOperations<'pl>);
fn iter(
&self,
chunk_size: usize,
) -> impl rayon::prelude::IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
self.docids_version_offsets.par_chunks(chunk_size)
}
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
&'doc self,
context: &'doc DocumentChangeContext<T>,
@@ -379,12 +376,16 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {
}
fn len(&self) -> usize {
self.docids_version_offsets.len()
self.docids_version_offsets.slice().len()
}
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
self.docids_version_offsets.partition(thread_index, task_index)
}
}
pub struct DocumentOperationChanges<'pl> {
docids_version_offsets: &'pl [(&'pl str, PayloadOperations<'pl>)],
docids_version_offsets: PartitionChunks<'pl, (&'pl str, PayloadOperations<'pl>)>,
}
pub enum Payload<'pl> {


@@ -22,6 +22,7 @@ use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
#[allow(clippy::too_many_arguments)]
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
document_changes: &DC,
indexing_context: IndexingContext<MSP>,
indexer_span: Span,
@@ -32,7 +33,6 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
field_distribution: &mut BTreeMap<String, u64>,
mut index_embeddings: Vec<IndexEmbeddingConfig>,
document_ids: &mut RoaringBitmap,
modified_docids: &mut RoaringBitmap,
) -> Result<(FacetFieldIdsDelta, Vec<IndexEmbeddingConfig>)>
where
DC: DocumentChanges<'pl>,
@@ -48,11 +48,12 @@ where
// document but we need to create a function that collects and compresses documents.
let document_sender = extractor_sender.documents();
let document_extractor = DocumentsExtractor::new(document_sender, embedders);
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
let datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
{
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents");
let _entered = span.enter();
extract(
thread_pool,
document_changes,
&document_extractor,
indexing_context,
@@ -71,7 +72,7 @@ where
// adding the delta should never cause a negative result, as we are removing fields that previously existed.
*current = current.saturating_add_signed(delta);
}
document_extractor_data.docids_delta.apply_to(document_ids, modified_docids);
document_extractor_data.docids_delta.apply_to(document_ids);
}
field_distribution.retain(|_, v| *v != 0);
@@ -85,6 +86,7 @@ where
let _entered = span.enter();
FacetedDocidsExtractor::run_extraction(
thread_pool,
document_changes,
indexing_context,
extractor_allocs,
@@ -98,6 +100,7 @@ where
let _entered = span.enter();
facet_field_ids_delta = merge_and_send_facet_docids(
thread_pool,
caches,
FacetDatabases::new(index),
index,
@@ -119,6 +122,7 @@ where
let _entered = span.enter();
WordDocidsExtractors::run_extraction(
thread_pool,
document_changes,
indexing_context,
extractor_allocs,
@@ -130,6 +134,7 @@ where
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
word_docids,
index.word_docids.remap_types(),
index,
@@ -143,6 +148,7 @@ where
tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
word_fid_docids,
index.word_fid_docids.remap_types(),
index,
@@ -156,6 +162,7 @@ where
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
exact_word_docids,
index.exact_word_docids.remap_types(),
index,
@@ -169,6 +176,7 @@ where
tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
word_position_docids,
index.word_position_docids.remap_types(),
index,
@@ -182,6 +190,7 @@ where
tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
fid_word_count_docids,
index.field_id_word_count_docids.remap_types(),
index,
@@ -199,7 +208,8 @@ where
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
let _entered = span.enter();
<WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
WordPairProximityDocidsExtractor::run_extraction(
thread_pool,
document_changes,
indexing_context,
extractor_allocs,
@@ -212,6 +222,7 @@ where
let _entered = span.enter();
merge_and_send_docids(
thread_pool,
caches,
index.word_pair_proximity_docids.remap_types(),
index,
@@ -233,12 +244,13 @@ where
field_distribution,
request_threads(),
);
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
let mut datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
{
let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
let _entered = span.enter();
extract(
thread_pool,
document_changes,
&extractor,
indexing_context,
@@ -257,24 +269,30 @@ where
let Some(deladd) = data.remove(&config.name) else {
continue 'data;
};
deladd.apply_to(&mut config.user_provided, modified_docids);
deladd.apply_to(&mut config.user_provided);
}
}
}
}
'geo: {
let Some(extractor) = GeoExtractor::new(&rtxn, index, *indexing_context.grenad_parameters)?
let Some(extractor) = GeoExtractor::new(
&rtxn,
index,
*indexing_context.grenad_parameters,
thread_pool.thread_count(),
)?
else {
break 'geo;
};
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
let datastore = ThreadLocal::with_capacity(thread_pool.thread_count());
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
let _entered = span.enter();
extract(
thread_pool,
document_changes,
&extractor,
indexing_context,
@@ -290,6 +308,7 @@ where
index,
extractor_sender.geo(),
&indexing_context.must_stop_processing,
thread_pool,
)?;
}
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);


@@ -44,6 +44,7 @@ static LOG_MEMORY_METRICS_ONCE: Once = Once::new();
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
wtxn: &mut RwTxn,
index: &'index Index,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
pool: &ThreadPoolNoAbort,
grenad_parameters: GrenadParameters,
db_fields_ids_map: &'indexer FieldsIdsMap,
@@ -104,16 +105,15 @@ where
);
});
let (extractor_sender, writer_receiver) = pool
.install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000))
.unwrap();
let (extractor_sender, writer_receiver) =
extractor_writer_bbqueue(thread_pool, &mut bbbuffers, total_bbbuffer_capacity, 1000);
let metadata_builder = MetadataBuilder::from_index(index, wtxn)?;
let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder);
let new_fields_ids_map = RwLock::new(new_fields_ids_map);
let fields_ids_map_store = ThreadLocal::with_capacity(rayon::current_num_threads());
let mut extractor_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());
let doc_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());
let fields_ids_map_store = ThreadLocal::with_capacity(thread_pool.thread_count());
let mut extractor_allocs = ThreadLocal::with_capacity(thread_pool.thread_count());
let doc_allocs = ThreadLocal::with_capacity(thread_pool.thread_count());
let indexing_context = IndexingContext {
index,
@@ -129,7 +129,6 @@ where
let index_embeddings = index.embedding_configs(wtxn)?;
let mut field_distribution = index.field_distribution(wtxn)?;
let mut document_ids = index.documents_ids(wtxn)?;
let mut modified_docids = roaring::RoaringBitmap::new();
thread::scope(|s| -> Result<()> {
let indexer_span = tracing::Span::current();
@@ -138,25 +137,21 @@ where
// prevent moving the field_distribution and document_ids in the inner closure...
let field_distribution = &mut field_distribution;
let document_ids = &mut document_ids;
let modified_docids = &mut modified_docids;
let extractor_handle =
Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.install(move || {
extract::extract_all(
document_changes,
indexing_context,
indexer_span,
extractor_sender,
embedders,
&mut extractor_allocs,
finished_extraction,
field_distribution,
index_embeddings,
document_ids,
modified_docids,
)
})
.unwrap()
extract::extract_all(
thread_pool,
document_changes,
indexing_context,
indexer_span,
extractor_sender,
embedders,
&mut extractor_allocs,
finished_extraction,
field_distribution,
index_embeddings,
document_ids,
)
})?;
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
@@ -209,6 +204,7 @@ where
wtxn,
global_fields_ids_map,
facet_field_ids_delta,
thread_pool,
)?;
indexing_context.progress.update_progress(IndexingStep::Finalizing);
@@ -228,7 +224,6 @@ where
embedders,
field_distribution,
document_ids,
modified_docids,
)?;
Ok(())


@@ -1,8 +1,8 @@
use std::ops::DerefMut;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use rustc_hash::FxBuildHasher;
use scoped_thread_pool::ThreadPool;
use serde_json::value::RawValue;
use super::document_changes::{DocumentChangeContext, DocumentChanges};
@@ -14,45 +14,34 @@ use crate::update::new::thread_local::MostlySend;
use crate::update::new::{DocumentChange, Insertion};
use crate::{Error, InternalError, Result, UserError};
pub struct PartialDump<I> {
iter: I,
}
pub struct PartialDump;
impl<I> PartialDump<I> {
pub fn new_from_jsonlines(iter: I) -> Self {
PartialDump { iter }
impl PartialDump {
pub fn new_from_jsonlines() -> Self {
PartialDump
}
pub fn into_changes<'index>(
self,
concurrent_available_ids: &'index ConcurrentAvailableIds,
primary_key: &'index PrimaryKey,
) -> PartialDumpChanges<'index, I> {
_thread_pool: &ThreadPool<crate::Error>,
_chunk_size: usize,
) -> PartialDumpChanges<'index> {
// Note for future self:
// - We recommend sending chunks of documents in this `PartialDumpIndexer`; we therefore need to create a custom take_while_size method (one that doesn't drop items).
PartialDumpChanges { iter: self.iter, concurrent_available_ids, primary_key }
PartialDumpChanges { concurrent_available_ids, primary_key }
}
}
pub struct PartialDumpChanges<'doc, I> {
iter: I,
pub struct PartialDumpChanges<'doc> {
concurrent_available_ids: &'doc ConcurrentAvailableIds,
primary_key: &'doc PrimaryKey<'doc>,
}
impl<'index, Iter> DocumentChanges<'index> for PartialDumpChanges<'index, Iter>
where
Iter: IndexedParallelIterator<Item = Box<RawValue>> + Clone + Sync + 'index,
{
impl<'index> DocumentChanges<'index> for PartialDumpChanges<'index> {
type Item = Box<RawValue>;
fn iter(
&self,
chunk_size: usize,
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
self.iter.clone().chunks(chunk_size)
}
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
&'doc self,
context: &'doc DocumentChangeContext<T>,
@@ -85,6 +74,10 @@ where
}
fn len(&self) -> usize {
self.iter.len()
unimplemented!()
}
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
unimplemented!()
}
}
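The `DocumentChanges` implementation above no longer exposes a rayon `IndexedParallelIterator`; workers instead appear to pull pre-partitioned slices through `items(thread_index, task_index)`, with `None` signalling that a worker has run out of tasks. Below is one plausible reading of that contract with a std-only trait and a sequential driver loop; the trait name, the round-robin chunk layout, and the driver are assumptions for illustration, not the actual milli code.

    /// Hypothetical reading of the new `DocumentChanges`-style contract:
    /// items are served as slices addressed by (thread_index, task_index).
    trait ChunkedChanges {
        type Item;
        fn len(&self) -> usize;
        /// `None` means this worker has no task with that index.
        fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]>;
    }

    /// A slice split into fixed-size chunks, dealt out to workers round-robin:
    /// chunk k goes to worker k % threads as its (k / threads)-th task.
    struct SliceChanges<'a, T> {
        items: &'a [T],
        chunk_size: usize,
        threads: usize,
    }

    impl<'a, T> ChunkedChanges for SliceChanges<'a, T> {
        type Item = T;

        fn len(&self) -> usize {
            self.items.len()
        }

        fn items(&self, thread_index: usize, task_index: usize) -> Option<&[T]> {
            let chunk = task_index * self.threads + thread_index;
            self.items.chunks(self.chunk_size).nth(chunk)
        }
    }

    fn main() {
        let docs: Vec<u32> = (0..10).collect();
        let changes = SliceChanges { items: &docs, chunk_size: 3, threads: 2 };
        println!("{} documents in total", changes.len());

        // What each worker's loop would look like: keep asking for the next
        // task until the source reports there is nothing left for it.
        for thread_index in 0..changes.threads {
            let mut task_index = 0;
            while let Some(chunk) = changes.items(thread_index, task_index) {
                println!("worker {thread_index}, task {task_index}: {chunk:?}");
                task_index += 1;
            }
        }
    }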

View File

@@ -27,6 +27,7 @@ pub(super) fn post_process<MSP>(
wtxn: &mut RwTxn<'_>,
global_fields_ids_map: GlobalFieldsIdsMap<'_>,
facet_field_ids_delta: FacetFieldIdsDelta,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()>
where
MSP: Fn() -> bool + Sync,
@@ -39,7 +40,13 @@ where
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?;
compute_prefix_database(
index,
wtxn,
prefix_delta,
indexing_context.grenad_parameters,
thread_pool,
)?;
};
Ok(())
}
@@ -50,16 +57,38 @@ fn compute_prefix_database(
wtxn: &mut RwTxn,
prefix_delta: PrefixDelta,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
let PrefixDelta { modified, deleted } = prefix_delta;
// Compute word prefix docids
compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters, thread_pool)?;
// Compute exact word prefix docids
compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
compute_exact_word_prefix_docids(
wtxn,
index,
&modified,
&deleted,
grenad_parameters,
thread_pool,
)?;
// Compute word prefix fid docids
compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
compute_word_prefix_fid_docids(
wtxn,
index,
&modified,
&deleted,
grenad_parameters,
thread_pool,
)?;
// Compute word prefix position docids
compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters)
compute_word_prefix_position_docids(
wtxn,
index,
&modified,
&deleted,
grenad_parameters,
thread_pool,
)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing")]

View File

@@ -1,9 +1,10 @@
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _;
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;
use scoped_thread_pool::{PartitionChunks, ThreadPool};
use super::document_changes::DocumentChangeContext;
use super::DocumentChanges;
@@ -22,14 +23,12 @@ pub struct UpdateByFunction {
code: String,
}
pub struct UpdateByFunctionChanges<'doc> {
primary_key: &'doc PrimaryKey<'doc>,
pub struct UpdateByFunctionChanges<'index> {
primary_key: &'index PrimaryKey<'index>,
engine: Engine,
ast: AST,
context: Option<Dynamic>,
// It is sad that the RoaringBitmap doesn't
// implement IndexedParallelIterator
documents: Vec<u32>,
documents: PartitionChunks<'index, u32>,
}
impl UpdateByFunction {
@@ -40,6 +39,9 @@ impl UpdateByFunction {
pub fn into_changes<'index>(
self,
primary_key: &'index PrimaryKey,
allocator: &'index Bump,
thread_pool: &ThreadPool<crate::Error>,
chunk_size: usize,
) -> Result<UpdateByFunctionChanges<'index>> {
let Self { documents, context, code } = self;
@@ -64,26 +66,19 @@ impl UpdateByFunction {
None => None,
};
Ok(UpdateByFunctionChanges {
primary_key,
engine,
ast,
context,
documents: documents.into_iter().collect(),
})
let documents: bumpalo::collections::Vec<'_, _> =
documents.into_iter().collect_in(allocator);
let documents = documents.into_bump_slice();
let documents = PartitionChunks::new(documents, chunk_size, thread_pool.thread_count());
Ok(UpdateByFunctionChanges { primary_key, engine, ast, context, documents })
}
}
impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
type Item = u32;
fn iter(
&self,
chunk_size: usize,
) -> impl IndexedParallelIterator<Item = impl AsRef<[Self::Item]>> {
self.documents.as_slice().par_chunks(chunk_size)
}
fn item_to_document_change<'doc, T: MostlySend + 'doc>(
&self,
context: &'doc DocumentChangeContext<T>,
@@ -185,7 +180,11 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
}
fn len(&self) -> usize {
self.documents.len()
self.documents.slice().len()
}
fn items(&self, thread_index: usize, task_index: usize) -> Option<&[Self::Item]> {
self.documents.partition(thread_index, task_index)
}
}
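`UpdateByFunctionChanges` now collects its document ids into a bump-allocated slice wrapped in `PartitionChunks`, and `items` simply forwards to `partition(thread_index, task_index)`. The std-only stand-in below mimics the constructor and accessors seen in the hunk; the round-robin chunk assignment is an assumption about how `PartitionChunks` distributes work, not its verified behaviour.

    /// Std-only stand-in for the `PartitionChunks` wrapper used above:
    /// it owns nothing, just remembers how a borrowed slice is to be split.
    struct Partitioned<'a, T> {
        slice: &'a [T],
        chunk_size: usize,
        threads: usize,
    }

    impl<'a, T> Partitioned<'a, T> {
        fn new(slice: &'a [T], chunk_size: usize, threads: usize) -> Self {
            Partitioned { slice, chunk_size, threads }
        }

        fn slice(&self) -> &'a [T] {
            self.slice
        }

        /// Assumed layout: chunks are dealt to threads round-robin.
        fn partition(&self, thread_index: usize, task_index: usize) -> Option<&'a [T]> {
            self.slice.chunks(self.chunk_size).nth(task_index * self.threads + thread_index)
        }
    }

    fn main() {
        // In the real code the ids come from a RoaringBitmap collected into a
        // bump-allocated slice; a plain Vec is enough to show the shape.
        let document_ids: Vec<u32> = (0..8).collect();
        let changes = Partitioned::new(&document_ids, 3, 2);
        assert_eq!(changes.slice().len(), 8);
        assert_eq!(changes.partition(0, 0), Some(&[0, 1, 2][..]));
        assert_eq!(changes.partition(1, 0), Some(&[3, 4, 5][..]));
        assert_eq!(changes.partition(0, 1), Some(&[6, 7][..]));
        assert_eq!(changes.partition(1, 1), None);
    }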

View File

@@ -113,7 +113,6 @@ where
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub(super) fn update_index(
index: &Index,
wtxn: &mut RwTxn<'_>,
@@ -122,7 +121,6 @@ pub(super) fn update_index(
embedders: EmbeddingConfigs,
field_distribution: std::collections::BTreeMap<String, u64>,
document_ids: roaring::RoaringBitmap,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
if let Some(new_primary_key) = new_primary_key {
@@ -134,7 +132,6 @@ pub(super) fn update_index(
index.put_field_distribution(wtxn, &field_distribution)?;
index.put_documents_ids(wtxn, &document_ids)?;
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
index.update_documents_stats(wtxn, modified_docids)?;
Ok(())
}

View File

@@ -1,10 +1,10 @@
use std::cell::RefCell;
use std::sync::Mutex;
use hashbrown::HashMap;
use heed::types::Bytes;
use heed::{Database, RoTxn};
use memmap2::Mmap;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap;
use super::channel::*;
@@ -22,6 +22,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>(
index: &Index,
geo_sender: GeoSender<'_, '_>,
must_stop_processing: &MSP,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()>
where
MSP: Fn() -> bool + Sync,
@@ -57,13 +58,14 @@ where
let rtree_mmap = unsafe { Mmap::map(&file)? };
geo_sender.set_rtree(rtree_mmap).unwrap();
geo_sender.set_geo_faceted(&faceted)?;
geo_sender.set_geo_faceted(&faceted, thread_pool)?;
Ok(())
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_docids<'extractor, MSP, D>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
mut caches: Vec<BalancedCaches<'extractor>>,
database: Database<Bytes, Bytes>,
index: &Index,
@@ -74,7 +76,10 @@ where
MSP: Fn() -> bool + Sync,
D: DatabaseType + Sync,
{
transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
let frozen_caches = Mutex::new(transpose_and_freeze_caches(&mut caches)?);
match thread_pool.broadcast(|thread_index| {
let frozen = std::mem::take(frozen_caches.lock().unwrap().get_mut(thread_index).unwrap());
let rtxn = index.read_txn()?;
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
@@ -92,12 +97,17 @@ where
}
Operation::Ignore => Ok(()),
}
})
})
})?;
Ok(())
}) {
Ok(()) => Ok(()),
Err(errors) => Err(crate::Error::from_scoped_thread_pool_errors(thread_pool, errors)),
}
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_facet_docids<'extractor>(
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
mut caches: Vec<BalancedCaches<'extractor>>,
database: FacetDatabases,
index: &Index,
@@ -108,9 +118,15 @@ pub fn merge_and_send_facet_docids<'extractor>(
let max_number_count = (index.facet_id_f64_docids.len(rtxn)? / 500) as usize;
let max_string_count = max_string_count.clamp(1000, 100_000);
let max_number_count = max_number_count.clamp(1000, 100_000);
transpose_and_freeze_caches(&mut caches)?
.into_par_iter()
.map(|frozen| {
let transposed_frozen_caches = Mutex::new(transpose_and_freeze_caches(&mut caches)?);
let output = Mutex::new(FacetFieldIdsDelta::new(max_string_count, max_number_count));
thread_pool
.broadcast(|thread_index| {
// TODO: we can probably spare the mutex here since it is guaranteed that each thread will access its own cell of the vec
let frozen = std::mem::take(
transposed_frozen_caches.lock().unwrap().get_mut(thread_index).unwrap(),
);
let mut facet_field_ids_delta =
FacetFieldIdsDelta::new(max_string_count, max_number_count);
let rtxn = index.read_txn()?;
@@ -130,13 +146,18 @@ pub fn merge_and_send_facet_docids<'extractor>(
Operation::Ignore => Ok(()),
}
})?;
Ok(facet_field_ids_delta)
{
let mut common = output.lock().unwrap();
*common = std::mem::replace(
&mut *common,
FacetFieldIdsDelta::new(max_string_count, max_number_count),
)
.merge(facet_field_ids_delta);
}
Ok(())
})
.reduce(
|| Ok(FacetFieldIdsDelta::new(max_string_count, max_number_count)),
|lhs, rhs| Ok(lhs?.merge(rhs?)),
)
.map_err(|errors| crate::Error::from_scoped_thread_pool_errors(thread_pool, errors))?;
Ok(output.into_inner().unwrap())
}
pub struct FacetDatabases<'a> {
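The merge step above replaces rayon's `into_par_iter().try_for_each(...)` and `map().reduce()` with a `broadcast` over the pool: the frozen caches go into a `Mutex<Vec<_>>`, each worker `mem::take`s the entry matching its `thread_index`, and the per-thread `FacetFieldIdsDelta` results are folded into a shared accumulator behind another mutex. The sketch below reproduces only that concurrency shape with std threads, using plain integers in place of the caches and the delta; it is an illustration of the pattern, not the milli code.

    use std::sync::Mutex;
    use std::thread;

    fn main() {
        // One pre-frozen work item per worker (the real code stores one
        // frozen cache per extraction thread).
        let per_thread_input: Vec<Vec<u32>> = vec![vec![1, 2], vec![3, 4], vec![5]];
        let threads = per_thread_input.len();

        let inputs = Mutex::new(per_thread_input);
        // Shared accumulator that each worker merges its partial result into
        // (the role played by the `FacetFieldIdsDelta` behind a mutex).
        let total = Mutex::new(0u32);

        thread::scope(|s| {
            for thread_index in 0..threads {
                let inputs = &inputs;
                let total = &total;
                s.spawn(move || {
                    // Each worker takes exactly its own cell, so the mutex is
                    // only held for the duration of the take.
                    let my_input = std::mem::take(&mut inputs.lock().unwrap()[thread_index]);

                    // "Merge" step: compute a per-thread partial result.
                    let partial: u32 = my_input.iter().sum();

                    // Fold the partial result into the shared accumulator.
                    *total.lock().unwrap() += partial;
                });
            }
        });

        assert_eq!(total.into_inner().unwrap(), 15);
    }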

View File

@@ -26,11 +26,13 @@ impl WordPrefixDocids {
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> WordPrefixDocids {
WordPrefixDocids {
database,
prefix_database,
max_memory_by_thread: grenad_parameters.max_memory_by_thread(),
max_memory_by_thread: grenad_parameters
.max_memory_by_thread(thread_pool.thread_count()),
}
}
@@ -39,9 +41,10 @@ impl WordPrefixDocids {
wtxn: &mut heed::RwTxn,
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
self.recompute_modified_prefixes(wtxn, prefix_to_compute, thread_pool)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
@@ -49,6 +52,7 @@ impl WordPrefixDocids {
&self,
wtxn: &mut RwTxn,
prefixes: &BTreeSet<Prefix>,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
// We fetch the docids associated to the newly added word prefix fst only.
// And collect the CboRoaringBitmaps pointers in a HashMap.
@@ -56,7 +60,7 @@ impl WordPrefixDocids {
// We access this HashMap in parallel to compute the *union* of all
// of them and *serialize* them into files. There is one file per CPU.
let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads());
let local_entries = ThreadLocal::with_capacity(thread_pool.thread_count());
prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| {
let refcell = local_entries.get_or(|| {
let file = BufWriter::new(spooled_tempfile(
@@ -162,11 +166,13 @@ impl WordPrefixIntegerDocids {
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> WordPrefixIntegerDocids {
WordPrefixIntegerDocids {
database,
prefix_database,
max_memory_by_thread: grenad_parameters.max_memory_by_thread(),
max_memory_by_thread: grenad_parameters
.max_memory_by_thread(thread_pool.thread_count()),
}
}
@@ -175,9 +181,10 @@ impl WordPrefixIntegerDocids {
wtxn: &mut heed::RwTxn,
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?;
self.recompute_modified_prefixes(wtxn, prefix_to_compute)
self.recompute_modified_prefixes(wtxn, prefix_to_compute, thread_pool)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
@@ -185,6 +192,7 @@ impl WordPrefixIntegerDocids {
&self,
wtxn: &mut RwTxn,
prefixes: &BTreeSet<Prefix>,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
// We fetch the docids associated to the newly added word prefix fst only.
// And collect the CboRoaringBitmaps pointers in a HashMap.
@@ -192,7 +200,7 @@ impl WordPrefixIntegerDocids {
// We access this HashMap in parallel to compute the *union* of all
// of them and *serialize* them into files. There is one file per CPU.
let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads());
let local_entries = ThreadLocal::with_capacity(thread_pool.thread_count());
prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| {
let refcell = local_entries.get_or(|| {
let file = BufWriter::new(spooled_tempfile(
@@ -312,13 +320,15 @@ pub fn compute_word_prefix_docids(
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
WordPrefixDocids::new(
index.word_docids.remap_key_type(),
index.word_prefix_docids.remap_key_type(),
grenad_parameters,
thread_pool,
)
.execute(wtxn, prefix_to_compute, prefix_to_delete)
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
@@ -328,13 +338,15 @@ pub fn compute_exact_word_prefix_docids(
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
WordPrefixDocids::new(
index.exact_word_docids.remap_key_type(),
index.exact_word_prefix_docids.remap_key_type(),
grenad_parameters,
thread_pool,
)
.execute(wtxn, prefix_to_compute, prefix_to_delete)
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
@@ -344,13 +356,15 @@ pub fn compute_word_prefix_fid_docids(
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
WordPrefixIntegerDocids::new(
index.word_fid_docids.remap_key_type(),
index.word_prefix_fid_docids.remap_key_type(),
grenad_parameters,
thread_pool,
)
.execute(wtxn, prefix_to_compute, prefix_to_delete)
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
@@ -360,11 +374,13 @@ pub fn compute_word_prefix_position_docids(
prefix_to_compute: &BTreeSet<Prefix>,
prefix_to_delete: &BTreeSet<Prefix>,
grenad_parameters: &GrenadParameters,
thread_pool: &scoped_thread_pool::ThreadPool<crate::Error>,
) -> Result<()> {
WordPrefixIntegerDocids::new(
index.word_position_docids.remap_key_type(),
index.word_prefix_position_docids.remap_key_type(),
grenad_parameters,
thread_pool,
)
.execute(wtxn, prefix_to_compute, prefix_to_delete)
.execute(wtxn, prefix_to_compute, prefix_to_delete, thread_pool)
}
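`WordPrefixDocids::new` and `WordPrefixIntegerDocids::new` now receive the pool so that `max_memory_by_thread` can be derived from the pool's actual `thread_count()` rather than from `rayon::current_num_threads()`. Presumably this just splits the global extraction budget evenly across workers; the small sketch below shows that arithmetic under that assumption, with a hypothetical free function standing in for the method.

    /// Hypothetical budget split: divide an optional global memory budget
    /// evenly across the pool's workers (what `max_memory_by_thread` is
    /// assumed to do once it is given an explicit thread count).
    fn max_memory_by_thread(max_memory: Option<usize>, thread_count: usize) -> Option<usize> {
        max_memory.map(|total| total / thread_count.max(1))
    }

    fn main() {
        // 2 GiB budget split over 8 worker threads -> 256 MiB per thread.
        assert_eq!(
            max_memory_by_thread(Some(2 * 1024 * 1024 * 1024), 8),
            Some(256 * 1024 * 1024)
        );
        // No global limit means no per-thread limit either.
        assert_eq!(max_memory_by_thread(None, 8), None);
    }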

View File

@@ -3,7 +3,7 @@ mod v1_13;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Current};
use v1_13::V1_13_0_To_Current;
use crate::progress::{Progress, VariableNameStep};
use crate::{Index, InternalError, Result};
@@ -28,19 +28,15 @@ pub fn upgrade(
progress: Progress,
) -> Result<bool> {
let from = index.get_version(wtxn)?.unwrap_or(db_version);
let upgrade_functions: &[&dyn UpgradeIndex] = &[
&V1_12_To_V1_12_3 {},
&V1_12_3_To_V1_13_0 {},
&V1_13_0_To_V1_13_1 {},
&V1_13_1_To_Current {},
];
let upgrade_functions: &[&dyn UpgradeIndex] =
&[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()];
let start = match from {
(1, 12, 0..=2) => 0,
(1, 12, 3..) => 1,
(1, 13, 0) => 2,
// We must handle the current version in the match because in case of a failure some indexes may have been upgraded but not others.
(1, 13, _) => 3,
(1, 13, _) => return Ok(false),
(major, minor, patch) => {
return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
}

View File

@@ -2,44 +2,13 @@ use heed::RwTxn;
use super::UpgradeIndex;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::database_stats::DatabaseStats;
use crate::progress::Progress;
use crate::{make_enum_progress, Index, Result};
use crate::{Index, Result};
#[allow(non_camel_case_types)]
pub(super) struct V1_13_0_To_V1_13_1();
pub(super) struct V1_13_0_To_Current();
impl UpgradeIndex for V1_13_0_To_V1_13_1 {
fn upgrade(
&self,
wtxn: &mut RwTxn,
index: &Index,
_original: (u32, u32, u32),
progress: Progress,
) -> Result<bool> {
make_enum_progress! {
enum DocumentsStats {
CreatingDocumentsStats,
}
};
// Create the new documents stats.
progress.update_progress(DocumentsStats::CreatingDocumentsStats);
let stats = DatabaseStats::new(index.documents.remap_types(), wtxn)?;
index.put_documents_stats(wtxn, stats)?;
Ok(true)
}
fn target_version(&self) -> (u32, u32, u32) {
(1, 13, 1)
}
}
#[allow(non_camel_case_types)]
pub(super) struct V1_13_1_To_Current();
impl UpgradeIndex for V1_13_1_To_Current {
impl UpgradeIndex for V1_13_0_To_Current {
fn upgrade(
&self,
_wtxn: &mut RwTxn,

View File

@@ -5,6 +5,7 @@ use maplit::hashset;
use milli::documents::mmap_from_objects;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{FacetDistribution, Index, Object, OrderBy};
@@ -36,6 +37,8 @@ fn test_facet_distribution_with_no_facet_values() {
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
let doc1: Object = from_value(
@@ -59,12 +62,15 @@ fn test_facet_distribution_with_no_facet_values() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,
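Each test now builds its own pool sized from the machine's available parallelism and threads it through both the change-building step and `indexer::index`, alongside a fixed `CHUNK_SIZE`. The stand-in below shows, with std alone, what "with available parallelism" amounts to; the pool name mirrors the tests, while the struct, the chunk-size value, and everything else about the actual crate are assumptions.

    use std::num::NonZeroUsize;
    use std::thread;

    /// Std-only stand-in for a pool constructed "with available parallelism":
    /// it records a name for its worker threads and how many of them it has.
    struct NamedPool {
        name: String,
        threads: usize,
    }

    impl NamedPool {
        fn with_available_parallelism(name: String) -> Self {
            let threads = thread::available_parallelism()
                .map(NonZeroUsize::get)
                .unwrap_or(1);
            NamedPool { name, threads }
        }

        fn thread_count(&self) -> usize {
            self.threads
        }
    }

    fn main() {
        // Mirrors the test setup: one pool per test, shared by the change
        // builder and the indexing call, plus a fixed chunk size.
        const CHUNK_SIZE: usize = 100; // illustrative value, not milli's constant
        let thread_pool = NamedPool::with_available_parallelism("index".to_string());
        println!(
            "pool `{}` with {} threads, chunks of {CHUNK_SIZE} documents",
            thread_pool.name,
            thread_pool.thread_count()
        );
    }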

View File

@@ -9,6 +9,7 @@ use heed::EnvOpenOptions;
use maplit::{btreemap, hashset};
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy};
@@ -72,6 +73,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
let mut file = tempfile::tempfile().unwrap();
@@ -92,6 +95,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
@@ -102,6 +107,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,

View File

@@ -7,6 +7,7 @@ use itertools::Itertools;
use maplit::hashset;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
@@ -288,6 +289,8 @@ fn criteria_ascdesc() {
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
let mut file = tempfile::tempfile().unwrap();
@@ -328,12 +331,15 @@ fn criteria_ascdesc() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,

View File

@@ -5,6 +5,7 @@ use heed::EnvOpenOptions;
use milli::documents::mmap_from_objects;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::new::indexer::document_changes::CHUNK_SIZE;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy};
@@ -123,6 +124,8 @@ fn test_typo_disabled_on_word() {
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let thread_pool =
scoped_thread_pool::ThreadPool::with_available_parallelism("index".to_string());
let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
indexer.add_documents(&documents).unwrap();
@@ -137,12 +140,15 @@ fn test_typo_disabled_on_word() {
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&thread_pool,
CHUNK_SIZE,
)
.unwrap();
indexer::index(
&mut wtxn,
&index,
&thread_pool,
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(),
config.grenad_parameters(),
&db_fields_ids_map,