Merge pull request #5762 from meilisearch/new-document-indexer-for-dumps

Use the edition 2024 documents indexer in the dumps
This commit is contained in:
Louis Dureuil
2025-07-21 14:53:43 +00:00
committed by GitHub
26 changed files with 210 additions and 46 deletions

View File

@ -14,7 +14,7 @@ license.workspace = true
anyhow = "1.0.98"
bumpalo = "3.18.1"
csv = "1.3.1"
memmap2 = "0.9.5"
memmap2 = "0.9.7"
milli = { path = "../milli" }
mimalloc = { version = "0.1.47", default-features = false }
serde_json = { version = "1.0.140", features = ["preserve_order"] }
@ -55,4 +55,3 @@ harness = false
[[bench]]
name = "sort"
harness = false

View File

@ -1,3 +1,4 @@
use std::fs::File;
use std::str::FromStr;
use super::v2_to_v3::CompatV2ToV3;
@ -94,6 +95,10 @@ impl CompatIndexV1ToV2 {
self.from.documents().map(|it| Box::new(it) as Box<dyn Iterator<Item = _>>)
}
pub fn documents_file(&self) -> &File {
self.from.documents_file()
}
pub fn settings(&mut self) -> Result<v2::settings::Settings<v2::settings::Checked>> {
Ok(v2::settings::Settings::<v2::settings::Unchecked>::from(self.from.settings()?).check())
}

View File

@ -1,3 +1,4 @@
use std::fs::File;
use std::str::FromStr;
use time::OffsetDateTime;
@ -122,6 +123,13 @@ impl CompatIndexV2ToV3 {
}
}
pub fn documents_file(&self) -> &File {
match self {
CompatIndexV2ToV3::V2(v2) => v2.documents_file(),
CompatIndexV2ToV3::Compat(compat) => compat.documents_file(),
}
}
pub fn settings(&mut self) -> Result<v3::Settings<v3::Checked>> {
let settings = match self {
CompatIndexV2ToV3::V2(from) => from.settings()?,

View File

@ -1,3 +1,5 @@
use std::fs::File;
use super::v2_to_v3::{CompatIndexV2ToV3, CompatV2ToV3};
use super::v4_to_v5::CompatV4ToV5;
use crate::reader::{v3, v4, UpdateFile};
@ -252,6 +254,13 @@ impl CompatIndexV3ToV4 {
}
}
pub fn documents_file(&self) -> &File {
match self {
CompatIndexV3ToV4::V3(v3) => v3.documents_file(),
CompatIndexV3ToV4::Compat(compat) => compat.documents_file(),
}
}
pub fn settings(&mut self) -> Result<v4::Settings<v4::Checked>> {
Ok(match self {
CompatIndexV3ToV4::V3(v3) => {

View File

@ -1,3 +1,5 @@
use std::fs::File;
use super::v3_to_v4::{CompatIndexV3ToV4, CompatV3ToV4};
use super::v5_to_v6::CompatV5ToV6;
use crate::reader::{v4, v5, Document};
@ -241,6 +243,13 @@ impl CompatIndexV4ToV5 {
}
}
pub fn documents_file(&self) -> &File {
match self {
CompatIndexV4ToV5::V4(v4) => v4.documents_file(),
CompatIndexV4ToV5::Compat(compat) => compat.documents_file(),
}
}
pub fn settings(&mut self) -> Result<v5::Settings<v5::Checked>> {
match self {
CompatIndexV4ToV5::V4(v4) => Ok(v5::Settings::from(v4.settings()?).check()),

View File

@ -1,3 +1,4 @@
use std::fs::File;
use std::num::NonZeroUsize;
use std::str::FromStr;
@ -243,6 +244,13 @@ impl CompatIndexV5ToV6 {
}
}
pub fn documents_file(&self) -> &File {
match self {
CompatIndexV5ToV6::V5(v5) => v5.documents_file(),
CompatIndexV5ToV6::Compat(compat) => compat.documents_file(),
}
}
pub fn settings(&mut self) -> Result<v6::Settings<v6::Checked>> {
match self {
CompatIndexV5ToV6::V5(v5) => Ok(v6::Settings::from(v5.settings()?).check()),

View File

@ -192,6 +192,14 @@ impl DumpIndexReader {
}
}
/// A reference to a file in the NDJSON format containing all the documents of the index
pub fn documents_file(&self) -> &File {
match self {
DumpIndexReader::Current(v6) => v6.documents_file(),
DumpIndexReader::Compat(compat) => compat.documents_file(),
}
}
pub fn settings(&mut self) -> Result<v6::Settings<v6::Checked>> {
match self {
DumpIndexReader::Current(v6) => v6.settings(),

View File

@ -72,6 +72,10 @@ impl V1IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<self::settings::Settings> {
Ok(serde_json::from_reader(&mut self.settings)?)
}

View File

@ -203,6 +203,10 @@ impl V2IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

View File

@ -215,6 +215,10 @@ impl V3IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

View File

@ -210,6 +210,10 @@ impl V4IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

View File

@ -247,6 +247,10 @@ impl V5IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
Ok(self.settings.clone())
}

View File

@ -284,6 +284,10 @@ impl V6IndexReader {
.map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) }))
}
pub fn documents_file(&self) -> &File {
self.documents.get_ref()
}
pub fn settings(&mut self) -> Result<Settings<Checked>> {
let mut settings: Settings<Unchecked> = serde_json::from_reader(&mut self.settings)?;
patch_embedders(&mut settings);

View File

@ -26,7 +26,7 @@ flate2 = "1.1.2"
indexmap = "2.9.0"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.5"
memmap2 = "0.9.7"
page_size = "0.6.0"
rayon = "1.10.0"
roaring = { version = "0.10.12", features = ["serde"] }

View File

@ -20,6 +20,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
let IndexScheduler {
cleanup_enabled: _,
experimental_no_edition_2024_for_dumps: _,
processing_tasks,
env,
version,

View File

@ -168,6 +168,9 @@ pub struct IndexScheduler {
/// Whether we should automatically cleanup the task queue or not.
pub(crate) cleanup_enabled: bool,
/// Whether we should use the old document indexer or the new one.
pub(crate) experimental_no_edition_2024_for_dumps: bool,
/// The webhook url we should send tasks to after processing every batches.
pub(crate) webhook_url: Option<String>,
/// The Authorization header to send to the webhook URL.
@ -210,6 +213,7 @@ impl IndexScheduler {
index_mapper: self.index_mapper.clone(),
cleanup_enabled: self.cleanup_enabled,
experimental_no_edition_2024_for_dumps: self.experimental_no_edition_2024_for_dumps,
webhook_url: self.webhook_url.clone(),
webhook_authorization_header: self.webhook_authorization_header.clone(),
embedders: self.embedders.clone(),
@ -296,6 +300,9 @@ impl IndexScheduler {
index_mapper,
env,
cleanup_enabled: options.cleanup_enabled,
experimental_no_edition_2024_for_dumps: options
.indexer_config
.experimental_no_edition_2024_for_dumps,
webhook_url: options.webhook_url,
webhook_authorization_header: options.webhook_authorization_header,
embedders: Default::default(),
@ -594,6 +601,11 @@ impl IndexScheduler {
Ok(nbr_index_processing_tasks > 0)
}
/// Whether the index should use the old document indexer.
pub fn no_edition_2024_for_dumps(&self) -> bool {
self.experimental_no_edition_2024_for_dumps
}
/// Return the tasks matching the query from the user's point of view along
/// with the total number of tasks matching the query, ignoring from and limit.
///

View File

@ -24,7 +24,7 @@ enum-iterator = "2.1.0"
file-store = { path = "../file-store" }
flate2 = "1.1.2"
fst = "0.4.7"
memmap2 = "0.9.5"
memmap2 = "0.9.7"
milli = { path = "../milli" }
roaring = { version = "0.10.12", features = ["serde"] }
rustc-hash = "2.1.1"

View File

@ -50,6 +50,7 @@ jsonwebtoken = "9.3.1"
lazy_static = "1.5.0"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.7"
mimalloc = { version = "0.1.47", default-features = false }
mime = "0.3.17"
num_cpus = "1.17.0"

View File

@ -203,6 +203,7 @@ struct Infos {
experimental_composite_embedders: bool,
experimental_embedding_cache_entries: usize,
experimental_no_snapshot_compaction: bool,
experimental_no_edition_2024_for_dumps: bool,
experimental_no_edition_2024_for_settings: bool,
gpu_enabled: bool,
db_path: bool,
@ -293,6 +294,7 @@ impl Infos {
max_indexing_threads,
skip_index_budget: _,
experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps,
} = indexer_options;
let RuntimeTogglableFeatures {
@ -329,6 +331,7 @@ impl Infos {
experimental_composite_embedders: composite_embedders,
experimental_embedding_cache_entries,
experimental_no_snapshot_compaction,
experimental_no_edition_2024_for_dumps,
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(),

View File

@ -30,6 +30,7 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest};
use analytics::Analytics;
use anyhow::bail;
use bumpalo::Bump;
use error::PayloadError;
use extractors::payload::PayloadConfig;
use index_scheduler::versioning::Versioning;
@ -38,6 +39,7 @@ use meilisearch_auth::{open_auth_store_env, AuthController};
use meilisearch_types::milli::constants::VERSION_MAJOR;
use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use meilisearch_types::milli::progress::{EmbedderStats, Progress};
use meilisearch_types::milli::update::new::indexer;
use meilisearch_types::milli::update::{
default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig,
};
@ -533,7 +535,7 @@ fn import_dump(
let mut index_reader = index_reader?;
let metadata = index_reader.metadata();
let uid = metadata.uid.clone();
tracing::info!("Importing index `{}`.", metadata.uid);
tracing::info!("Importing index `{uid}`.");
let date = Some((metadata.created_at, metadata.updated_at));
let index = index_scheduler.create_raw_index(&metadata.uid, date)?;
@ -552,48 +554,100 @@ fn import_dump(
apply_settings_to_builder(&settings, &mut builder);
let embedder_stats: Arc<EmbedderStats> = Default::default();
builder.execute(&|| false, &progress, embedder_stats.clone())?;
wtxn.commit()?;
// 5.3 Import the documents.
// 5.3.1 We need to recreate the grenad+obkv format accepted by the index.
tracing::info!("Importing the documents.");
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
builder.append_json_object(&document?)?;
let mut wtxn = index.write_txn()?;
let rtxn = index.read_txn()?;
if index_scheduler.no_edition_2024_for_dumps() {
// 5.3 Import the documents.
// 5.3.1 We need to recreate the grenad+obkv format accepted by the index.
tracing::info!("Importing the documents.");
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
builder.append_json_object(&document?)?;
}
// This flush the content of the batch builder.
let file = builder.into_inner()?.into_inner()?;
// 5.3.2 We feed it to the milli index.
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;
let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?;
let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
indexer_config,
IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| tracing::trace!("update: {:?}", indexing_step),
|| false,
&embedder_stats,
)?;
let builder = builder.with_embedders(embedders);
let (builder, user_result) = builder.add_documents(reader)?;
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
builder.execute()?;
} else {
let db_fields_ids_map = index.fields_ids_map(&rtxn)?;
let primary_key = index.primary_key(&rtxn)?;
let mut new_fields_ids_map = db_fields_ids_map.clone();
let mut indexer = indexer::DocumentOperation::new();
let embedders = index.embedding_configs().embedding_configs(&rtxn)?;
let embedders = index_scheduler.embedders(uid.clone(), embedders)?;
let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? };
indexer.replace_documents(&mmap)?;
let indexer_config = index_scheduler.indexer_config();
let pool = &indexer_config.thread_pool;
let indexer_alloc = Bump::new();
let (document_changes, mut operation_stats, primary_key) = indexer.into_changes(
&indexer_alloc,
&index,
&rtxn,
primary_key,
&mut new_fields_ids_map,
&|| false, // never stop processing a dump
progress.clone(),
)?;
let operation_stats = operation_stats.pop().unwrap();
if let Some(error) = operation_stats.error {
return Err(error.into());
}
let _congestion = indexer::index(
&mut wtxn,
&index,
pool,
indexer_config.grenad_parameters(),
&db_fields_ids_map,
new_fields_ids_map,
primary_key,
&document_changes,
embedders,
&|| false, // never stop processing a dump
&progress,
&embedder_stats,
)?;
}
// This flush the content of the batch builder.
let file = builder.into_inner()?.into_inner()?;
// 5.3.2 We feed it to the milli index.
let reader = BufReader::new(file);
let reader = DocumentsBatchReader::from_reader(reader)?;
let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?;
let builder = milli::update::IndexDocuments::new(
&mut wtxn,
&index,
indexer_config,
IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
..Default::default()
},
|indexing_step| tracing::trace!("update: {:?}", indexing_step),
|| false,
&embedder_stats,
)?;
let builder = builder.with_embedders(embedders);
let (builder, user_result) = builder.add_documents(reader)?;
let user_result = user_result?;
tracing::info!(documents_found = user_result, "{} documents found.", user_result);
builder.execute()?;
wtxn.commit()?;
tracing::info!("All documents successfully imported.");
index_scheduler.refresh_index_stats(&uid)?;
}

View File

@ -68,6 +68,8 @@ const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str =
const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str =
"MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES";
const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION";
const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS: &str =
"MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms";
const DEFAULT_HTTP_ADDR: &str = "localhost:7700";
@ -759,6 +761,15 @@ pub struct IndexerOpts {
#[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)]
#[serde(default)]
pub experimental_no_edition_2024_for_settings: bool,
/// Experimental make dump imports use the old document indexer.
///
/// When enabled, Meilisearch will use the old document indexer when importing dumps.
///
/// For more information, see <https://github.com/orgs/meilisearch/discussions/851>.
#[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)]
#[serde(default)]
pub experimental_no_edition_2024_for_dumps: bool,
}
impl IndexerOpts {
@ -769,6 +780,7 @@ impl IndexerOpts {
max_indexing_threads,
skip_index_budget: _,
experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps,
} = self;
if let Some(max_indexing_memory) = max_indexing_memory.0 {
export_to_env_if_not_present(
@ -788,6 +800,12 @@ impl IndexerOpts {
experimental_no_edition_2024_for_settings.to_string(),
);
}
if experimental_no_edition_2024_for_dumps {
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS,
experimental_no_edition_2024_for_dumps.to_string(),
);
}
}
}
@ -808,6 +826,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
skip_index_budget: other.skip_index_budget,
experimental_no_edition_2024_for_settings: other
.experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps: other.experimental_no_edition_2024_for_dumps,
chunk_compression_type: Default::default(),
chunk_compression_level: Default::default(),
documents_chunk_size: Default::default(),

View File

@ -466,6 +466,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
// Having 2 threads makes the tests way faster
max_indexing_threads: MaxThreads::from_str("2").unwrap(),
experimental_no_edition_2024_for_settings: false,
experimental_no_edition_2024_for_dumps: false,
},
experimental_enable_metrics: false,
..Parser::parse_from(None as Option<&str>)

View File

@ -40,7 +40,7 @@ indexmap = { version = "2.9.0", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memchr = "2.7.5"
memmap2 = "0.9.5"
memmap2 = "0.9.7"
obkv = "0.3.0"
once_cell = "1.21.3"
ordered-float = "5.0.0"

View File

@ -16,6 +16,7 @@ pub struct IndexerConfig {
pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool,
pub experimental_no_edition_2024_for_settings: bool,
pub experimental_no_edition_2024_for_dumps: bool,
}
impl IndexerConfig {
@ -65,6 +66,7 @@ impl Default for IndexerConfig {
max_positions_per_attributes: None,
skip_index_budget: false,
experimental_no_edition_2024_for_settings: false,
experimental_no_edition_2024_for_dumps: false,
}
}
}