From fe15e11c9de7ef9866c04fcd2b07d1d2644aad6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 16:12:23 +0200 Subject: [PATCH 1/9] Introduce a new CLI and env var to use the old document indexer when importing dumps --- crates/index-scheduler/src/lib.rs | 12 +++ .../src/analytics/segment_analytics.rs | 3 + crates/meilisearch/src/lib.rs | 79 ++++++++++--------- crates/meilisearch/src/option.rs | 16 ++++ 4 files changed, 73 insertions(+), 37 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index b2f27d66b..f91e45914 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -139,6 +139,8 @@ pub struct IndexSchedulerOptions { pub embedding_cache_cap: usize, /// Snapshot compaction status. pub experimental_no_snapshot_compaction: bool, + /// Whether dump import use the old document indexer or the new one. + pub experimental_no_edition_2024_for_dumps: bool, } /// Structure which holds meilisearch's indexes and schedules the tasks @@ -168,6 +170,9 @@ pub struct IndexScheduler { /// Whether we should automatically cleanup the task queue or not. pub(crate) cleanup_enabled: bool, + /// Whether we should use the old document indexer or the new one. + pub(crate) experimental_no_edition_2024_for_dumps: bool, + /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, /// The Authorization header to send to the webhook URL. @@ -210,6 +215,7 @@ impl IndexScheduler { index_mapper: self.index_mapper.clone(), cleanup_enabled: self.cleanup_enabled, + experimental_no_edition_2024_for_dumps: self.experimental_no_edition_2024_for_dumps, webhook_url: self.webhook_url.clone(), webhook_authorization_header: self.webhook_authorization_header.clone(), embedders: self.embedders.clone(), @@ -296,6 +302,7 @@ impl IndexScheduler { index_mapper, env, cleanup_enabled: options.cleanup_enabled, + experimental_no_edition_2024_for_dumps: options.experimental_no_edition_2024_for_dumps, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), @@ -594,6 +601,11 @@ impl IndexScheduler { Ok(nbr_index_processing_tasks > 0) } + /// Whether the index should use the old document indexer. + pub fn no_edition_2024_for_dumps(&self) -> bool { + self.experimental_no_edition_2024_for_dumps + } + /// Return the tasks matching the query from the user's point of view along /// with the total number of tasks matching the query, ignoring from and limit. /// diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 0abc5c817..a96ddf068 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -203,6 +203,7 @@ struct Infos { experimental_composite_embedders: bool, experimental_embedding_cache_entries: usize, experimental_no_snapshot_compaction: bool, + experimental_no_edition_2024_for_dumps: bool, experimental_no_edition_2024_for_settings: bool, gpu_enabled: bool, db_path: bool, @@ -253,6 +254,7 @@ impl Infos { experimental_limit_batched_tasks_total_size, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps, http_addr, master_key: _, env, @@ -329,6 +331,7 @@ impl Infos { experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 43d7afe0e..8907a5632 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -238,6 +238,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< auto_upgrade: opt.experimental_dumpless_upgrade, embedding_cache_cap: opt.experimental_embedding_cache_entries, experimental_no_snapshot_compaction: opt.experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps: opt.experimental_no_edition_2024_for_dumps, }; let binary_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH); @@ -553,47 +554,51 @@ fn import_dump( let embedder_stats: Arc = Default::default(); builder.execute(&|| false, &progress, embedder_stats.clone())?; - // 5.3 Import the documents. - // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. - tracing::info!("Importing the documents."); - let file = tempfile::tempfile()?; - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); - for document in index_reader.documents()? { - builder.append_json_object(&document?)?; + if index_scheduler.no_edition_2024_for_dumps() { + // 5.3 Import the documents. + // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. + tracing::info!("Importing the documents."); + let file = tempfile::tempfile()?; + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); + for document in index_reader.documents()? { + builder.append_json_object(&document?)?; + } + + // This flush the content of the batch builder. + let file = builder.into_inner()?.into_inner()?; + + // 5.3.2 We feed it to the milli index. + let reader = BufReader::new(file); + let reader = DocumentsBatchReader::from_reader(reader)?; + + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; + let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; + + let builder = milli::update::IndexDocuments::new( + &mut wtxn, + &index, + indexer_config, + IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }, + |indexing_step| tracing::trace!("update: {:?}", indexing_step), + || false, + &embedder_stats, + )?; + + let builder = builder.with_embedders(embedders); + + let (builder, user_result) = builder.add_documents(reader)?; + let user_result = user_result?; + tracing::info!(documents_found = user_result, "{} documents found.", user_result); + builder.execute()?; + } else { + unimplemented!("new document indexer when importing dumps"); } - // This flush the content of the batch builder. - let file = builder.into_inner()?.into_inner()?; - - // 5.3.2 We feed it to the milli index. - let reader = BufReader::new(file); - let reader = DocumentsBatchReader::from_reader(reader)?; - - let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; - - let builder = milli::update::IndexDocuments::new( - &mut wtxn, - &index, - indexer_config, - IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }, - |indexing_step| tracing::trace!("update: {:?}", indexing_step), - || false, - &embedder_stats, - )?; - - let builder = builder.with_embedders(embedders); - - let (builder, user_result) = builder.add_documents(reader)?; - let user_result = user_result?; - tracing::info!(documents_found = user_result, "{} documents found.", user_result); - builder.execute()?; wtxn.commit()?; tracing::info!("All documents successfully imported."); - index_scheduler.refresh_index_stats(&uid)?; } diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 9658352c8..77106d362 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -68,6 +68,8 @@ const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str = const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str = "MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES"; const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION"; +const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS: &str = + "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; const DEFAULT_HTTP_ADDR: &str = "localhost:7700"; @@ -467,6 +469,15 @@ pub struct Opt { #[serde(default)] pub experimental_no_snapshot_compaction: bool, + /// Experimental make dump imports use the old document indexer. + /// + /// When enabled, Meilisearch will use the old document indexer when importing dumps. + /// + /// For more information, see . + #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)] + #[serde(default)] + pub experimental_no_edition_2024_for_dumps: bool, + #[serde(flatten)] #[clap(flatten)] pub indexer_options: IndexerOpts, @@ -572,6 +583,7 @@ impl Opt { experimental_limit_batched_tasks_total_size, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); @@ -672,6 +684,10 @@ impl Opt { MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION, experimental_no_snapshot_compaction.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS, + experimental_no_edition_2024_for_dumps.to_string(), + ); indexer_options.export_to_env(); } From 338806283b1303691843150d6530904ca34fb717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 16:13:00 +0200 Subject: [PATCH 2/9] Do not track meilisearch databases --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 764447352..44cfa8f75 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ **/*.json_lines **/*.rs.bk /*.mdb -/data.ms +/*.ms /snapshots /dumps /bench From 760ccffdbd2a1bb51b95de3c024e77e09f46f475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:12:18 +0200 Subject: [PATCH 3/9] Expose the documents files from the dumps --- crates/dump/src/reader/compat/v1_to_v2.rs | 5 +++++ crates/dump/src/reader/compat/v2_to_v3.rs | 8 ++++++++ crates/dump/src/reader/compat/v3_to_v4.rs | 9 +++++++++ crates/dump/src/reader/compat/v4_to_v5.rs | 9 +++++++++ crates/dump/src/reader/compat/v5_to_v6.rs | 8 ++++++++ crates/dump/src/reader/mod.rs | 7 +++++++ crates/dump/src/reader/v1/mod.rs | 4 ++++ crates/dump/src/reader/v2/mod.rs | 4 ++++ crates/dump/src/reader/v3/mod.rs | 4 ++++ crates/dump/src/reader/v4/mod.rs | 4 ++++ crates/dump/src/reader/v5/mod.rs | 4 ++++ crates/dump/src/reader/v6/mod.rs | 4 ++++ 12 files changed, 70 insertions(+) diff --git a/crates/dump/src/reader/compat/v1_to_v2.rs b/crates/dump/src/reader/compat/v1_to_v2.rs index 0d050497b..35d369c3a 100644 --- a/crates/dump/src/reader/compat/v1_to_v2.rs +++ b/crates/dump/src/reader/compat/v1_to_v2.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use super::v2_to_v3::CompatV2ToV3; @@ -94,6 +95,10 @@ impl CompatIndexV1ToV2 { self.from.documents().map(|it| Box::new(it) as Box>) } + pub fn documents_file(&self) -> &File { + self.from.documents_file() + } + pub fn settings(&mut self) -> Result> { Ok(v2::settings::Settings::::from(self.from.settings()?).check()) } diff --git a/crates/dump/src/reader/compat/v2_to_v3.rs b/crates/dump/src/reader/compat/v2_to_v3.rs index e7516e708..62326040e 100644 --- a/crates/dump/src/reader/compat/v2_to_v3.rs +++ b/crates/dump/src/reader/compat/v2_to_v3.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use time::OffsetDateTime; @@ -122,6 +123,13 @@ impl CompatIndexV2ToV3 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV2ToV3::V2(v2) => v2.documents_file(), + CompatIndexV2ToV3::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { let settings = match self { CompatIndexV2ToV3::V2(from) => from.settings()?, diff --git a/crates/dump/src/reader/compat/v3_to_v4.rs b/crates/dump/src/reader/compat/v3_to_v4.rs index 5bb70e9b2..1dba37771 100644 --- a/crates/dump/src/reader/compat/v3_to_v4.rs +++ b/crates/dump/src/reader/compat/v3_to_v4.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v2_to_v3::{CompatIndexV2ToV3, CompatV2ToV3}; use super::v4_to_v5::CompatV4ToV5; use crate::reader::{v3, v4, UpdateFile}; @@ -252,6 +254,13 @@ impl CompatIndexV3ToV4 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV3ToV4::V3(v3) => v3.documents_file(), + CompatIndexV3ToV4::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { Ok(match self { CompatIndexV3ToV4::V3(v3) => { diff --git a/crates/dump/src/reader/compat/v4_to_v5.rs b/crates/dump/src/reader/compat/v4_to_v5.rs index e52acb176..3f47b5b48 100644 --- a/crates/dump/src/reader/compat/v4_to_v5.rs +++ b/crates/dump/src/reader/compat/v4_to_v5.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v3_to_v4::{CompatIndexV3ToV4, CompatV3ToV4}; use super::v5_to_v6::CompatV5ToV6; use crate::reader::{v4, v5, Document}; @@ -241,6 +243,13 @@ impl CompatIndexV4ToV5 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV4ToV5::V4(v4) => v4.documents_file(), + CompatIndexV4ToV5::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV4ToV5::V4(v4) => Ok(v5::Settings::from(v4.settings()?).check()), diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index f7bda81c6..f173bb6bd 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::num::NonZeroUsize; use std::str::FromStr; @@ -243,6 +244,13 @@ impl CompatIndexV5ToV6 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV5ToV6::V5(v5) => v5.documents_file(), + CompatIndexV5ToV6::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV5ToV6::V5(v5) => Ok(v6::Settings::from(v5.settings()?).check()), diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 23e7eec9e..91c6d5880 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -192,6 +192,13 @@ impl DumpIndexReader { } } + pub fn documents_file(&self) -> &File { + match self { + DumpIndexReader::Current(v6) => v6.documents_file(), + DumpIndexReader::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { DumpIndexReader::Current(v6) => v6.settings(), diff --git a/crates/dump/src/reader/v1/mod.rs b/crates/dump/src/reader/v1/mod.rs index ac7324d9a..d86ede62c 100644 --- a/crates/dump/src/reader/v1/mod.rs +++ b/crates/dump/src/reader/v1/mod.rs @@ -72,6 +72,10 @@ impl V1IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result { Ok(serde_json::from_reader(&mut self.settings)?) } diff --git a/crates/dump/src/reader/v2/mod.rs b/crates/dump/src/reader/v2/mod.rs index 14a643c2d..a74687381 100644 --- a/crates/dump/src/reader/v2/mod.rs +++ b/crates/dump/src/reader/v2/mod.rs @@ -203,6 +203,10 @@ impl V2IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v3/mod.rs b/crates/dump/src/reader/v3/mod.rs index 920e1dc6e..5f89eb861 100644 --- a/crates/dump/src/reader/v3/mod.rs +++ b/crates/dump/src/reader/v3/mod.rs @@ -215,6 +215,10 @@ impl V3IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v4/mod.rs b/crates/dump/src/reader/v4/mod.rs index 585786ae4..16a1e27c2 100644 --- a/crates/dump/src/reader/v4/mod.rs +++ b/crates/dump/src/reader/v4/mod.rs @@ -210,6 +210,10 @@ impl V4IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v5/mod.rs b/crates/dump/src/reader/v5/mod.rs index dfbc6346c..0123db433 100644 --- a/crates/dump/src/reader/v5/mod.rs +++ b/crates/dump/src/reader/v5/mod.rs @@ -247,6 +247,10 @@ impl V5IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 449a7e5fe..08d4700e5 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -284,6 +284,10 @@ impl V6IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { let mut settings: Settings = serde_json::from_reader(&mut self.settings)?; patch_embedders(&mut settings); From d67db6e3c2a97acb956c55ccab4f38b1dd988212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:12:42 +0200 Subject: [PATCH 4/9] Use the edition 2024 documents indexer in the dumps --- Cargo.lock | 5 ++-- crates/meilisearch/Cargo.toml | 1 + crates/meilisearch/src/lib.rs | 54 +++++++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ceec0a05e..8413b3d14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3775,6 +3775,7 @@ dependencies = [ "meili-snap", "meilisearch-auth", "meilisearch-types", + "memmap2", "mimalloc", "mime", "mopa-maintained", @@ -3908,9 +3909,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.5" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28" dependencies = [ "libc", "stable_deref_trait", diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 83eb439d9..21f6b58e5 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -50,6 +50,7 @@ jsonwebtoken = "9.3.1" lazy_static = "1.5.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.7" mimalloc = { version = "0.1.47", default-features = false } mime = "0.3.17" num_cpus = "1.17.0" diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 8907a5632..57a20a633 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -30,6 +30,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest}; use analytics::Analytics; use anyhow::bail; +use bumpalo::Bump; use error::PayloadError; use extractors::payload::PayloadConfig; use index_scheduler::versioning::Versioning; @@ -38,6 +39,7 @@ use meilisearch_auth::{open_auth_store_env, AuthController}; use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use meilisearch_types::milli::progress::{EmbedderStats, Progress}; +use meilisearch_types::milli::update::new::indexer; use meilisearch_types::milli::update::{ default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, }; @@ -534,7 +536,7 @@ fn import_dump( let mut index_reader = index_reader?; let metadata = index_reader.metadata(); let uid = metadata.uid.clone(); - tracing::info!("Importing index `{}`.", metadata.uid); + tracing::info!("Importing index `{uid}`."); let date = Some((metadata.created_at, metadata.updated_at)); let index = index_scheduler.create_raw_index(&metadata.uid, date)?; @@ -553,6 +555,10 @@ fn import_dump( apply_settings_to_builder(&settings, &mut builder); let embedder_stats: Arc = Default::default(); builder.execute(&|| false, &progress, embedder_stats.clone())?; + wtxn.commit()?; + + let mut wtxn = index.write_txn()?; + let rtxn = index.read_txn()?; if index_scheduler.no_edition_2024_for_dumps() { // 5.3 Import the documents. @@ -594,7 +600,51 @@ fn import_dump( tracing::info!(documents_found = user_result, "{} documents found.", user_result); builder.execute()?; } else { - unimplemented!("new document indexer when importing dumps"); + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let primary_key = index.primary_key(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut indexer = indexer::DocumentOperation::new(); + let embedders = index.embedding_configs().embedding_configs(&mut wtxn)?; + let embedders = index_scheduler.embedders(uid.clone(), embedders)?; + + let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? }; + + indexer.replace_documents(&mmap)?; + + let indexer_config = index_scheduler.indexer_config(); + let pool = &indexer_config.thread_pool; + + let indexer_alloc = Bump::new(); + let (document_changes, mut operation_stats, primary_key) = indexer.into_changes( + &indexer_alloc, + &index, + &rtxn, + primary_key, + &mut new_fields_ids_map, + &|| false, // never stop processing a dump + progress.clone(), + )?; + + let operation_stats = operation_stats.pop().unwrap(); + if let Some(error) = operation_stats.error { + return Err(error.into()); + } + + let _congestion = indexer::index( + &mut wtxn, + &index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| false, // never stop processing a dump + &progress, + &embedder_stats, + )?; } wtxn.commit()?; From a1b42c10e2fceadd07f39ca1cac547a99053a8b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:21:03 +0200 Subject: [PATCH 5/9] Make clippy happy --- crates/index-scheduler/src/insta_snapshot.rs | 1 + crates/index-scheduler/src/test_utils.rs | 1 + crates/meilisearch/src/lib.rs | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 0cbbb2514..32ce131b5 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -20,6 +20,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let IndexScheduler { cleanup_enabled: _, + experimental_no_edition_2024_for_dumps: _, processing_tasks, env, version, diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs index bfed7f53a..0a705b6c7 100644 --- a/crates/index-scheduler/src/test_utils.rs +++ b/crates/index-scheduler/src/test_utils.rs @@ -115,6 +115,7 @@ impl IndexScheduler { auto_upgrade: true, // Don't cost much and will ensure the happy path works embedding_cache_cap: 10, experimental_no_snapshot_compaction: false, + experimental_no_edition_2024_for_dumps: false, }; let version = configuration(&mut options).unwrap_or({ (versioning::VERSION_MAJOR, versioning::VERSION_MINOR, versioning::VERSION_PATCH) diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 57a20a633..13d2eb789 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -605,7 +605,7 @@ fn import_dump( let mut new_fields_ids_map = db_fields_ids_map.clone(); let mut indexer = indexer::DocumentOperation::new(); - let embedders = index.embedding_configs().embedding_configs(&mut wtxn)?; + let embedders = index.embedding_configs().embedding_configs(&rtxn)?; let embedders = index_scheduler.embedders(uid.clone(), embedders)?; let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? }; From 1b476b8a35655283840cca3f37ce1af691d3feb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:26:41 +0200 Subject: [PATCH 6/9] Add documentation to the new documents_file dump reader method Co-authored-by: Louis Dureuil --- crates/dump/src/reader/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 91c6d5880..c894c255f 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -192,6 +192,7 @@ impl DumpIndexReader { } } + /// A reference to a file in the NDJSON format containing all the documents of the index pub fn documents_file(&self) -> &File { match self { DumpIndexReader::Current(v6) => v6.documents_file(), From 626be0ef28fef7e71ad8628d95f0b76d44509b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:27:00 +0200 Subject: [PATCH 7/9] Small typo fix Co-authored-by: Louis Dureuil --- crates/index-scheduler/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index f91e45914..8715bc100 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -139,7 +139,7 @@ pub struct IndexSchedulerOptions { pub embedding_cache_cap: usize, /// Snapshot compaction status. pub experimental_no_snapshot_compaction: bool, - /// Whether dump import use the old document indexer or the new one. + /// Whether dump import uses the old document indexer or the new one. pub experimental_no_edition_2024_for_dumps: bool, } From b85657de1eb075a5b5160dfcbec721237856feeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 17 Jul 2025 17:29:59 +0200 Subject: [PATCH 8/9] Update memmap2 version everywhere --- crates/benchmarks/Cargo.toml | 3 +-- crates/index-scheduler/Cargo.toml | 2 +- crates/meilisearch-types/Cargo.toml | 2 +- crates/milli/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 68ed5aff4..f60f0979c 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -14,7 +14,7 @@ license.workspace = true anyhow = "1.0.98" bumpalo = "3.18.1" csv = "1.3.1" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } mimalloc = { version = "0.1.47", default-features = false } serde_json = { version = "1.0.140", features = ["preserve_order"] } @@ -55,4 +55,3 @@ harness = false [[bench]] name = "sort" harness = false - diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index de0d01935..20cc49686 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -26,7 +26,7 @@ flate2 = "1.1.2" indexmap = "2.9.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } -memmap2 = "0.9.5" +memmap2 = "0.9.7" page_size = "0.6.0" rayon = "1.10.0" roaring = { version = "0.10.12", features = ["serde"] } diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index faf59643f..f3279a094 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ enum-iterator = "2.1.0" file-store = { path = "../file-store" } flate2 = "1.1.2" fst = "0.4.7" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } roaring = { version = "0.10.12", features = ["serde"] } rustc-hash = "2.1.1" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 3d08252ac..d94a4d4e1 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -40,7 +40,7 @@ indexmap = { version = "2.9.0", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memchr = "2.7.5" -memmap2 = "0.9.5" +memmap2 = "0.9.7" obkv = "0.3.0" once_cell = "1.21.3" ordered-float = "5.0.0" From bdc2d1e64dbb04fd2bce2b099600e605c16b51f9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 21 Jul 2025 14:37:22 +0200 Subject: [PATCH 9/9] Move the edition 2024 dump parameter to the right place --- crates/index-scheduler/src/lib.rs | 6 ++-- crates/index-scheduler/src/test_utils.rs | 1 - .../src/analytics/segment_analytics.rs | 2 +- crates/meilisearch/src/lib.rs | 1 - crates/meilisearch/src/option.rs | 31 ++++++++++--------- crates/meilisearch/tests/common/server.rs | 1 + crates/milli/src/update/indexer_config.rs | 2 ++ 7 files changed, 24 insertions(+), 20 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 8715bc100..46566b9ba 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -139,8 +139,6 @@ pub struct IndexSchedulerOptions { pub embedding_cache_cap: usize, /// Snapshot compaction status. pub experimental_no_snapshot_compaction: bool, - /// Whether dump import uses the old document indexer or the new one. - pub experimental_no_edition_2024_for_dumps: bool, } /// Structure which holds meilisearch's indexes and schedules the tasks @@ -302,7 +300,9 @@ impl IndexScheduler { index_mapper, env, cleanup_enabled: options.cleanup_enabled, - experimental_no_edition_2024_for_dumps: options.experimental_no_edition_2024_for_dumps, + experimental_no_edition_2024_for_dumps: options + .indexer_config + .experimental_no_edition_2024_for_dumps, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs index 0a705b6c7..bfed7f53a 100644 --- a/crates/index-scheduler/src/test_utils.rs +++ b/crates/index-scheduler/src/test_utils.rs @@ -115,7 +115,6 @@ impl IndexScheduler { auto_upgrade: true, // Don't cost much and will ensure the happy path works embedding_cache_cap: 10, experimental_no_snapshot_compaction: false, - experimental_no_edition_2024_for_dumps: false, }; let version = configuration(&mut options).unwrap_or({ (versioning::VERSION_MAJOR, versioning::VERSION_MINOR, versioning::VERSION_PATCH) diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index a96ddf068..a2a0f0c05 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -254,7 +254,6 @@ impl Infos { experimental_limit_batched_tasks_total_size, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, - experimental_no_edition_2024_for_dumps, http_addr, master_key: _, env, @@ -295,6 +294,7 @@ impl Infos { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = indexer_options; let RuntimeTogglableFeatures { diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 13d2eb789..0fb93b65a 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -240,7 +240,6 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< auto_upgrade: opt.experimental_dumpless_upgrade, embedding_cache_cap: opt.experimental_embedding_cache_entries, experimental_no_snapshot_compaction: opt.experimental_no_snapshot_compaction, - experimental_no_edition_2024_for_dumps: opt.experimental_no_edition_2024_for_dumps, }; let binary_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH); diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 77106d362..dd77a1222 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -469,15 +469,6 @@ pub struct Opt { #[serde(default)] pub experimental_no_snapshot_compaction: bool, - /// Experimental make dump imports use the old document indexer. - /// - /// When enabled, Meilisearch will use the old document indexer when importing dumps. - /// - /// For more information, see . - #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)] - #[serde(default)] - pub experimental_no_edition_2024_for_dumps: bool, - #[serde(flatten)] #[clap(flatten)] pub indexer_options: IndexerOpts, @@ -583,7 +574,6 @@ impl Opt { experimental_limit_batched_tasks_total_size, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, - experimental_no_edition_2024_for_dumps, } = self; export_to_env_if_not_present(MEILI_DB_PATH, db_path); export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr); @@ -684,10 +674,6 @@ impl Opt { MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION, experimental_no_snapshot_compaction.to_string(), ); - export_to_env_if_not_present( - MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS, - experimental_no_edition_2024_for_dumps.to_string(), - ); indexer_options.export_to_env(); } @@ -775,6 +761,15 @@ pub struct IndexerOpts { #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)] #[serde(default)] pub experimental_no_edition_2024_for_settings: bool, + + /// Experimental make dump imports use the old document indexer. + /// + /// When enabled, Meilisearch will use the old document indexer when importing dumps. + /// + /// For more information, see . + #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)] + #[serde(default)] + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerOpts { @@ -785,6 +780,7 @@ impl IndexerOpts { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = self; if let Some(max_indexing_memory) = max_indexing_memory.0 { export_to_env_if_not_present( @@ -804,6 +800,12 @@ impl IndexerOpts { experimental_no_edition_2024_for_settings.to_string(), ); } + if experimental_no_edition_2024_for_dumps { + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS, + experimental_no_edition_2024_for_dumps.to_string(), + ); + } } } @@ -824,6 +826,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { skip_index_budget: other.skip_index_budget, experimental_no_edition_2024_for_settings: other .experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps: other.experimental_no_edition_2024_for_dumps, chunk_compression_type: Default::default(), chunk_compression_level: Default::default(), documents_chunk_size: Default::default(), diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 5f82bb380..ad0678122 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -466,6 +466,7 @@ pub fn default_settings(dir: impl AsRef) -> Opt { // Having 2 threads makes the tests way faster max_indexing_threads: MaxThreads::from_str("2").unwrap(), experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, }, experimental_enable_metrics: false, ..Parser::parse_from(None as Option<&str>) diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index a0f901818..845da5a51 100644 --- a/crates/milli/src/update/indexer_config.rs +++ b/crates/milli/src/update/indexer_config.rs @@ -16,6 +16,7 @@ pub struct IndexerConfig { pub max_positions_per_attributes: Option, pub skip_index_budget: bool, pub experimental_no_edition_2024_for_settings: bool, + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerConfig { @@ -65,6 +66,7 @@ impl Default for IndexerConfig { max_positions_per_attributes: None, skip_index_budget: false, experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, } } }