diff --git a/.gitignore b/.gitignore index 764447352..44cfa8f75 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ **/*.json_lines **/*.rs.bk /*.mdb -/data.ms +/*.ms /snapshots /dumps /bench diff --git a/Cargo.lock b/Cargo.lock index ceec0a05e..8413b3d14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3775,6 +3775,7 @@ dependencies = [ "meili-snap", "meilisearch-auth", "meilisearch-types", + "memmap2", "mimalloc", "mime", "mopa-maintained", @@ -3908,9 +3909,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.5" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28" dependencies = [ "libc", "stable_deref_trait", diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 68ed5aff4..f60f0979c 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -14,7 +14,7 @@ license.workspace = true anyhow = "1.0.98" bumpalo = "3.18.1" csv = "1.3.1" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } mimalloc = { version = "0.1.47", default-features = false } serde_json = { version = "1.0.140", features = ["preserve_order"] } @@ -55,4 +55,3 @@ harness = false [[bench]] name = "sort" harness = false - diff --git a/crates/dump/src/reader/compat/v1_to_v2.rs b/crates/dump/src/reader/compat/v1_to_v2.rs index 0d050497b..35d369c3a 100644 --- a/crates/dump/src/reader/compat/v1_to_v2.rs +++ b/crates/dump/src/reader/compat/v1_to_v2.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use super::v2_to_v3::CompatV2ToV3; @@ -94,6 +95,10 @@ impl CompatIndexV1ToV2 { self.from.documents().map(|it| Box::new(it) as Box>) } + pub fn documents_file(&self) -> &File { + self.from.documents_file() + } + pub fn settings(&mut self) -> Result> { Ok(v2::settings::Settings::::from(self.from.settings()?).check()) } diff --git a/crates/dump/src/reader/compat/v2_to_v3.rs b/crates/dump/src/reader/compat/v2_to_v3.rs index e7516e708..62326040e 100644 --- a/crates/dump/src/reader/compat/v2_to_v3.rs +++ b/crates/dump/src/reader/compat/v2_to_v3.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use time::OffsetDateTime; @@ -122,6 +123,13 @@ impl CompatIndexV2ToV3 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV2ToV3::V2(v2) => v2.documents_file(), + CompatIndexV2ToV3::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { let settings = match self { CompatIndexV2ToV3::V2(from) => from.settings()?, diff --git a/crates/dump/src/reader/compat/v3_to_v4.rs b/crates/dump/src/reader/compat/v3_to_v4.rs index 5bb70e9b2..1dba37771 100644 --- a/crates/dump/src/reader/compat/v3_to_v4.rs +++ b/crates/dump/src/reader/compat/v3_to_v4.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v2_to_v3::{CompatIndexV2ToV3, CompatV2ToV3}; use super::v4_to_v5::CompatV4ToV5; use crate::reader::{v3, v4, UpdateFile}; @@ -252,6 +254,13 @@ impl CompatIndexV3ToV4 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV3ToV4::V3(v3) => v3.documents_file(), + CompatIndexV3ToV4::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { Ok(match self { CompatIndexV3ToV4::V3(v3) => { diff --git a/crates/dump/src/reader/compat/v4_to_v5.rs b/crates/dump/src/reader/compat/v4_to_v5.rs index e52acb176..3f47b5b48 100644 --- a/crates/dump/src/reader/compat/v4_to_v5.rs +++ b/crates/dump/src/reader/compat/v4_to_v5.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v3_to_v4::{CompatIndexV3ToV4, CompatV3ToV4}; use super::v5_to_v6::CompatV5ToV6; use crate::reader::{v4, v5, Document}; @@ -241,6 +243,13 @@ impl CompatIndexV4ToV5 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV4ToV5::V4(v4) => v4.documents_file(), + CompatIndexV4ToV5::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV4ToV5::V4(v4) => Ok(v5::Settings::from(v4.settings()?).check()), diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index f7bda81c6..f173bb6bd 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::num::NonZeroUsize; use std::str::FromStr; @@ -243,6 +244,13 @@ impl CompatIndexV5ToV6 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV5ToV6::V5(v5) => v5.documents_file(), + CompatIndexV5ToV6::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV5ToV6::V5(v5) => Ok(v6::Settings::from(v5.settings()?).check()), diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 23e7eec9e..c894c255f 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -192,6 +192,14 @@ impl DumpIndexReader { } } + /// A reference to a file in the NDJSON format containing all the documents of the index + pub fn documents_file(&self) -> &File { + match self { + DumpIndexReader::Current(v6) => v6.documents_file(), + DumpIndexReader::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { DumpIndexReader::Current(v6) => v6.settings(), diff --git a/crates/dump/src/reader/v1/mod.rs b/crates/dump/src/reader/v1/mod.rs index ac7324d9a..d86ede62c 100644 --- a/crates/dump/src/reader/v1/mod.rs +++ b/crates/dump/src/reader/v1/mod.rs @@ -72,6 +72,10 @@ impl V1IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result { Ok(serde_json::from_reader(&mut self.settings)?) } diff --git a/crates/dump/src/reader/v2/mod.rs b/crates/dump/src/reader/v2/mod.rs index 14a643c2d..a74687381 100644 --- a/crates/dump/src/reader/v2/mod.rs +++ b/crates/dump/src/reader/v2/mod.rs @@ -203,6 +203,10 @@ impl V2IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v3/mod.rs b/crates/dump/src/reader/v3/mod.rs index 920e1dc6e..5f89eb861 100644 --- a/crates/dump/src/reader/v3/mod.rs +++ b/crates/dump/src/reader/v3/mod.rs @@ -215,6 +215,10 @@ impl V3IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v4/mod.rs b/crates/dump/src/reader/v4/mod.rs index 585786ae4..16a1e27c2 100644 --- a/crates/dump/src/reader/v4/mod.rs +++ b/crates/dump/src/reader/v4/mod.rs @@ -210,6 +210,10 @@ impl V4IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v5/mod.rs b/crates/dump/src/reader/v5/mod.rs index dfbc6346c..0123db433 100644 --- a/crates/dump/src/reader/v5/mod.rs +++ b/crates/dump/src/reader/v5/mod.rs @@ -247,6 +247,10 @@ impl V5IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 449a7e5fe..08d4700e5 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -284,6 +284,10 @@ impl V6IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { let mut settings: Settings = serde_json::from_reader(&mut self.settings)?; patch_embedders(&mut settings); diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index de0d01935..20cc49686 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -26,7 +26,7 @@ flate2 = "1.1.2" indexmap = "2.9.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } -memmap2 = "0.9.5" +memmap2 = "0.9.7" page_size = "0.6.0" rayon = "1.10.0" roaring = { version = "0.10.12", features = ["serde"] } diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 0cbbb2514..32ce131b5 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -20,6 +20,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let IndexScheduler { cleanup_enabled: _, + experimental_no_edition_2024_for_dumps: _, processing_tasks, env, version, diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index b2f27d66b..46566b9ba 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -168,6 +168,9 @@ pub struct IndexScheduler { /// Whether we should automatically cleanup the task queue or not. pub(crate) cleanup_enabled: bool, + /// Whether we should use the old document indexer or the new one. + pub(crate) experimental_no_edition_2024_for_dumps: bool, + /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, /// The Authorization header to send to the webhook URL. @@ -210,6 +213,7 @@ impl IndexScheduler { index_mapper: self.index_mapper.clone(), cleanup_enabled: self.cleanup_enabled, + experimental_no_edition_2024_for_dumps: self.experimental_no_edition_2024_for_dumps, webhook_url: self.webhook_url.clone(), webhook_authorization_header: self.webhook_authorization_header.clone(), embedders: self.embedders.clone(), @@ -296,6 +300,9 @@ impl IndexScheduler { index_mapper, env, cleanup_enabled: options.cleanup_enabled, + experimental_no_edition_2024_for_dumps: options + .indexer_config + .experimental_no_edition_2024_for_dumps, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), @@ -594,6 +601,11 @@ impl IndexScheduler { Ok(nbr_index_processing_tasks > 0) } + /// Whether the index should use the old document indexer. + pub fn no_edition_2024_for_dumps(&self) -> bool { + self.experimental_no_edition_2024_for_dumps + } + /// Return the tasks matching the query from the user's point of view along /// with the total number of tasks matching the query, ignoring from and limit. /// diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index b8d100415..b14f23d0b 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -5,6 +5,7 @@ use std::sync::atomic::Ordering; use dump::IndexMetadata; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self}; @@ -227,12 +228,21 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index a951a7ca6..0cd06f2e4 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -9,6 +9,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; @@ -229,12 +230,21 @@ impl IndexScheduler { )); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert( embedder_name, diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index a9b920bd2..791fed4d8 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,6 +3,7 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::milli::vector::SearchQuery; @@ -220,8 +221,8 @@ fn import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -311,9 +312,9 @@ fn import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -497,13 +498,13 @@ fn import_vectors_first_and_embedder_later() { let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; - assert!(!embedding.is_empty(), "{embedding:?}"); + let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"]; + assert!(!embeddings.is_empty(), "{embeddings:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embeddings, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -558,7 +559,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -566,7 +567,7 @@ fn import_vectors_first_and_embedder_later() { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -696,7 +697,7 @@ fn delete_document_containing_vector() { "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["manual"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index faf59643f..f3279a094 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ enum-iterator = "2.1.0" file-store = { path = "../file-store" } flate2 = "1.1.2" fst = "0.4.7" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } roaring = { version = "0.10.12", features = ["serde"] } rustc-hash = "2.1.1" diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index e210f8df3..aec3199a3 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -233,9 +233,6 @@ pub enum Action { #[serde(rename = "*")] #[deserr(rename = "*")] All = 0, - #[serde(rename = "*.get")] - #[deserr(rename = "*.get")] - AllGet, #[serde(rename = "search")] #[deserr(rename = "search")] Search, @@ -365,6 +362,9 @@ pub enum Action { #[serde(rename = "chatsSettings.update")] #[deserr(rename = "chatsSettings.update")] ChatsSettingsUpdate, + #[serde(rename = "*.get")] + #[deserr(rename = "*.get")] + AllGet, } impl Action { @@ -403,6 +403,7 @@ impl Action { METRICS_GET => Some(Self::MetricsGet), DUMPS_ALL => Some(Self::DumpsAll), DUMPS_CREATE => Some(Self::DumpsCreate), + SNAPSHOTS_ALL => Some(Self::SnapshotsAll), SNAPSHOTS_CREATE => Some(Self::SnapshotsCreate), VERSION => Some(Self::Version), KEYS_CREATE => Some(Self::KeysAdd), @@ -411,8 +412,10 @@ impl Action { KEYS_DELETE => Some(Self::KeysDelete), EXPERIMENTAL_FEATURES_GET => Some(Self::ExperimentalFeaturesGet), EXPERIMENTAL_FEATURES_UPDATE => Some(Self::ExperimentalFeaturesUpdate), + EXPORT => Some(Self::Export), NETWORK_GET => Some(Self::NetworkGet), NETWORK_UPDATE => Some(Self::NetworkUpdate), + ALL_GET => Some(Self::AllGet), _otherwise => None, } } @@ -497,6 +500,7 @@ pub mod actions { pub const METRICS_GET: u8 = MetricsGet.repr(); pub const DUMPS_ALL: u8 = DumpsAll.repr(); pub const DUMPS_CREATE: u8 = DumpsCreate.repr(); + pub const SNAPSHOTS_ALL: u8 = SnapshotsAll.repr(); pub const SNAPSHOTS_CREATE: u8 = SnapshotsCreate.repr(); pub const VERSION: u8 = Version.repr(); pub const KEYS_CREATE: u8 = KeysAdd.repr(); @@ -519,3 +523,68 @@ pub mod actions { pub const CHATS_SETTINGS_GET: u8 = ChatsSettingsGet.repr(); pub const CHATS_SETTINGS_UPDATE: u8 = ChatsSettingsUpdate.repr(); } + +#[cfg(test)] +pub(crate) mod test { + use super::actions::*; + use super::Action::*; + use super::*; + + #[test] + fn test_action_repr_and_constants() { + assert!(All.repr() == 0 && ALL == 0); + assert!(Search.repr() == 1 && SEARCH == 1); + assert!(DocumentsAll.repr() == 2 && DOCUMENTS_ALL == 2); + assert!(DocumentsAdd.repr() == 3 && DOCUMENTS_ADD == 3); + assert!(DocumentsGet.repr() == 4 && DOCUMENTS_GET == 4); + assert!(DocumentsDelete.repr() == 5 && DOCUMENTS_DELETE == 5); + assert!(IndexesAll.repr() == 6 && INDEXES_ALL == 6); + assert!(IndexesAdd.repr() == 7 && INDEXES_CREATE == 7); + assert!(IndexesGet.repr() == 8 && INDEXES_GET == 8); + assert!(IndexesUpdate.repr() == 9 && INDEXES_UPDATE == 9); + assert!(IndexesDelete.repr() == 10 && INDEXES_DELETE == 10); + assert!(IndexesSwap.repr() == 11 && INDEXES_SWAP == 11); + assert!(TasksAll.repr() == 12 && TASKS_ALL == 12); + assert!(TasksCancel.repr() == 13 && TASKS_CANCEL == 13); + assert!(TasksDelete.repr() == 14 && TASKS_DELETE == 14); + assert!(TasksGet.repr() == 15 && TASKS_GET == 15); + assert!(SettingsAll.repr() == 16 && SETTINGS_ALL == 16); + assert!(SettingsGet.repr() == 17 && SETTINGS_GET == 17); + assert!(SettingsUpdate.repr() == 18 && SETTINGS_UPDATE == 18); + assert!(StatsAll.repr() == 19 && STATS_ALL == 19); + assert!(StatsGet.repr() == 20 && STATS_GET == 20); + assert!(MetricsAll.repr() == 21 && METRICS_ALL == 21); + assert!(MetricsGet.repr() == 22 && METRICS_GET == 22); + assert!(DumpsAll.repr() == 23 && DUMPS_ALL == 23); + assert!(DumpsCreate.repr() == 24 && DUMPS_CREATE == 24); + assert!(SnapshotsAll.repr() == 25 && SNAPSHOTS_ALL == 25); + assert!(SnapshotsCreate.repr() == 26 && SNAPSHOTS_CREATE == 26); + assert!(Version.repr() == 27 && VERSION == 27); + assert!(KeysAdd.repr() == 28 && KEYS_CREATE == 28); + assert!(KeysGet.repr() == 29 && KEYS_GET == 29); + assert!(KeysUpdate.repr() == 30 && KEYS_UPDATE == 30); + assert!(KeysDelete.repr() == 31 && KEYS_DELETE == 31); + assert!(ExperimentalFeaturesGet.repr() == 32 && EXPERIMENTAL_FEATURES_GET == 32); + assert!(ExperimentalFeaturesUpdate.repr() == 33 && EXPERIMENTAL_FEATURES_UPDATE == 33); + assert!(Export.repr() == 34 && EXPORT == 34); + assert!(NetworkGet.repr() == 35 && NETWORK_GET == 35); + assert!(NetworkUpdate.repr() == 36 && NETWORK_UPDATE == 36); + assert!(ChatCompletions.repr() == 37 && CHAT_COMPLETIONS == 37); + assert!(ChatsAll.repr() == 38 && CHATS_ALL == 38); + assert!(ChatsGet.repr() == 39 && CHATS_GET == 39); + assert!(ChatsDelete.repr() == 40 && CHATS_DELETE == 40); + assert!(ChatsSettingsAll.repr() == 41 && CHATS_SETTINGS_ALL == 41); + assert!(ChatsSettingsGet.repr() == 42 && CHATS_SETTINGS_GET == 42); + assert!(ChatsSettingsUpdate.repr() == 43 && CHATS_SETTINGS_UPDATE == 43); + assert!(AllGet.repr() == 44 && ALL_GET == 44); + } + + #[test] + fn test_from_repr() { + for action in enum_iterator::all::() { + let repr = action.repr(); + let action_from_repr = Action::from_repr(repr); + assert_eq!(Some(action), action_from_repr, "Failed for action: {:?}", action); + } + } +} diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 83eb439d9..9da316b84 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -50,6 +50,7 @@ jsonwebtoken = "9.3.1" lazy_static = "1.5.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.7" mimalloc = { version = "0.1.47", default-features = false } mime = "0.3.17" num_cpus = "1.17.0" @@ -169,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.20/build.zip" -sha1 = "82a7ddd7bf14bb5323c3d235d2b62892a98b6a59" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.21/build.zip" +sha1 = "94f56a8e24e2e3a1bc1bd7d9ceaa23464a5e241a" diff --git a/crates/meilisearch/db.snapshot b/crates/meilisearch/db.snapshot deleted file mode 100644 index 29377ce42..000000000 Binary files a/crates/meilisearch/db.snapshot and /dev/null differ diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 0abc5c817..a2a0f0c05 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -203,6 +203,7 @@ struct Infos { experimental_composite_embedders: bool, experimental_embedding_cache_entries: usize, experimental_no_snapshot_compaction: bool, + experimental_no_edition_2024_for_dumps: bool, experimental_no_edition_2024_for_settings: bool, gpu_enabled: bool, db_path: bool, @@ -293,6 +294,7 @@ impl Infos { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = indexer_options; let RuntimeTogglableFeatures { @@ -329,6 +331,7 @@ impl Infos { experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 43d7afe0e..0fb93b65a 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -30,6 +30,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest}; use analytics::Analytics; use anyhow::bail; +use bumpalo::Bump; use error::PayloadError; use extractors::payload::PayloadConfig; use index_scheduler::versioning::Versioning; @@ -38,6 +39,7 @@ use meilisearch_auth::{open_auth_store_env, AuthController}; use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use meilisearch_types::milli::progress::{EmbedderStats, Progress}; +use meilisearch_types::milli::update::new::indexer; use meilisearch_types::milli::update::{ default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, }; @@ -533,7 +535,7 @@ fn import_dump( let mut index_reader = index_reader?; let metadata = index_reader.metadata(); let uid = metadata.uid.clone(); - tracing::info!("Importing index `{}`.", metadata.uid); + tracing::info!("Importing index `{uid}`."); let date = Some((metadata.created_at, metadata.updated_at)); let index = index_scheduler.create_raw_index(&metadata.uid, date)?; @@ -552,48 +554,100 @@ fn import_dump( apply_settings_to_builder(&settings, &mut builder); let embedder_stats: Arc = Default::default(); builder.execute(&|| false, &progress, embedder_stats.clone())?; + wtxn.commit()?; - // 5.3 Import the documents. - // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. - tracing::info!("Importing the documents."); - let file = tempfile::tempfile()?; - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); - for document in index_reader.documents()? { - builder.append_json_object(&document?)?; + let mut wtxn = index.write_txn()?; + let rtxn = index.read_txn()?; + + if index_scheduler.no_edition_2024_for_dumps() { + // 5.3 Import the documents. + // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. + tracing::info!("Importing the documents."); + let file = tempfile::tempfile()?; + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); + for document in index_reader.documents()? { + builder.append_json_object(&document?)?; + } + + // This flush the content of the batch builder. + let file = builder.into_inner()?.into_inner()?; + + // 5.3.2 We feed it to the milli index. + let reader = BufReader::new(file); + let reader = DocumentsBatchReader::from_reader(reader)?; + + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; + let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; + + let builder = milli::update::IndexDocuments::new( + &mut wtxn, + &index, + indexer_config, + IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }, + |indexing_step| tracing::trace!("update: {:?}", indexing_step), + || false, + &embedder_stats, + )?; + + let builder = builder.with_embedders(embedders); + + let (builder, user_result) = builder.add_documents(reader)?; + let user_result = user_result?; + tracing::info!(documents_found = user_result, "{} documents found.", user_result); + builder.execute()?; + } else { + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let primary_key = index.primary_key(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut indexer = indexer::DocumentOperation::new(); + let embedders = index.embedding_configs().embedding_configs(&rtxn)?; + let embedders = index_scheduler.embedders(uid.clone(), embedders)?; + + let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? }; + + indexer.replace_documents(&mmap)?; + + let indexer_config = index_scheduler.indexer_config(); + let pool = &indexer_config.thread_pool; + + let indexer_alloc = Bump::new(); + let (document_changes, mut operation_stats, primary_key) = indexer.into_changes( + &indexer_alloc, + &index, + &rtxn, + primary_key, + &mut new_fields_ids_map, + &|| false, // never stop processing a dump + progress.clone(), + )?; + + let operation_stats = operation_stats.pop().unwrap(); + if let Some(error) = operation_stats.error { + return Err(error.into()); + } + + let _congestion = indexer::index( + &mut wtxn, + &index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| false, // never stop processing a dump + &progress, + &embedder_stats, + )?; } - // This flush the content of the batch builder. - let file = builder.into_inner()?.into_inner()?; - - // 5.3.2 We feed it to the milli index. - let reader = BufReader::new(file); - let reader = DocumentsBatchReader::from_reader(reader)?; - - let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; - - let builder = milli::update::IndexDocuments::new( - &mut wtxn, - &index, - indexer_config, - IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }, - |indexing_step| tracing::trace!("update: {:?}", indexing_step), - || false, - &embedder_stats, - )?; - - let builder = builder.with_embedders(embedders); - - let (builder, user_result) = builder.add_documents(reader)?; - let user_result = user_result?; - tracing::info!(documents_found = user_result, "{} documents found.", user_result); - builder.execute()?; wtxn.commit()?; tracing::info!("All documents successfully imported."); - index_scheduler.refresh_index_stats(&uid)?; } diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs index d52e04cc6..607bc91eb 100644 --- a/crates/meilisearch/src/metrics.rs +++ b/crates/meilisearch/src/metrics.rs @@ -15,30 +15,33 @@ lazy_static! { "Meilisearch number of degraded search requests" )) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_SEARCH_REQUESTS: IntCounterVec = register_int_counter_vec!( + pub static ref MEILISEARCH_CHAT_SEARCHES_TOTAL: IntCounterVec = register_int_counter_vec!( opts!( - "meilisearch_chat_search_requests", - "Meilisearch number of search requests performed by the chat route itself" + "meilisearch_chat_searches_total", + "Total number of searches performed by the chat route" ), &["type"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( - opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"), + pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( + opts!("meilisearch_chat_prompt_tokens_total", "Total number of prompt tokens consumed"), &["workspace", "model"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE: IntCounterVec = + pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( opts!( - "meilisearch_chat_completion_tokens_usage", - "Meilisearch Chat Completion Tokens Usage" + "meilisearch_chat_completion_tokens_total", + "Total number of completion tokens consumed" ), &["workspace", "model"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( - opts!("meilisearch_chat_total_tokens_usage", "Meilisearch Chat Total Tokens Usage"), + pub static ref MEILISEARCH_CHAT_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( + opts!( + "meilisearch_chat_tokens_total", + "Total number of tokens consumed (prompt + completion)" + ), &["workspace", "model"] ) .expect("Can't create a metric"); diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 9658352c8..dd77a1222 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -68,6 +68,8 @@ const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str = const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str = "MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES"; const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION"; +const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS: &str = + "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; const DEFAULT_HTTP_ADDR: &str = "localhost:7700"; @@ -759,6 +761,15 @@ pub struct IndexerOpts { #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)] #[serde(default)] pub experimental_no_edition_2024_for_settings: bool, + + /// Experimental make dump imports use the old document indexer. + /// + /// When enabled, Meilisearch will use the old document indexer when importing dumps. + /// + /// For more information, see . + #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)] + #[serde(default)] + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerOpts { @@ -769,6 +780,7 @@ impl IndexerOpts { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = self; if let Some(max_indexing_memory) = max_indexing_memory.0 { export_to_env_if_not_present( @@ -788,6 +800,12 @@ impl IndexerOpts { experimental_no_edition_2024_for_settings.to_string(), ); } + if experimental_no_edition_2024_for_dumps { + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS, + experimental_no_edition_2024_for_dumps.to_string(), + ); + } } } @@ -808,6 +826,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { skip_index_budget: other.skip_index_budget, experimental_no_edition_2024_for_settings: other .experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps: other.experimental_no_edition_2024_for_dumps, chunk_compression_type: Default::default(), chunk_compression_level: Default::default(), documents_chunk_size: Default::default(), diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index b636678f5..f2c17a696 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -50,8 +50,8 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; use crate::metrics::{ - MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, - MEILISEARCH_CHAT_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE, + MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL, MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL, + MEILISEARCH_CHAT_SEARCHES_TOTAL, MEILISEARCH_CHAT_TOKENS_TOTAL, MEILISEARCH_DEGRADED_SEARCH_REQUESTS, }; use crate::routes::chats::utils::SseEventSender; @@ -319,7 +319,7 @@ async fn process_search_request( }; let mut documents = Vec::new(); if let Ok((ref rtxn, ref search_result)) = output { - MEILISEARCH_CHAT_SEARCH_REQUESTS.with_label_values(&["internal"]).inc(); + MEILISEARCH_CHAT_SEARCHES_TOTAL.with_label_values(&["internal"]).inc(); if search_result.degraded { MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } @@ -596,13 +596,13 @@ async fn run_conversation( match result { Ok(resp) => { if let Some(usage) = resp.usage.as_ref() { - MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE + MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.prompt_tokens as u64); - MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE + MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.completion_tokens as u64); - MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE + MEILISEARCH_CHAT_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.total_tokens as u64); } diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index cf6181de5..5ced4603e 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -19,6 +19,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::documents::sort::recursive_sort; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::{AscDesc, DocumentId}; @@ -1470,9 +1471,13 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? { + for ( + name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }, + ) in index.embeddings(rtxn, key)? + { let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 586dea83f..7bcc8a9f8 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -16,7 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; -use meilisearch_types::milli::index::{self, SearchParameters}; +use meilisearch_types::milli::index::{self, EmbeddingsWithMetadata, SearchParameters}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; @@ -1528,8 +1528,11 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? { - let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + for (name, EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }) in + self.index.embeddings(self.rtxn, id)? + { + let embeddings = + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 0b8a3d2c5..6dc3f429b 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r#" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `*.get`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`, `*.get`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index e8d935fde..b16ccb2f5 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `*.get`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`, `*.get`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 5f82bb380..ad0678122 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -466,6 +466,7 @@ pub fn default_settings(dir: impl AsRef) -> Opt { // Having 2 threads makes the tests way faster max_indexing_threads: MaxThreads::from_str("2").unwrap(), experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, }, experimental_enable_metrics: false, ..Parser::parse_from(None as Option<&str>) diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index b967e620c..170bbdcc8 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -15,6 +15,7 @@ use meilisearch_types::heed::{ }; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; @@ -591,12 +592,21 @@ fn export_documents( .into()); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 3d08252ac..d94a4d4e1 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -40,7 +40,7 @@ indexmap = { version = "2.9.0", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memchr = "2.7.5" -memmap2 = "0.9.5" +memmap2 = "0.9.7" obkv = "0.3.0" once_cell = "1.21.3" ordered-float = "5.0.0" diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b2ec992ba..9f32fdb04 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1766,20 +1766,22 @@ impl Index { &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result, bool)>> { + ) -> Result> { let mut res = BTreeMap::new(); let embedders = self.embedding_configs(); for config in embedders.embedding_configs(rtxn)? { let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let has_fragments = config.config.embedder_options.has_fragments(); let reader = ArroyWrapper::new( self.vector_arroy, embedder_info.embedder_id, config.config.quantized(), ); let embeddings = reader.item_vectors(rtxn, docid)?; + let regenerate = embedder_info.embedding_status.must_regenerate(docid); res.insert( config.name.to_owned(), - (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, ); } Ok(res) @@ -1919,6 +1921,12 @@ impl Index { } } +pub struct EmbeddingsWithMetadata { + pub embeddings: Vec, + pub regenerate: bool, + pub has_fragments: bool, +} + #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, diff --git a/crates/milli/src/update/chat.rs b/crates/milli/src/update/chat.rs index 2f364894d..a6c0b3fbc 100644 --- a/crates/milli/src/update/chat.rs +++ b/crates/milli/src/update/chat.rs @@ -93,7 +93,7 @@ pub struct ChatSearchParams { pub hybrid: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default = Setting::Set(20))] + #[deserr(default)] #[schema(value_type = Option)] pub limit: Setting, diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 064cfd154..a1dfa1aad 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -23,7 +23,7 @@ use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; +use crate::vector::db::{EmbedderInfo, EmbeddingStatusDelta}; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState}; @@ -441,6 +441,8 @@ pub fn extract_vector_points( { let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); + let (old_is_user_provided, old_must_regenerate) = + embedder_info.embedding_status.is_user_provided_must_regenerate(docid); let (old, new) = parsed_vectors.remove(embedder_name); let new_must_regenerate = new.must_regenerate(); let delta = match action { @@ -499,16 +501,19 @@ pub fn extract_vector_points( let is_adding_fragments = has_fragments && !old_has_fragments; - if is_adding_fragments { + if !has_fragments { + // removing fragments + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? + } else if is_adding_fragments || + // regenerate all fragments when going from user provided to ! user provided + old_is_user_provided + { regenerate_all_fragments( runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, ) - } else if !has_fragments { - // removing fragments - regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? } else { let mut fragment_diff = Vec::new(); let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); @@ -600,7 +605,8 @@ pub fn extract_vector_points( docid, &delta, new_must_regenerate, - &embedder_info.embedding_status, + old_is_user_provided, + old_must_regenerate, ); // and we finally push the unique vectors into the writer @@ -657,10 +663,9 @@ fn push_embedding_status_delta( docid: DocumentId, delta: &VectorStateDelta, new_must_regenerate: bool, - embedding_status: &EmbeddingStatus, + old_is_user_provided: bool, + old_must_regenerate: bool, ) { - let (old_is_user_provided, old_must_regenerate) = - embedding_status.is_user_provided_must_regenerate(docid); let new_is_user_provided = match delta { VectorStateDelta::NoChange => old_is_user_provided, VectorStateDelta::NowRemoved => { diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index a0f901818..845da5a51 100644 --- a/crates/milli/src/update/indexer_config.rs +++ b/crates/milli/src/update/indexer_config.rs @@ -16,6 +16,7 @@ pub struct IndexerConfig { pub max_positions_per_attributes: Option, pub skip_index_budget: bool, pub experimental_no_edition_2024_for_settings: bool, + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerConfig { @@ -65,6 +66,7 @@ impl Default for IndexerConfig { max_positions_per_attributes: None, skip_index_budget: false, experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, } } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4ca68027c..71fa9bf09 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { - match &mut self.kind { - ChunkType::Fragments { fragments: _, session } => { - let doc_alloc = session.doc_alloc(); + self.set_status(docid, old_is_user_provided, true, false, true); - if old_is_user_provided | full_reindex { + match &mut self.kind { + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let reindex_all_fragments = + // when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment, + // and so Meilisearch needs to clear all embeddings in that case. + // Fortunately, as dump export fragment vector with `regenerate` set to `false`, + // this case should be rare and opt-in. + old_is_user_provided || + // full-reindex case + full_reindex; + + if reindex_all_fragments { session.on_embed_mut().clear_vectors(docid); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + insert_autogenerated( + docid, + external_docid, + extractors, + document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } settings_delta.try_for_each_fragment_diff( @@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Result::Ok(()) }, )?; - self.set_status(docid, old_is_user_provided, true, false, true); } ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { ExtractorDiff::Removed => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } OnEmbed::process_embedding_response( session.on_embed_mut(), crate::vector::session::EmbeddingResponse { metadata, embedding: None }, ); } ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } session.request_embedding(metadata, input, unused_vectors_distribution)?; } ExtractorDiff::Unchanged => { /* do nothing */ } @@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + false, + new_must_regenerate, + ); match &mut self.kind { ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map, ); - if old_is_user_provided { - session.on_embed_mut().clear_vectors(docid); - } - update_autogenerated( docid, external_docid, @@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &external_docid, old_must_regenerate, + old_is_user_provided, session, unused_vectors_distribution, )? @@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { }); if old_is_user_provided { + // when the document was `userProvided`, Meilisearch cannot know whose fragments a particular + // vector was referring to. + // So as a result Meilisearch will regenerate all fragments on this case. + // Fortunately, since dumps for fragments set regenerate to false, this case should be rare. session.on_embed_mut().clear_vectors(docid); + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } update_autogenerated( @@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &(), old_must_regenerate, + false, session, unused_vectors_distribution, )? } }; - self.set_status( - docid, - old_is_user_provided, - old_must_regenerate, - false, - new_must_regenerate, - ); - Ok(()) } #[allow(clippy::too_many_arguments)] - pub fn insert_autogenerated + Debug>( + pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>( &mut self, docid: DocumentId, external_docid: &'a str, @@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map: &'a RefCell, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, new_must_regenerate: bool, - ) -> Result<()> { + ) -> Result<()> + where + 'a: 'doc, + { let (default_is_user_provided, default_must_regenerate) = (false, true); self.set_status( docid, @@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( new_document: ND, meta: &E::DocumentMetadata, old_must_regenerate: bool, + mut must_clear_on_generation: bool, session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, ) -> Result<()> @@ -984,6 +1027,11 @@ where }; if must_regenerate { + if must_clear_on_generation { + must_clear_on_generation = false; + session.on_embed_mut().clear_vectors(docid); + } + let metadata = Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; @@ -1002,7 +1050,7 @@ where Ok(()) } -fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( +fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>( docid: DocumentId, external_docid: &'a str, extractors: impl IntoIterator, diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 088d98b72..1f07f6c4f 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -835,6 +835,25 @@ impl EmbedderOptions { } } } + + pub fn has_fragments(&self) -> bool { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => false, + EmbedderOptions::Rest(embedder_options) => { + !embedder_options.indexing_fragments.is_empty() + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + !embedder_options.indexing_fragments.is_empty() + } else { + false + } + } + } + } } impl Default for EmbedderOptions {