diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index b8d100415..b14f23d0b 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -5,6 +5,7 @@ use std::sync::atomic::Ordering; use dump::IndexMetadata; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self}; @@ -227,12 +228,21 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index a951a7ca6..0cd06f2e4 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -9,6 +9,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; @@ -229,12 +230,21 @@ impl IndexScheduler { )); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert( embedder_name, diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index a9b920bd2..791fed4d8 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,6 +3,7 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::milli::vector::SearchQuery; @@ -220,8 +221,8 @@ fn import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -311,9 +312,9 @@ fn import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -497,13 +498,13 @@ fn import_vectors_first_and_embedder_later() { let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; - assert!(!embedding.is_empty(), "{embedding:?}"); + let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"]; + assert!(!embeddings.is_empty(), "{embeddings:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embeddings, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -558,7 +559,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -566,7 +567,7 @@ fn import_vectors_first_and_embedder_later() { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -696,7 +697,7 @@ fn delete_document_containing_vector() { "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["manual"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler diff --git a/crates/meilisearch/db.snapshot b/crates/meilisearch/db.snapshot deleted file mode 100644 index 29377ce42..000000000 Binary files a/crates/meilisearch/db.snapshot and /dev/null differ diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 947cd153f..138f5140f 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -19,6 +19,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::documents::sort::recursive_sort; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::{AscDesc, DocumentId}; @@ -1460,9 +1461,13 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? { + for ( + name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }, + ) in index.embeddings(rtxn, key)? + { let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 93efad67f..82096e7b4 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -16,7 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; -use meilisearch_types::milli::index::{self, SearchParameters}; +use meilisearch_types::milli::index::{self, EmbeddingsWithMetadata, SearchParameters}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; @@ -1528,8 +1528,11 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? { - let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + for (name, EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }) in + self.index.embeddings(self.rtxn, id)? + { + let embeddings = + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index b967e620c..170bbdcc8 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -15,6 +15,7 @@ use meilisearch_types::heed::{ }; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; @@ -591,12 +592,21 @@ fn export_documents( .into()); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b2ec992ba..9f32fdb04 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1766,20 +1766,22 @@ impl Index { &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result, bool)>> { + ) -> Result> { let mut res = BTreeMap::new(); let embedders = self.embedding_configs(); for config in embedders.embedding_configs(rtxn)? { let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let has_fragments = config.config.embedder_options.has_fragments(); let reader = ArroyWrapper::new( self.vector_arroy, embedder_info.embedder_id, config.config.quantized(), ); let embeddings = reader.item_vectors(rtxn, docid)?; + let regenerate = embedder_info.embedding_status.must_regenerate(docid); res.insert( config.name.to_owned(), - (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, ); } Ok(res) @@ -1919,6 +1921,12 @@ impl Index { } } +pub struct EmbeddingsWithMetadata { + pub embeddings: Vec, + pub regenerate: bool, + pub has_fragments: bool, +} + #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 064cfd154..a1dfa1aad 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -23,7 +23,7 @@ use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; +use crate::vector::db::{EmbedderInfo, EmbeddingStatusDelta}; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState}; @@ -441,6 +441,8 @@ pub fn extract_vector_points( { let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); + let (old_is_user_provided, old_must_regenerate) = + embedder_info.embedding_status.is_user_provided_must_regenerate(docid); let (old, new) = parsed_vectors.remove(embedder_name); let new_must_regenerate = new.must_regenerate(); let delta = match action { @@ -499,16 +501,19 @@ pub fn extract_vector_points( let is_adding_fragments = has_fragments && !old_has_fragments; - if is_adding_fragments { + if !has_fragments { + // removing fragments + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? + } else if is_adding_fragments || + // regenerate all fragments when going from user provided to ! user provided + old_is_user_provided + { regenerate_all_fragments( runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, ) - } else if !has_fragments { - // removing fragments - regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? } else { let mut fragment_diff = Vec::new(); let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); @@ -600,7 +605,8 @@ pub fn extract_vector_points( docid, &delta, new_must_regenerate, - &embedder_info.embedding_status, + old_is_user_provided, + old_must_regenerate, ); // and we finally push the unique vectors into the writer @@ -657,10 +663,9 @@ fn push_embedding_status_delta( docid: DocumentId, delta: &VectorStateDelta, new_must_regenerate: bool, - embedding_status: &EmbeddingStatus, + old_is_user_provided: bool, + old_must_regenerate: bool, ) { - let (old_is_user_provided, old_must_regenerate) = - embedding_status.is_user_provided_must_regenerate(docid); let new_is_user_provided = match delta { VectorStateDelta::NoChange => old_is_user_provided, VectorStateDelta::NowRemoved => { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4ca68027c..71fa9bf09 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { - match &mut self.kind { - ChunkType::Fragments { fragments: _, session } => { - let doc_alloc = session.doc_alloc(); + self.set_status(docid, old_is_user_provided, true, false, true); - if old_is_user_provided | full_reindex { + match &mut self.kind { + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let reindex_all_fragments = + // when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment, + // and so Meilisearch needs to clear all embeddings in that case. + // Fortunately, as dump export fragment vector with `regenerate` set to `false`, + // this case should be rare and opt-in. + old_is_user_provided || + // full-reindex case + full_reindex; + + if reindex_all_fragments { session.on_embed_mut().clear_vectors(docid); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + insert_autogenerated( + docid, + external_docid, + extractors, + document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } settings_delta.try_for_each_fragment_diff( @@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Result::Ok(()) }, )?; - self.set_status(docid, old_is_user_provided, true, false, true); } ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { ExtractorDiff::Removed => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } OnEmbed::process_embedding_response( session.on_embed_mut(), crate::vector::session::EmbeddingResponse { metadata, embedding: None }, ); } ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } session.request_embedding(metadata, input, unused_vectors_distribution)?; } ExtractorDiff::Unchanged => { /* do nothing */ } @@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + false, + new_must_regenerate, + ); match &mut self.kind { ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map, ); - if old_is_user_provided { - session.on_embed_mut().clear_vectors(docid); - } - update_autogenerated( docid, external_docid, @@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &external_docid, old_must_regenerate, + old_is_user_provided, session, unused_vectors_distribution, )? @@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { }); if old_is_user_provided { + // when the document was `userProvided`, Meilisearch cannot know whose fragments a particular + // vector was referring to. + // So as a result Meilisearch will regenerate all fragments on this case. + // Fortunately, since dumps for fragments set regenerate to false, this case should be rare. session.on_embed_mut().clear_vectors(docid); + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } update_autogenerated( @@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &(), old_must_regenerate, + false, session, unused_vectors_distribution, )? } }; - self.set_status( - docid, - old_is_user_provided, - old_must_regenerate, - false, - new_must_regenerate, - ); - Ok(()) } #[allow(clippy::too_many_arguments)] - pub fn insert_autogenerated + Debug>( + pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>( &mut self, docid: DocumentId, external_docid: &'a str, @@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map: &'a RefCell, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, new_must_regenerate: bool, - ) -> Result<()> { + ) -> Result<()> + where + 'a: 'doc, + { let (default_is_user_provided, default_must_regenerate) = (false, true); self.set_status( docid, @@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( new_document: ND, meta: &E::DocumentMetadata, old_must_regenerate: bool, + mut must_clear_on_generation: bool, session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, ) -> Result<()> @@ -984,6 +1027,11 @@ where }; if must_regenerate { + if must_clear_on_generation { + must_clear_on_generation = false; + session.on_embed_mut().clear_vectors(docid); + } + let metadata = Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; @@ -1002,7 +1050,7 @@ where Ok(()) } -fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( +fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>( docid: DocumentId, external_docid: &'a str, extractors: impl IntoIterator, diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index f64223e41..873693a34 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -841,6 +841,25 @@ impl EmbedderOptions { } } } + + pub fn has_fragments(&self) -> bool { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => false, + EmbedderOptions::Rest(embedder_options) => { + !embedder_options.indexing_fragments.is_empty() + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + !embedder_options.indexing_fragments.is_empty() + } else { + false + } + } + } + } } impl Default for EmbedderOptions {