diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4ca68027c..71fa9bf09 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { - match &mut self.kind { - ChunkType::Fragments { fragments: _, session } => { - let doc_alloc = session.doc_alloc(); + self.set_status(docid, old_is_user_provided, true, false, true); - if old_is_user_provided | full_reindex { + match &mut self.kind { + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let reindex_all_fragments = + // when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment, + // and so Meilisearch needs to clear all embeddings in that case. + // Fortunately, as dumps export fragment vectors with `regenerate` set to `false`, + // this case should be rare and opt-in. + old_is_user_provided || + // full-reindex case + full_reindex; + + if reindex_all_fragments { session.on_embed_mut().clear_vectors(docid); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + insert_autogenerated( + docid, + external_docid, + extractors, + document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } settings_delta.try_for_each_fragment_diff( @@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Result::Ok(()) }, )?; - self.set_status(docid, old_is_user_provided, true, false, true); } ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())?
{ ExtractorDiff::Removed => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } OnEmbed::process_embedding_response( session.on_embed_mut(), crate::vector::session::EmbeddingResponse { metadata, embedding: None }, ); } ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } session.request_embedding(metadata, input, unused_vectors_distribution)?; } ExtractorDiff::Unchanged => { /* do nothing */ } @@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + false, + new_must_regenerate, + ); match &mut self.kind { ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map, ); - if old_is_user_provided { - session.on_embed_mut().clear_vectors(docid); - } - update_autogenerated( docid, external_docid, @@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &external_docid, old_must_regenerate, + old_is_user_provided, session, unused_vectors_distribution, )? @@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { }); if old_is_user_provided { + // when the document was `userProvided`, Meilisearch cannot know which fragment a particular + // vector was referring to. + // As a result, Meilisearch will regenerate all fragments in this case. + // Fortunately, since dumps for fragments set regenerate to false, this case should be rare.
session.on_embed_mut().clear_vectors(docid); + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } update_autogenerated( @@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &(), old_must_regenerate, + false, session, unused_vectors_distribution, )? } }; - self.set_status( - docid, - old_is_user_provided, - old_must_regenerate, - false, - new_must_regenerate, - ); - Ok(()) } #[allow(clippy::too_many_arguments)] - pub fn insert_autogenerated + Debug>( + pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>( &mut self, docid: DocumentId, external_docid: &'a str, @@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map: &'a RefCell, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, new_must_regenerate: bool, - ) -> Result<()> { + ) -> Result<()> + where + 'a: 'doc, + { let (default_is_user_provided, default_must_regenerate) = (false, true); self.set_status( docid, @@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( new_document: ND, meta: &E::DocumentMetadata, old_must_regenerate: bool, + mut must_clear_on_generation: bool, session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, ) -> Result<()> @@ -984,6 +1027,11 @@ where }; if must_regenerate { + if must_clear_on_generation { + must_clear_on_generation = false; + session.on_embed_mut().clear_vectors(docid); + } + let metadata = Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; @@ -1002,7 +1050,7 @@ where Ok(()) } -fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( +fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>( docid: DocumentId, external_docid: &'a str, extractors: impl IntoIterator,