Merge pull request #5763 from meilisearch/embedding-fixes

Regenerate all fragments when coming from a user provided vector
This commit is contained in:
Louis Dureuil
2025-07-22 08:35:07 +00:00
committed by GitHub
11 changed files with 171 additions and 52 deletions

View File

@ -1766,20 +1766,22 @@ impl Index {
&self,
rtxn: &RoTxn<'_>,
docid: DocumentId,
) -> Result<BTreeMap<String, (Vec<Embedding>, bool)>> {
) -> Result<BTreeMap<String, EmbeddingsWithMetadata>> {
let mut res = BTreeMap::new();
let embedders = self.embedding_configs();
for config in embedders.embedding_configs(rtxn)? {
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
let has_fragments = config.config.embedder_options.has_fragments();
let reader = ArroyWrapper::new(
self.vector_arroy,
embedder_info.embedder_id,
config.config.quantized(),
);
let embeddings = reader.item_vectors(rtxn, docid)?;
let regenerate = embedder_info.embedding_status.must_regenerate(docid);
res.insert(
config.name.to_owned(),
(embeddings, embedder_info.embedding_status.must_regenerate(docid)),
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments },
);
}
Ok(res)
@ -1919,6 +1921,12 @@ impl Index {
}
}
/// The embeddings stored for a single embedder of a document, bundled with
/// the flags that accompany them.
///
/// One value of this type is produced per configured embedder and stored in a
/// map keyed by the embedder name.
pub struct EmbeddingsWithMetadata {
/// The vectors read from the vector store for the document and embedder.
pub embeddings: Vec<Embedding>,
/// The embedder's embedding status `must_regenerate` value for the document.
// NOTE(review): presumably `true` means Meilisearch regenerates these vectors
// itself when the document changes — confirm against `EmbeddingStatus`.
pub regenerate: bool,
/// Whether the embedder's options declare indexing fragments
/// (the result of `EmbedderOptions::has_fragments`).
pub has_fragments: bool,
}
#[derive(Debug, Default, Deserialize, Serialize)]
pub struct ChatConfig {
pub description: String,

View File

@ -23,7 +23,7 @@ use crate::progress::EmbedderStats;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta};
use crate::vector::db::{EmbedderInfo, EmbeddingStatusDelta};
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor};
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState};
@ -441,6 +441,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
{
let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_));
let (old_is_user_provided, old_must_regenerate) =
embedder_info.embedding_status.is_user_provided_must_regenerate(docid);
let (old, new) = parsed_vectors.remove(embedder_name);
let new_must_regenerate = new.must_regenerate();
let delta = match action {
@ -499,16 +501,19 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let is_adding_fragments = has_fragments && !old_has_fragments;
if is_adding_fragments {
if !has_fragments {
// removing fragments
regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
} else if is_adding_fragments ||
// regenerate all fragments when going from user provided to ! user provided
old_is_user_provided
{
regenerate_all_fragments(
runtime.fragments(),
&doc_alloc,
new_fields_ids_map,
obkv,
)
} else if !has_fragments {
// removing fragments
regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
} else {
let mut fragment_diff = Vec::new();
let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map();
@ -600,7 +605,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
docid,
&delta,
new_must_regenerate,
&embedder_info.embedding_status,
old_is_user_provided,
old_must_regenerate,
);
// and we finally push the unique vectors into the writer
@ -657,10 +663,9 @@ fn push_embedding_status_delta(
docid: DocumentId,
delta: &VectorStateDelta,
new_must_regenerate: bool,
embedding_status: &EmbeddingStatus,
old_is_user_provided: bool,
old_must_regenerate: bool,
) {
let (old_is_user_provided, old_must_regenerate) =
embedding_status.is_user_provided_must_regenerate(docid);
let new_is_user_provided = match delta {
VectorStateDelta::NoChange => old_is_user_provided,
VectorStateDelta::NowRemoved => {

View File

@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
where
'a: 'doc,
{
match &mut self.kind {
ChunkType::Fragments { fragments: _, session } => {
let doc_alloc = session.doc_alloc();
self.set_status(docid, old_is_user_provided, true, false, true);
if old_is_user_provided | full_reindex {
match &mut self.kind {
ChunkType::Fragments { fragments, session } => {
let doc_alloc = session.doc_alloc();
let reindex_all_fragments =
// when the vectors were user-provided, Meilisearch cannot know whether they come from a particular fragment,
// so it needs to clear all embeddings in that case.
// Fortunately, as dumps export fragment vectors with `regenerate` set to `false`,
// this case should be rare and opt-in.
old_is_user_provided ||
// full-reindex case
full_reindex;
if reindex_all_fragments {
session.on_embed_mut().clear_vectors(docid);
let extractors = fragments.iter().map(|fragment| {
RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors()
});
insert_autogenerated(
docid,
external_docid,
extractors,
document,
&(),
session,
unused_vectors_distribution,
)?;
return Ok(());
}
settings_delta.try_for_each_fragment_diff(
@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
Result::Ok(())
},
)?;
self.set_status(docid, old_is_user_provided, true, false, true);
}
ChunkType::DocumentTemplate { document_template, session } => {
let doc_alloc = session.doc_alloc();
@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? {
ExtractorDiff::Removed => {
if old_is_user_provided || full_reindex {
session.on_embed_mut().clear_vectors(docid);
}
OnEmbed::process_embedding_response(
session.on_embed_mut(),
crate::vector::session::EmbeddingResponse { metadata, embedding: None },
);
}
ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
if old_is_user_provided || full_reindex {
session.on_embed_mut().clear_vectors(docid);
}
session.request_embedding(metadata, input, unused_vectors_distribution)?;
}
ExtractorDiff::Unchanged => { /* do nothing */ }
@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
where
'a: 'doc,
{
self.set_status(
docid,
old_is_user_provided,
old_must_regenerate,
false,
new_must_regenerate,
);
match &mut self.kind {
ChunkType::DocumentTemplate { document_template, session } => {
let doc_alloc = session.doc_alloc();
@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
new_fields_ids_map,
);
if old_is_user_provided {
session.on_embed_mut().clear_vectors(docid);
}
update_autogenerated(
docid,
external_docid,
@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
new_document,
&external_docid,
old_must_regenerate,
old_is_user_provided,
session,
unused_vectors_distribution,
)?
@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
});
if old_is_user_provided {
// when the document was `userProvided`, Meilisearch cannot know which fragment a particular
// vector was referring to.
// As a result, Meilisearch will regenerate all fragments in this case.
// Fortunately, since dumps for fragments set regenerate to false, this case should be rare.
session.on_embed_mut().clear_vectors(docid);
insert_autogenerated(
docid,
external_docid,
extractors,
new_document,
&(),
session,
unused_vectors_distribution,
)?;
return Ok(());
}
update_autogenerated(
@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
new_document,
&(),
old_must_regenerate,
false,
session,
unused_vectors_distribution,
)?
}
};
self.set_status(
docid,
old_is_user_provided,
old_must_regenerate,
false,
new_must_regenerate,
);
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub fn insert_autogenerated<D: Document<'a> + Debug>(
pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>(
&mut self,
docid: DocumentId,
external_docid: &'a str,
@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
new_fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
new_must_regenerate: bool,
) -> Result<()> {
) -> Result<()>
where
'a: 'doc,
{
let (default_is_user_provided, default_must_regenerate) = (false, true);
self.set_status(
docid,
@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>(
new_document: ND,
meta: &E::DocumentMetadata,
old_must_regenerate: bool,
mut must_clear_on_generation: bool,
session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
) -> Result<()>
@ -984,6 +1027,11 @@ where
};
if must_regenerate {
if must_clear_on_generation {
must_clear_on_generation = false;
session.on_embed_mut().clear_vectors(docid);
}
let metadata =
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
@ -1002,7 +1050,7 @@ where
Ok(())
}
fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>(
fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>(
docid: DocumentId,
external_docid: &'a str,
extractors: impl IntoIterator<Item = E>,

View File

@ -841,6 +841,25 @@ impl EmbedderOptions {
}
}
}
/// Returns `true` when this embedder is configured with at least one
/// indexing fragment.
///
/// Only a REST embedder can report fragments, whether it is used directly or
/// as the `index` sub-embedder of a composite embedder. Every other kind of
/// embedder yields `false`.
pub fn has_fragments(&self) -> bool {
    match self {
        EmbedderOptions::Rest(rest) => !rest.indexing_fragments.is_empty(),
        EmbedderOptions::Composite(composite) => match &composite.index {
            // Only the indexing side of a composite embedder is inspected.
            SubEmbedderOptions::Rest(rest) => !rest.indexing_fragments.is_empty(),
            _ => false,
        },
        EmbedderOptions::HuggingFace(_)
        | EmbedderOptions::OpenAi(_)
        | EmbedderOptions::Ollama(_)
        | EmbedderOptions::UserProvided(_) => false,
    }
}
}
impl Default for EmbedderOptions {