mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-27 00:31:02 +00:00
Fix new indexer
This commit is contained in:
@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
where
|
where
|
||||||
'a: 'doc,
|
'a: 'doc,
|
||||||
{
|
{
|
||||||
match &mut self.kind {
|
self.set_status(docid, old_is_user_provided, true, false, true);
|
||||||
ChunkType::Fragments { fragments: _, session } => {
|
|
||||||
let doc_alloc = session.doc_alloc();
|
|
||||||
|
|
||||||
if old_is_user_provided | full_reindex {
|
match &mut self.kind {
|
||||||
|
ChunkType::Fragments { fragments, session } => {
|
||||||
|
let doc_alloc = session.doc_alloc();
|
||||||
|
let reindex_all_fragments =
|
||||||
|
// when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment,
|
||||||
|
// and so Meilisearch needs to clear all embeddings in that case.
|
||||||
|
// Fortunately, as dumps export fragment vectors with `regenerate` set to `false`,
|
||||||
|
// this case should be rare and opt-in.
|
||||||
|
old_is_user_provided ||
|
||||||
|
// full-reindex case
|
||||||
|
full_reindex;
|
||||||
|
|
||||||
|
if reindex_all_fragments {
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
let extractors = fragments.iter().map(|fragment| {
|
||||||
|
RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors()
|
||||||
|
});
|
||||||
|
insert_autogenerated(
|
||||||
|
docid,
|
||||||
|
external_docid,
|
||||||
|
extractors,
|
||||||
|
document,
|
||||||
|
&(),
|
||||||
|
session,
|
||||||
|
unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
settings_delta.try_for_each_fragment_diff(
|
settings_delta.try_for_each_fragment_diff(
|
||||||
@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
Result::Ok(())
|
Result::Ok(())
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
self.set_status(docid, old_is_user_provided, true, false, true);
|
|
||||||
}
|
}
|
||||||
ChunkType::DocumentTemplate { document_template, session } => {
|
ChunkType::DocumentTemplate { document_template, session } => {
|
||||||
let doc_alloc = session.doc_alloc();
|
let doc_alloc = session.doc_alloc();
|
||||||
@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
|
|
||||||
match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? {
|
match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? {
|
||||||
ExtractorDiff::Removed => {
|
ExtractorDiff::Removed => {
|
||||||
|
if old_is_user_provided || full_reindex {
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
OnEmbed::process_embedding_response(
|
OnEmbed::process_embedding_response(
|
||||||
session.on_embed_mut(),
|
session.on_embed_mut(),
|
||||||
crate::vector::session::EmbeddingResponse { metadata, embedding: None },
|
crate::vector::session::EmbeddingResponse { metadata, embedding: None },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
|
ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
|
||||||
|
if old_is_user_provided || full_reindex {
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
session.request_embedding(metadata, input, unused_vectors_distribution)?;
|
session.request_embedding(metadata, input, unused_vectors_distribution)?;
|
||||||
}
|
}
|
||||||
ExtractorDiff::Unchanged => { /* do nothing */ }
|
ExtractorDiff::Unchanged => { /* do nothing */ }
|
||||||
@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
where
|
where
|
||||||
'a: 'doc,
|
'a: 'doc,
|
||||||
{
|
{
|
||||||
|
self.set_status(
|
||||||
|
docid,
|
||||||
|
old_is_user_provided,
|
||||||
|
old_must_regenerate,
|
||||||
|
false,
|
||||||
|
new_must_regenerate,
|
||||||
|
);
|
||||||
match &mut self.kind {
|
match &mut self.kind {
|
||||||
ChunkType::DocumentTemplate { document_template, session } => {
|
ChunkType::DocumentTemplate { document_template, session } => {
|
||||||
let doc_alloc = session.doc_alloc();
|
let doc_alloc = session.doc_alloc();
|
||||||
@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
);
|
);
|
||||||
|
|
||||||
if old_is_user_provided {
|
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
|
||||||
}
|
|
||||||
|
|
||||||
update_autogenerated(
|
update_autogenerated(
|
||||||
docid,
|
docid,
|
||||||
external_docid,
|
external_docid,
|
||||||
@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_document,
|
new_document,
|
||||||
&external_docid,
|
&external_docid,
|
||||||
old_must_regenerate,
|
old_must_regenerate,
|
||||||
|
old_is_user_provided,
|
||||||
session,
|
session,
|
||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
)?
|
)?
|
||||||
@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if old_is_user_provided {
|
if old_is_user_provided {
|
||||||
|
// when the document was `userProvided`, Meilisearch cannot know which fragment a particular
|
||||||
|
// vector was referring to.
|
||||||
|
// As a result, Meilisearch will regenerate all fragments in this case.
|
||||||
|
// Fortunately, since dumps for fragments set regenerate to false, this case should be rare.
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
insert_autogenerated(
|
||||||
|
docid,
|
||||||
|
external_docid,
|
||||||
|
extractors,
|
||||||
|
new_document,
|
||||||
|
&(),
|
||||||
|
session,
|
||||||
|
unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
update_autogenerated(
|
update_autogenerated(
|
||||||
@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_document,
|
new_document,
|
||||||
&(),
|
&(),
|
||||||
old_must_regenerate,
|
old_must_regenerate,
|
||||||
|
false,
|
||||||
session,
|
session,
|
||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
)?
|
)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
self.set_status(
|
|
||||||
docid,
|
|
||||||
old_is_user_provided,
|
|
||||||
old_must_regenerate,
|
|
||||||
false,
|
|
||||||
new_must_regenerate,
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub fn insert_autogenerated<D: Document<'a> + Debug>(
|
pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>(
|
||||||
&mut self,
|
&mut self,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: &'a str,
|
external_docid: &'a str,
|
||||||
@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
|
new_fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
|
||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
||||||
new_must_regenerate: bool,
|
new_must_regenerate: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
'a: 'doc,
|
||||||
|
{
|
||||||
let (default_is_user_provided, default_must_regenerate) = (false, true);
|
let (default_is_user_provided, default_must_regenerate) = (false, true);
|
||||||
self.set_status(
|
self.set_status(
|
||||||
docid,
|
docid,
|
||||||
@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>(
|
|||||||
new_document: ND,
|
new_document: ND,
|
||||||
meta: &E::DocumentMetadata,
|
meta: &E::DocumentMetadata,
|
||||||
old_must_regenerate: bool,
|
old_must_regenerate: bool,
|
||||||
|
mut must_clear_on_generation: bool,
|
||||||
session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
|
session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
|
||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
@ -984,6 +1027,11 @@ where
|
|||||||
};
|
};
|
||||||
|
|
||||||
if must_regenerate {
|
if must_regenerate {
|
||||||
|
if must_clear_on_generation {
|
||||||
|
must_clear_on_generation = false;
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
|
|
||||||
let metadata =
|
let metadata =
|
||||||
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
|
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
|
||||||
|
|
||||||
@ -1002,7 +1050,7 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>(
|
fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>(
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: &'a str,
|
external_docid: &'a str,
|
||||||
extractors: impl IntoIterator<Item = E>,
|
extractors: impl IntoIterator<Item = E>,
|
||||||
|
Reference in New Issue
Block a user