mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-27 00:31:02 +00:00
Merge pull request #5763 from meilisearch/embedding-fixes
Regenerate all fragments when coming from a user provided vector
This commit is contained in:
@ -5,6 +5,7 @@ use std::sync::atomic::Ordering;
|
|||||||
|
|
||||||
use dump::IndexMetadata;
|
use dump::IndexMetadata;
|
||||||
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||||
|
use meilisearch_types::milli::index::EmbeddingsWithMetadata;
|
||||||
use meilisearch_types::milli::progress::{Progress, VariableNameStep};
|
use meilisearch_types::milli::progress::{Progress, VariableNameStep};
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||||
use meilisearch_types::milli::{self};
|
use meilisearch_types::milli::{self};
|
||||||
@ -227,12 +228,21 @@ impl IndexScheduler {
|
|||||||
return Err(Error::from_milli(user_err, Some(uid.to_string())));
|
return Err(Error::from_milli(user_err, Some(uid.to_string())));
|
||||||
};
|
};
|
||||||
|
|
||||||
for (embedder_name, (embeddings, regenerate)) in embeddings {
|
for (
|
||||||
|
embedder_name,
|
||||||
|
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments },
|
||||||
|
) in embeddings
|
||||||
|
{
|
||||||
let embeddings = ExplicitVectors {
|
let embeddings = ExplicitVectors {
|
||||||
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
||||||
embeddings,
|
embeddings,
|
||||||
)),
|
)),
|
||||||
regenerate,
|
regenerate: regenerate &&
|
||||||
|
// Meilisearch does not handle dumps with fragments well, because as the fragments
|
||||||
|
// are marked as user-provided,
|
||||||
|
// all embeddings would be regenerated on any settings change or document update.
|
||||||
|
// To prevent this, we mark embeddings as non-regenerate in this case.
|
||||||
|
!has_fragments,
|
||||||
};
|
};
|
||||||
vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
|
vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,7 @@ use flate2::write::GzEncoder;
|
|||||||
use flate2::Compression;
|
use flate2::Compression;
|
||||||
use meilisearch_types::index_uid_pattern::IndexUidPattern;
|
use meilisearch_types::index_uid_pattern::IndexUidPattern;
|
||||||
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||||
|
use meilisearch_types::milli::index::EmbeddingsWithMetadata;
|
||||||
use meilisearch_types::milli::progress::{Progress, VariableNameStep};
|
use meilisearch_types::milli::progress::{Progress, VariableNameStep};
|
||||||
use meilisearch_types::milli::update::{request_threads, Setting};
|
use meilisearch_types::milli::update::{request_threads, Setting};
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||||
@ -229,12 +230,21 @@ impl IndexScheduler {
|
|||||||
));
|
));
|
||||||
};
|
};
|
||||||
|
|
||||||
for (embedder_name, (embeddings, regenerate)) in embeddings {
|
for (
|
||||||
|
embedder_name,
|
||||||
|
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments },
|
||||||
|
) in embeddings
|
||||||
|
{
|
||||||
let embeddings = ExplicitVectors {
|
let embeddings = ExplicitVectors {
|
||||||
embeddings: Some(
|
embeddings: Some(
|
||||||
VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
|
VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
|
||||||
),
|
),
|
||||||
regenerate,
|
regenerate: regenerate &&
|
||||||
|
// Meilisearch does not handle dumps with fragments well, because as the fragments
|
||||||
|
// are marked as user-provided,
|
||||||
|
// all embeddings would be regenerated on any settings change or document update.
|
||||||
|
// To prevent this, we mark embeddings as non-regenerate in this case.
|
||||||
|
!has_fragments,
|
||||||
};
|
};
|
||||||
vectors.insert(
|
vectors.insert(
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
@ -3,6 +3,7 @@ use std::collections::BTreeMap;
|
|||||||
use big_s::S;
|
use big_s::S;
|
||||||
use insta::assert_json_snapshot;
|
use insta::assert_json_snapshot;
|
||||||
use meili_snap::{json_string, snapshot};
|
use meili_snap::{json_string, snapshot};
|
||||||
|
use meilisearch_types::milli::index::EmbeddingsWithMetadata;
|
||||||
use meilisearch_types::milli::update::Setting;
|
use meilisearch_types::milli::update::Setting;
|
||||||
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
||||||
use meilisearch_types::milli::vector::SearchQuery;
|
use meilisearch_types::milli::vector::SearchQuery;
|
||||||
@ -220,8 +221,8 @@ fn import_vectors() {
|
|||||||
|
|
||||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||||
|
|
||||||
assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true");
|
assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == lab_embed, @"true");
|
||||||
assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true");
|
assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true");
|
||||||
|
|
||||||
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
|
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
@ -311,9 +312,9 @@ fn import_vectors() {
|
|||||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||||
|
|
||||||
// automatically changed to patou because set to regenerate
|
// automatically changed to patou because set to regenerate
|
||||||
assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true");
|
assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == patou_embed, @"true");
|
||||||
// remained beagle
|
// remained beagle
|
||||||
assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true");
|
assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true");
|
||||||
|
|
||||||
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
|
let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1;
|
||||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
@ -497,13 +498,13 @@ fn import_vectors_first_and_embedder_later() {
|
|||||||
|
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
let (embedding, _) = &embeddings["my_doggo_embedder"];
|
let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"];
|
||||||
assert!(!embedding.is_empty(), "{embedding:?}");
|
assert!(!embeddings.is_empty(), "{embeddings:?}");
|
||||||
|
|
||||||
// the document with the id 3 should keep its original embedding
|
// the document with the id 3 should keep its original embedding
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
let (embeddings, _) = &embeddings["my_doggo_embedder"];
|
let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"];
|
||||||
|
|
||||||
snapshot!(embeddings.len(), @"1");
|
snapshot!(embeddings.len(), @"1");
|
||||||
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
|
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
|
||||||
@ -558,7 +559,7 @@ fn import_vectors_first_and_embedder_later() {
|
|||||||
"###);
|
"###);
|
||||||
|
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
let (embedding, _) = &embeddings["my_doggo_embedder"];
|
let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"];
|
||||||
|
|
||||||
assert!(!embedding.is_empty());
|
assert!(!embedding.is_empty());
|
||||||
assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
|
assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
|
||||||
@ -566,7 +567,7 @@ fn import_vectors_first_and_embedder_later() {
|
|||||||
// the document with the id 4 should generate an embedding
|
// the document with the id 4 should generate an embedding
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
|
let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
let (embedding, _) = &embeddings["my_doggo_embedder"];
|
let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"];
|
||||||
|
|
||||||
assert!(!embedding.is_empty());
|
assert!(!embedding.is_empty());
|
||||||
}
|
}
|
||||||
@ -696,7 +697,7 @@ fn delete_document_containing_vector() {
|
|||||||
"###);
|
"###);
|
||||||
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
||||||
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
let (embedding, _) = &embeddings["manual"];
|
let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["manual"];
|
||||||
assert!(!embedding.is_empty(), "{embedding:?}");
|
assert!(!embedding.is_empty(), "{embedding:?}");
|
||||||
|
|
||||||
index_scheduler
|
index_scheduler
|
||||||
|
Binary file not shown.
@ -19,6 +19,7 @@ use meilisearch_types::error::{Code, ResponseError};
|
|||||||
use meilisearch_types::heed::RoTxn;
|
use meilisearch_types::heed::RoTxn;
|
||||||
use meilisearch_types::index_uid::IndexUid;
|
use meilisearch_types::index_uid::IndexUid;
|
||||||
use meilisearch_types::milli::documents::sort::recursive_sort;
|
use meilisearch_types::milli::documents::sort::recursive_sort;
|
||||||
|
use meilisearch_types::milli::index::EmbeddingsWithMetadata;
|
||||||
use meilisearch_types::milli::update::IndexDocumentsMethod;
|
use meilisearch_types::milli::update::IndexDocumentsMethod;
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
||||||
use meilisearch_types::milli::{AscDesc, DocumentId};
|
use meilisearch_types::milli::{AscDesc, DocumentId};
|
||||||
@ -1460,9 +1461,13 @@ fn some_documents<'a, 't: 'a>(
|
|||||||
Some(Value::Object(map)) => map,
|
Some(Value::Object(map)) => map,
|
||||||
_ => Default::default(),
|
_ => Default::default(),
|
||||||
};
|
};
|
||||||
for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? {
|
for (
|
||||||
|
name,
|
||||||
|
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ },
|
||||||
|
) in index.embeddings(rtxn, key)?
|
||||||
|
{
|
||||||
let embeddings =
|
let embeddings =
|
||||||
ExplicitVectors { embeddings: Some(vector.into()), regenerate };
|
ExplicitVectors { embeddings: Some(embeddings.into()), regenerate };
|
||||||
vectors.insert(
|
vectors.insert(
|
||||||
name,
|
name,
|
||||||
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
|
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
|
||||||
|
@ -16,7 +16,7 @@ use meilisearch_types::error::{Code, ResponseError};
|
|||||||
use meilisearch_types::heed::RoTxn;
|
use meilisearch_types::heed::RoTxn;
|
||||||
use meilisearch_types::index_uid::IndexUid;
|
use meilisearch_types::index_uid::IndexUid;
|
||||||
use meilisearch_types::locales::Locale;
|
use meilisearch_types::locales::Locale;
|
||||||
use meilisearch_types::milli::index::{self, SearchParameters};
|
use meilisearch_types::milli::index::{self, EmbeddingsWithMetadata, SearchParameters};
|
||||||
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
|
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
|
||||||
use meilisearch_types::milli::vector::Embedder;
|
use meilisearch_types::milli::vector::Embedder;
|
||||||
@ -1528,8 +1528,11 @@ impl<'a> HitMaker<'a> {
|
|||||||
Some(Value::Object(map)) => map,
|
Some(Value::Object(map)) => map,
|
||||||
_ => Default::default(),
|
_ => Default::default(),
|
||||||
};
|
};
|
||||||
for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? {
|
for (name, EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }) in
|
||||||
let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate };
|
self.index.embeddings(self.rtxn, id)?
|
||||||
|
{
|
||||||
|
let embeddings =
|
||||||
|
ExplicitVectors { embeddings: Some(embeddings.into()), regenerate };
|
||||||
vectors.insert(
|
vectors.insert(
|
||||||
name,
|
name,
|
||||||
serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
|
serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
|
||||||
|
@ -15,6 +15,7 @@ use meilisearch_types::heed::{
|
|||||||
};
|
};
|
||||||
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||||
|
use meilisearch_types::milli::index::EmbeddingsWithMetadata;
|
||||||
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||||
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
||||||
use meilisearch_types::tasks::{Status, Task};
|
use meilisearch_types::tasks::{Status, Task};
|
||||||
@ -591,12 +592,21 @@ fn export_documents(
|
|||||||
.into());
|
.into());
|
||||||
};
|
};
|
||||||
|
|
||||||
for (embedder_name, (embeddings, regenerate)) in embeddings {
|
for (
|
||||||
|
embedder_name,
|
||||||
|
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments },
|
||||||
|
) in embeddings
|
||||||
|
{
|
||||||
let embeddings = ExplicitVectors {
|
let embeddings = ExplicitVectors {
|
||||||
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
||||||
embeddings,
|
embeddings,
|
||||||
)),
|
)),
|
||||||
regenerate,
|
regenerate: regenerate &&
|
||||||
|
// Meilisearch does not handle dumps with fragments well, because as the fragments
|
||||||
|
// are marked as user-provided,
|
||||||
|
// all embeddings would be regenerated on any settings change or document update.
|
||||||
|
// To prevent this, we mark embeddings as non-regenerate in this case.
|
||||||
|
!has_fragments,
|
||||||
};
|
};
|
||||||
vectors
|
vectors
|
||||||
.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
|
.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
|
||||||
|
@ -1766,20 +1766,22 @@ impl Index {
|
|||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn<'_>,
|
rtxn: &RoTxn<'_>,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
) -> Result<BTreeMap<String, (Vec<Embedding>, bool)>> {
|
) -> Result<BTreeMap<String, EmbeddingsWithMetadata>> {
|
||||||
let mut res = BTreeMap::new();
|
let mut res = BTreeMap::new();
|
||||||
let embedders = self.embedding_configs();
|
let embedders = self.embedding_configs();
|
||||||
for config in embedders.embedding_configs(rtxn)? {
|
for config in embedders.embedding_configs(rtxn)? {
|
||||||
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
|
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
|
||||||
|
let has_fragments = config.config.embedder_options.has_fragments();
|
||||||
let reader = ArroyWrapper::new(
|
let reader = ArroyWrapper::new(
|
||||||
self.vector_arroy,
|
self.vector_arroy,
|
||||||
embedder_info.embedder_id,
|
embedder_info.embedder_id,
|
||||||
config.config.quantized(),
|
config.config.quantized(),
|
||||||
);
|
);
|
||||||
let embeddings = reader.item_vectors(rtxn, docid)?;
|
let embeddings = reader.item_vectors(rtxn, docid)?;
|
||||||
|
let regenerate = embedder_info.embedding_status.must_regenerate(docid);
|
||||||
res.insert(
|
res.insert(
|
||||||
config.name.to_owned(),
|
config.name.to_owned(),
|
||||||
(embeddings, embedder_info.embedding_status.must_regenerate(docid)),
|
EmbeddingsWithMetadata { embeddings, regenerate, has_fragments },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
Ok(res)
|
Ok(res)
|
||||||
@ -1919,6 +1921,12 @@ impl Index {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Embeddings read for one embedder of a document, bundled with the flags
/// that `Index::embeddings` computes alongside them.
pub struct EmbeddingsWithMetadata {
|
||||||
|
/// The vectors stored for the document under this embedder.
pub embeddings: Vec<Embedding>,
|
||||||
|
/// Whether the embedder's embedding status reports that this document must be regenerated.
pub regenerate: bool,
|
||||||
|
/// Whether the embedder's options report having fragments (see `EmbedderOptions::has_fragments`).
pub has_fragments: bool,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Default, Deserialize, Serialize)]
|
#[derive(Debug, Default, Deserialize, Serialize)]
|
||||||
pub struct ChatConfig {
|
pub struct ChatConfig {
|
||||||
pub description: String,
|
pub description: String,
|
||||||
|
@ -23,7 +23,7 @@ use crate::progress::EmbedderStats;
|
|||||||
use crate::prompt::Prompt;
|
use crate::prompt::Prompt;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta};
|
use crate::vector::db::{EmbedderInfo, EmbeddingStatusDelta};
|
||||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||||
use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor};
|
use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor};
|
||||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState};
|
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState};
|
||||||
@ -441,6 +441,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
{
|
{
|
||||||
let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_));
|
let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_));
|
||||||
|
|
||||||
|
let (old_is_user_provided, old_must_regenerate) =
|
||||||
|
embedder_info.embedding_status.is_user_provided_must_regenerate(docid);
|
||||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||||
let new_must_regenerate = new.must_regenerate();
|
let new_must_regenerate = new.must_regenerate();
|
||||||
let delta = match action {
|
let delta = match action {
|
||||||
@ -499,16 +501,19 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let is_adding_fragments = has_fragments && !old_has_fragments;
|
let is_adding_fragments = has_fragments && !old_has_fragments;
|
||||||
|
|
||||||
if is_adding_fragments {
|
if !has_fragments {
|
||||||
|
// removing fragments
|
||||||
|
regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
|
||||||
|
} else if is_adding_fragments ||
|
||||||
|
// regenerate all fragments when going from user provided to ! user provided
|
||||||
|
old_is_user_provided
|
||||||
|
{
|
||||||
regenerate_all_fragments(
|
regenerate_all_fragments(
|
||||||
runtime.fragments(),
|
runtime.fragments(),
|
||||||
&doc_alloc,
|
&doc_alloc,
|
||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
obkv,
|
obkv,
|
||||||
)
|
)
|
||||||
} else if !has_fragments {
|
|
||||||
// removing fragments
|
|
||||||
regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
|
|
||||||
} else {
|
} else {
|
||||||
let mut fragment_diff = Vec::new();
|
let mut fragment_diff = Vec::new();
|
||||||
let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map();
|
let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map();
|
||||||
@ -600,7 +605,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
docid,
|
docid,
|
||||||
&delta,
|
&delta,
|
||||||
new_must_regenerate,
|
new_must_regenerate,
|
||||||
&embedder_info.embedding_status,
|
old_is_user_provided,
|
||||||
|
old_must_regenerate,
|
||||||
);
|
);
|
||||||
|
|
||||||
// and we finally push the unique vectors into the writer
|
// and we finally push the unique vectors into the writer
|
||||||
@ -657,10 +663,9 @@ fn push_embedding_status_delta(
|
|||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
delta: &VectorStateDelta,
|
delta: &VectorStateDelta,
|
||||||
new_must_regenerate: bool,
|
new_must_regenerate: bool,
|
||||||
embedding_status: &EmbeddingStatus,
|
old_is_user_provided: bool,
|
||||||
|
old_must_regenerate: bool,
|
||||||
) {
|
) {
|
||||||
let (old_is_user_provided, old_must_regenerate) =
|
|
||||||
embedding_status.is_user_provided_must_regenerate(docid);
|
|
||||||
let new_is_user_provided = match delta {
|
let new_is_user_provided = match delta {
|
||||||
VectorStateDelta::NoChange => old_is_user_provided,
|
VectorStateDelta::NoChange => old_is_user_provided,
|
||||||
VectorStateDelta::NowRemoved => {
|
VectorStateDelta::NowRemoved => {
|
||||||
|
@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
where
|
where
|
||||||
'a: 'doc,
|
'a: 'doc,
|
||||||
{
|
{
|
||||||
match &mut self.kind {
|
self.set_status(docid, old_is_user_provided, true, false, true);
|
||||||
ChunkType::Fragments { fragments: _, session } => {
|
|
||||||
let doc_alloc = session.doc_alloc();
|
|
||||||
|
|
||||||
if old_is_user_provided | full_reindex {
|
match &mut self.kind {
|
||||||
|
ChunkType::Fragments { fragments, session } => {
|
||||||
|
let doc_alloc = session.doc_alloc();
|
||||||
|
let reindex_all_fragments =
|
||||||
|
// when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment,
|
||||||
|
// and so Meilisearch needs to clear all embeddings in that case.
|
||||||
|
// Fortunately, as dumps export fragment vectors with `regenerate` set to `false`,
|
||||||
|
// this case should be rare and opt-in.
|
||||||
|
old_is_user_provided ||
|
||||||
|
// full-reindex case
|
||||||
|
full_reindex;
|
||||||
|
|
||||||
|
if reindex_all_fragments {
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
let extractors = fragments.iter().map(|fragment| {
|
||||||
|
RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors()
|
||||||
|
});
|
||||||
|
insert_autogenerated(
|
||||||
|
docid,
|
||||||
|
external_docid,
|
||||||
|
extractors,
|
||||||
|
document,
|
||||||
|
&(),
|
||||||
|
session,
|
||||||
|
unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
settings_delta.try_for_each_fragment_diff(
|
settings_delta.try_for_each_fragment_diff(
|
||||||
@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
Result::Ok(())
|
Result::Ok(())
|
||||||
},
|
},
|
||||||
)?;
|
)?;
|
||||||
self.set_status(docid, old_is_user_provided, true, false, true);
|
|
||||||
}
|
}
|
||||||
ChunkType::DocumentTemplate { document_template, session } => {
|
ChunkType::DocumentTemplate { document_template, session } => {
|
||||||
let doc_alloc = session.doc_alloc();
|
let doc_alloc = session.doc_alloc();
|
||||||
@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
|
|
||||||
match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? {
|
match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? {
|
||||||
ExtractorDiff::Removed => {
|
ExtractorDiff::Removed => {
|
||||||
|
if old_is_user_provided || full_reindex {
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
OnEmbed::process_embedding_response(
|
OnEmbed::process_embedding_response(
|
||||||
session.on_embed_mut(),
|
session.on_embed_mut(),
|
||||||
crate::vector::session::EmbeddingResponse { metadata, embedding: None },
|
crate::vector::session::EmbeddingResponse { metadata, embedding: None },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
|
ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => {
|
||||||
|
if old_is_user_provided || full_reindex {
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
session.request_embedding(metadata, input, unused_vectors_distribution)?;
|
session.request_embedding(metadata, input, unused_vectors_distribution)?;
|
||||||
}
|
}
|
||||||
ExtractorDiff::Unchanged => { /* do nothing */ }
|
ExtractorDiff::Unchanged => { /* do nothing */ }
|
||||||
@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
where
|
where
|
||||||
'a: 'doc,
|
'a: 'doc,
|
||||||
{
|
{
|
||||||
|
self.set_status(
|
||||||
|
docid,
|
||||||
|
old_is_user_provided,
|
||||||
|
old_must_regenerate,
|
||||||
|
false,
|
||||||
|
new_must_regenerate,
|
||||||
|
);
|
||||||
match &mut self.kind {
|
match &mut self.kind {
|
||||||
ChunkType::DocumentTemplate { document_template, session } => {
|
ChunkType::DocumentTemplate { document_template, session } => {
|
||||||
let doc_alloc = session.doc_alloc();
|
let doc_alloc = session.doc_alloc();
|
||||||
@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_fields_ids_map,
|
new_fields_ids_map,
|
||||||
);
|
);
|
||||||
|
|
||||||
if old_is_user_provided {
|
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
|
||||||
}
|
|
||||||
|
|
||||||
update_autogenerated(
|
update_autogenerated(
|
||||||
docid,
|
docid,
|
||||||
external_docid,
|
external_docid,
|
||||||
@ -743,6 +774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_document,
|
new_document,
|
||||||
&external_docid,
|
&external_docid,
|
||||||
old_must_regenerate,
|
old_must_regenerate,
|
||||||
|
old_is_user_provided,
|
||||||
session,
|
session,
|
||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
)?
|
)?
|
||||||
@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
if old_is_user_provided {
|
if old_is_user_provided {
|
||||||
|
// when the document was `userProvided`, Meilisearch cannot know which fragment a particular
|
||||||
|
// vector was referring to.
|
||||||
|
// As a result, Meilisearch will regenerate all fragments in this case.
|
||||||
|
// Fortunately, since dumps for fragments set regenerate to false, this case should be rare.
|
||||||
session.on_embed_mut().clear_vectors(docid);
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
insert_autogenerated(
|
||||||
|
docid,
|
||||||
|
external_docid,
|
||||||
|
extractors,
|
||||||
|
new_document,
|
||||||
|
&(),
|
||||||
|
session,
|
||||||
|
unused_vectors_distribution,
|
||||||
|
)?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
update_autogenerated(
|
update_autogenerated(
|
||||||
@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_document,
|
new_document,
|
||||||
&(),
|
&(),
|
||||||
old_must_regenerate,
|
old_must_regenerate,
|
||||||
|
false,
|
||||||
session,
|
session,
|
||||||
unused_vectors_distribution,
|
unused_vectors_distribution,
|
||||||
)?
|
)?
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
self.set_status(
|
|
||||||
docid,
|
|
||||||
old_is_user_provided,
|
|
||||||
old_must_regenerate,
|
|
||||||
false,
|
|
||||||
new_must_regenerate,
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub fn insert_autogenerated<D: Document<'a> + Debug>(
|
pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>(
|
||||||
&mut self,
|
&mut self,
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: &'a str,
|
external_docid: &'a str,
|
||||||
@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
new_fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
|
new_fields_ids_map: &'a RefCell<crate::GlobalFieldsIdsMap>,
|
||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
||||||
new_must_regenerate: bool,
|
new_must_regenerate: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
'a: 'doc,
|
||||||
|
{
|
||||||
let (default_is_user_provided, default_must_regenerate) = (false, true);
|
let (default_is_user_provided, default_must_regenerate) = (false, true);
|
||||||
self.set_status(
|
self.set_status(
|
||||||
docid,
|
docid,
|
||||||
@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>(
|
|||||||
new_document: ND,
|
new_document: ND,
|
||||||
meta: &E::DocumentMetadata,
|
meta: &E::DocumentMetadata,
|
||||||
old_must_regenerate: bool,
|
old_must_regenerate: bool,
|
||||||
|
mut must_clear_on_generation: bool,
|
||||||
session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
|
session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>,
|
||||||
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>,
|
||||||
) -> Result<()>
|
) -> Result<()>
|
||||||
@ -984,6 +1027,11 @@ where
|
|||||||
};
|
};
|
||||||
|
|
||||||
if must_regenerate {
|
if must_regenerate {
|
||||||
|
if must_clear_on_generation {
|
||||||
|
must_clear_on_generation = false;
|
||||||
|
session.on_embed_mut().clear_vectors(docid);
|
||||||
|
}
|
||||||
|
|
||||||
let metadata =
|
let metadata =
|
||||||
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
|
Metadata { docid, external_docid, extractor_id: extractor.extractor_id() };
|
||||||
|
|
||||||
@ -1002,7 +1050,7 @@ where
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>(
|
fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>(
|
||||||
docid: DocumentId,
|
docid: DocumentId,
|
||||||
external_docid: &'a str,
|
external_docid: &'a str,
|
||||||
extractors: impl IntoIterator<Item = E>,
|
extractors: impl IntoIterator<Item = E>,
|
||||||
|
@ -841,6 +841,25 @@ impl EmbedderOptions {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when these options declare at least one indexing fragment.
///
/// Only REST embedders can report fragments here: directly, or as the `index`
/// sub-embedder of a composite embedder. Every other variant returns `false`.
pub fn has_fragments(&self) -> bool {
|
||||||
|
match &self {
|
||||||
|
// These variants always report no fragments.
EmbedderOptions::HuggingFace(_)
|
||||||
|
| EmbedderOptions::OpenAi(_)
|
||||||
|
| EmbedderOptions::Ollama(_)
|
||||||
|
| EmbedderOptions::UserProvided(_) => false,
|
||||||
|
EmbedderOptions::Rest(embedder_options) => {
|
||||||
|
!embedder_options.indexing_fragments.is_empty()
|
||||||
|
}
|
||||||
|
// For a composite embedder, only the `index` sub-embedder is inspected.
EmbedderOptions::Composite(embedder_options) => {
|
||||||
|
if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index {
|
||||||
|
!embedder_options.indexing_fragments.is_empty()
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for EmbedderOptions {
|
impl Default for EmbedderOptions {
|
||||||
|
Reference in New Issue
Block a user