From 324666759032683e164e0dab8e1f52d57cb875bf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 21 Jul 2025 15:09:31 +0200 Subject: [PATCH] when exporting vectors, for regenerate to false when the embedder has fragments --- .../src/scheduler/process_dump_creation.rs | 14 ++++++++++++-- .../src/scheduler/process_export.rs | 14 ++++++++++++-- crates/meilitool/src/main.rs | 14 ++++++++++++-- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index b8d100415..b14f23d0b 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -5,6 +5,7 @@ use std::sync::atomic::Ordering; use dump::IndexMetadata; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self}; @@ -227,12 +228,21 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index a951a7ca6..0cd06f2e4 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -9,6 +9,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; @@ -229,12 +230,21 @@ impl IndexScheduler { )); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors.insert( embedder_name, diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index b967e620c..170bbdcc8 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -15,6 +15,7 @@ use meilisearch_types::heed::{ }; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; @@ -591,12 +592,21 @@ fn export_documents( .into()); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap());