From d161a0d7b45983f2c5de2383e8d3a43f105e5b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 12 Aug 2025 15:09:26 +0200 Subject: [PATCH] Make the VectorStore aware of the index version --- .../src/scheduler/process_batch.rs | 1 - crates/milli/src/index.rs | 11 ++++-- crates/milli/src/search/new/vector_sort.rs | 7 +++- crates/milli/src/search/similar.rs | 7 +++- .../milli/src/update/index_documents/mod.rs | 11 ++++-- .../src/update/index_documents/transform.rs | 10 ++++-- .../src/update/index_documents/typed_chunk.rs | 8 ++++- crates/milli/src/update/new/indexer/mod.rs | 35 +++++++++++++++---- .../milli/src/update/new/vector_document.rs | 9 +++-- crates/milli/src/update/upgrade/v1_18.rs | 4 ++- crates/milli/src/vector/mod.rs | 8 +++-- 11 files changed, 89 insertions(+), 22 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index c21ab27ad..7a1da362f 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -146,7 +146,6 @@ impl IndexScheduler { }; let mut index_wtxn = index.write_txn()?; - let index_version = index.get_version(&index_wtxn)?.unwrap_or((1, 12, 0)); let package_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH); if index_version != package_version { diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b636c3942..bef41e961 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1769,10 +1769,12 @@ impl Index { ) -> Result> { let mut res = BTreeMap::new(); let embedders = self.embedding_configs(); + let index_version = self.get_version(rtxn)?.unwrap(); for config in embedders.embedding_configs(rtxn)? { let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); let has_fragments = config.config.embedder_options.has_fragments(); let reader = VectorStore::new( + index_version, self.vector_store, embedder_info.embedder_id, config.config.quantized(), @@ -1795,10 +1797,15 @@ impl Index { pub fn hannoy_stats(&self, rtxn: &RoTxn<'_>) -> Result { let mut stats = HannoyStats::default(); let embedding_configs = self.embedding_configs(); + let index_version = self.get_version(rtxn)?.unwrap(); for config in embedding_configs.embedding_configs(rtxn)? { let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); - let reader = - VectorStore::new(self.vector_store, embedder_id, config.config.quantized()); + let reader = VectorStore::new( + index_version, + self.vector_store, + embedder_id, + config.config.quantized(), + ); reader.aggregate_stats(rtxn, &mut stats)?; } Ok(stats) diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index 284fcd431..fce3340c5 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -56,7 +56,12 @@ impl VectorSort { let target = &self.target; let before = Instant::now(); - let reader = VectorStore::new(ctx.index.vector_store, self.embedder_index, self.quantized); + let reader = VectorStore::new( + ctx.index.get_version(ctx.txn)?.unwrap(), + ctx.index.vector_store, + self.embedder_index, + self.quantized, + ); let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; self.cached_sorted_docids = results.into_iter(); *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats { diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 83e65fd6a..d4b45cd4e 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -72,7 +72,12 @@ impl<'a> Similar<'a> { crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) })?; - let reader = VectorStore::new(self.index.vector_store, embedder_index, self.quantized); + let reader = VectorStore::new( + self.index.get_version(self.rtxn)?.unwrap(), + self.index.vector_store, + embedder_index, + self.quantized, + ); let results = reader.nns_by_item( self.rtxn, self.id, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index b2ae1811a..0d2a712ac 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -485,6 +485,7 @@ where // If an embedder wasn't used in the typedchunk but must be binary quantized // we should insert it in `dimension` + let index_version = self.index.get_version(&self.wtxn)?.unwrap(); for (name, action) in settings_diff.embedding_config_updates.iter() { if action.is_being_quantized && !dimension.contains_key(name.as_str()) { let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or( @@ -493,7 +494,12 @@ where key: None, }, )?; - let reader = VectorStore::new(self.index.vector_store, index, action.was_quantized); + let reader = VectorStore::new( + index_version, + self.index.vector_store, + index, + action.was_quantized, + ); let Some(dim) = reader.dimensions(self.wtxn)? else { continue; }; @@ -522,7 +528,8 @@ where let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { - let mut writer = VectorStore::new(vector_hannoy, embedder_index, was_quantized); + let mut writer = + VectorStore::new(index_version, vector_hannoy, embedder_index, was_quantized); writer.build_and_quantize( wtxn, // In the settings we don't have any progress to share diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index 985f3a88f..a8e0d318c 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -834,6 +834,7 @@ impl<'a, 'i> Transform<'a, 'i> { None }; + let index_version = self.index.get_version(wtxn)?.unwrap(); let readers: BTreeMap<&str, (VectorStore, &RoaringBitmap)> = settings_diff .embedding_config_updates .iter() @@ -842,6 +843,7 @@ impl<'a, 'i> Transform<'a, 'i> { action.write_back() { let reader = VectorStore::new( + index_version, self.index.vector_store, *embedder_id, action.was_quantized, @@ -949,8 +951,12 @@ impl<'a, 'i> Transform<'a, 'i> { else { continue; }; - let hannoy = - VectorStore::new(self.index.vector_store, infos.embedder_id, was_quantized); + let hannoy = VectorStore::new( + index_version, + self.index.vector_store, + infos.embedder_id, + was_quantized, + ); let Some(dimensions) = hannoy.dimensions(wtxn)? else { continue; }; diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index fe5a8bde8..4efe6bde2 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -619,6 +619,7 @@ pub(crate) fn write_typed_chunk_into_index( let _entered = span.enter(); let embedders = index.embedding_configs(); + let index_version = index.get_version(wtxn)?.unwrap(); let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); @@ -677,7 +678,12 @@ pub(crate) fn write_typed_chunk_into_index( .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = VectorStore::new(index.vector_store, infos.embedder_id, binary_quantized); + let writer = VectorStore::new( + index_version, + index.vector_store, + infos.embedder_id, + binary_quantized, + ); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 3300a6ad5..edf940758 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -8,7 +8,7 @@ use document_changes::{DocumentChanges, IndexingContext}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; -use heed::RwTxn; +use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; pub use post_processing::recompute_word_fst_from_word_docids_database; pub use update_by_function::UpdateByFunction; @@ -130,6 +130,7 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let vector_arroy = index.vector_store; + let index_version = index.get_version(wtxn)?.unwrap(); let hannoy_writers: Result> = embedders .inner_as_ref() .iter() @@ -143,7 +144,12 @@ where })?; let dimensions = runtime.embedder.dimensions(); - let writer = VectorStore::new(vector_arroy, embedder_index, runtime.is_quantized); + let writer = VectorStore::new( + index_version, + vector_arroy, + embedder_index, + runtime.is_quantized, + ); Ok(( embedder_index, @@ -285,6 +291,7 @@ where let index_embedder_category_ids = settings_delta.new_embedder_category_id(); let mut hannoy_writers = hannoy_writers_from_embedder_actions( index, + wtxn, embedder_actions, new_embedders, index_embedder_category_ids, @@ -338,11 +345,13 @@ where fn hannoy_writers_from_embedder_actions<'indexer>( index: &Index, + rtxn: &RoTxn, embedder_actions: &'indexer BTreeMap, embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, ) -> Result> { let vector_arroy = index.vector_store; + let index_version = index.get_version(rtxn)?.unwrap(); embedders .inner_as_ref() @@ -360,8 +369,12 @@ fn hannoy_writers_from_embedder_actions<'indexer>( }, ))); }; - let writer = - VectorStore::new(vector_arroy, embedder_category_id, action.was_quantized); + let writer = VectorStore::new( + index_version, + vector_arroy, + embedder_category_id, + action.was_quantized, + ); let dimensions = runtime.embedder.dimensions(); Some(Ok(( embedder_category_id, @@ -384,7 +397,12 @@ where let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { continue; }; - let reader = VectorStore::new(index.vector_store, *embedder_id, action.was_quantized); + let reader = VectorStore::new( + index.get_version(wtxn)?.unwrap(), + index.vector_store, + *embedder_id, + action.was_quantized, + ); let Some(dimensions) = reader.dimensions(wtxn)? else { continue; }; @@ -400,7 +418,12 @@ where let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { continue; }; - let arroy = VectorStore::new(index.vector_store, infos.embedder_id, was_quantized); + let arroy = VectorStore::new( + index.get_version(wtxn)?.unwrap(), + index.vector_store, + infos.embedder_id, + was_quantized, + ); let Some(dimensions) = arroy.dimensions(wtxn)? else { continue; }; diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index d04f9bb79..76639ad31 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -120,8 +120,13 @@ impl<'t> VectorDocumentFromDb<'t> { config: &IndexEmbeddingConfig, status: &EmbeddingStatus, ) -> Result> { - let reader = - VectorStore::new(self.index.vector_store, embedder_id, config.config.quantized()); + let index_version = self.index.get_version(self.rtxn)?.unwrap(); + let reader = VectorStore::new( + index_version, + self.index.vector_store, + embedder_id, + config.config.quantized(), + ); let vectors = reader.item_vectors(self.rtxn, self.docid)?; Ok(VectorEntry { diff --git a/crates/milli/src/update/upgrade/v1_18.rs b/crates/milli/src/update/upgrade/v1_18.rs index ef46e1f5b..f2e44f0f3 100644 --- a/crates/milli/src/update/upgrade/v1_18.rs +++ b/crates/milli/src/update/upgrade/v1_18.rs @@ -17,11 +17,13 @@ impl UpgradeIndex for Latest_V1_17_To_V1_18_0 { progress: Progress, ) -> Result { let embedding_configs = index.embedding_configs(); + let index_version = index.get_version(wtxn)?.unwrap(); for config in embedding_configs.embedding_configs(wtxn)? { // TODO use the embedder name to display progress let quantized = config.config.quantized(); let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap(); - let vector_store = VectorStore::new(index.vector_store, embedder_id, quantized); + let vector_store = + VectorStore::new(index_version, index.vector_store, embedder_id, quantized); vector_store.convert_from_arroy(wtxn, progress.clone())?; } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 181a13e6a..7a78f15a4 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -47,18 +47,20 @@ const HANNOY_M: usize = 16; const HANNOY_M0: usize = 32; pub struct VectorStore { - quantized: bool, - embedder_index: u8, + version: (u32, u32, u32), database: hannoy::Database, + embedder_index: u8, + quantized: bool, } impl VectorStore { pub fn new( + version: (u32, u32, u32), database: hannoy::Database, embedder_index: u8, quantized: bool, ) -> Self { - Self { database, embedder_index, quantized } + Self { version, database, embedder_index, quantized } } pub fn embedder_index(&self) -> u8 {