Make the VectorStore aware of the index version

This commit is contained in:
Clément Renault
2025-08-12 15:09:26 +02:00
parent 2a01504ba0
commit d161a0d7b4
11 changed files with 89 additions and 22 deletions

View File

@ -146,7 +146,6 @@ impl IndexScheduler {
};
let mut index_wtxn = index.write_txn()?;
let index_version = index.get_version(&index_wtxn)?.unwrap_or((1, 12, 0));
let package_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH);
if index_version != package_version {

View File

@ -1769,10 +1769,12 @@ impl Index {
) -> Result<BTreeMap<String, EmbeddingsWithMetadata>> {
let mut res = BTreeMap::new();
let embedders = self.embedding_configs();
let index_version = self.get_version(rtxn)?.unwrap();
for config in embedders.embedding_configs(rtxn)? {
let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
let has_fragments = config.config.embedder_options.has_fragments();
let reader = VectorStore::new(
index_version,
self.vector_store,
embedder_info.embedder_id,
config.config.quantized(),
@ -1795,10 +1797,15 @@ impl Index {
pub fn hannoy_stats(&self, rtxn: &RoTxn<'_>) -> Result<HannoyStats> {
let mut stats = HannoyStats::default();
let embedding_configs = self.embedding_configs();
let index_version = self.get_version(rtxn)?.unwrap();
for config in embedding_configs.embedding_configs(rtxn)? {
let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
let reader =
VectorStore::new(self.vector_store, embedder_id, config.config.quantized());
let reader = VectorStore::new(
index_version,
self.vector_store,
embedder_id,
config.config.quantized(),
);
reader.aggregate_stats(rtxn, &mut stats)?;
}
Ok(stats)

View File

@ -56,7 +56,12 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
let target = &self.target;
let before = Instant::now();
let reader = VectorStore::new(ctx.index.vector_store, self.embedder_index, self.quantized);
let reader = VectorStore::new(
ctx.index.get_version(ctx.txn)?.unwrap(),
ctx.index.vector_store,
self.embedder_index,
self.quantized,
);
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
self.cached_sorted_docids = results.into_iter();
*ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats {

View File

@ -72,7 +72,12 @@ impl<'a> Similar<'a> {
crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
})?;
let reader = VectorStore::new(self.index.vector_store, embedder_index, self.quantized);
let reader = VectorStore::new(
self.index.get_version(self.rtxn)?.unwrap(),
self.index.vector_store,
embedder_index,
self.quantized,
);
let results = reader.nns_by_item(
self.rtxn,
self.id,

View File

@ -485,6 +485,7 @@ where
// If an embedder wasn't used in the typedchunk but must be binary quantized
// we should insert it in `dimension`
let index_version = self.index.get_version(&self.wtxn)?.unwrap();
for (name, action) in settings_diff.embedding_config_updates.iter() {
if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or(
@ -493,7 +494,12 @@ where
key: None,
},
)?;
let reader = VectorStore::new(self.index.vector_store, index, action.was_quantized);
let reader = VectorStore::new(
index_version,
self.index.vector_store,
index,
action.was_quantized,
);
let Some(dim) = reader.dimensions(self.wtxn)? else {
continue;
};
@ -522,7 +528,8 @@ where
let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
pool.install(|| {
let mut writer = VectorStore::new(vector_hannoy, embedder_index, was_quantized);
let mut writer =
VectorStore::new(index_version, vector_hannoy, embedder_index, was_quantized);
writer.build_and_quantize(
wtxn,
// In the settings we don't have any progress to share

View File

@ -834,6 +834,7 @@ impl<'a, 'i> Transform<'a, 'i> {
None
};
let index_version = self.index.get_version(wtxn)?.unwrap();
let readers: BTreeMap<&str, (VectorStore, &RoaringBitmap)> = settings_diff
.embedding_config_updates
.iter()
@ -842,6 +843,7 @@ impl<'a, 'i> Transform<'a, 'i> {
action.write_back()
{
let reader = VectorStore::new(
index_version,
self.index.vector_store,
*embedder_id,
action.was_quantized,
@ -949,8 +951,12 @@ impl<'a, 'i> Transform<'a, 'i> {
else {
continue;
};
let hannoy =
VectorStore::new(self.index.vector_store, infos.embedder_id, was_quantized);
let hannoy = VectorStore::new(
index_version,
self.index.vector_store,
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = hannoy.dimensions(wtxn)? else {
continue;
};

View File

@ -619,6 +619,7 @@ pub(crate) fn write_typed_chunk_into_index(
let _entered = span.enter();
let embedders = index.embedding_configs();
let index_version = index.get_version(wtxn)?.unwrap();
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
@ -677,7 +678,12 @@ pub(crate) fn write_typed_chunk_into_index(
.get(&embedder_name)
.is_some_and(|conf| conf.is_quantized);
// FIXME: allow customizing distance
let writer = VectorStore::new(index.vector_store, infos.embedder_id, binary_quantized);
let writer = VectorStore::new(
index_version,
index.vector_store,
infos.embedder_id,
binary_quantized,
);
// remove vectors for docids we want them removed
let merger = remove_vectors_builder.build();

View File

@ -8,7 +8,7 @@ use document_changes::{DocumentChanges, IndexingContext};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;
use heed::RwTxn;
use heed::{RoTxn, RwTxn};
pub use partial_dump::PartialDump;
pub use post_processing::recompute_word_fst_from_word_docids_database;
pub use update_by_function::UpdateByFunction;
@ -130,6 +130,7 @@ where
let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
let vector_arroy = index.vector_store;
let index_version = index.get_version(wtxn)?.unwrap();
let hannoy_writers: Result<HashMap<_, _>> = embedders
.inner_as_ref()
.iter()
@ -143,7 +144,12 @@ where
})?;
let dimensions = runtime.embedder.dimensions();
let writer = VectorStore::new(vector_arroy, embedder_index, runtime.is_quantized);
let writer = VectorStore::new(
index_version,
vector_arroy,
embedder_index,
runtime.is_quantized,
);
Ok((
embedder_index,
@ -285,6 +291,7 @@ where
let index_embedder_category_ids = settings_delta.new_embedder_category_id();
let mut hannoy_writers = hannoy_writers_from_embedder_actions(
index,
wtxn,
embedder_actions,
new_embedders,
index_embedder_category_ids,
@ -338,11 +345,13 @@ where
fn hannoy_writers_from_embedder_actions<'indexer>(
index: &Index,
rtxn: &RoTxn,
embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
embedders: &'indexer RuntimeEmbedders,
index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, VectorStore, usize)>> {
let vector_arroy = index.vector_store;
let index_version = index.get_version(rtxn)?.unwrap();
embedders
.inner_as_ref()
@ -360,8 +369,12 @@ fn hannoy_writers_from_embedder_actions<'indexer>(
},
)));
};
let writer =
VectorStore::new(vector_arroy, embedder_category_id, action.was_quantized);
let writer = VectorStore::new(
index_version,
vector_arroy,
embedder_category_id,
action.was_quantized,
);
let dimensions = runtime.embedder.dimensions();
Some(Ok((
embedder_category_id,
@ -384,7 +397,12 @@ where
let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
continue;
};
let reader = VectorStore::new(index.vector_store, *embedder_id, action.was_quantized);
let reader = VectorStore::new(
index.get_version(wtxn)?.unwrap(),
index.vector_store,
*embedder_id,
action.was_quantized,
);
let Some(dimensions) = reader.dimensions(wtxn)? else {
continue;
};
@ -400,7 +418,12 @@ where
let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
continue;
};
let arroy = VectorStore::new(index.vector_store, infos.embedder_id, was_quantized);
let arroy = VectorStore::new(
index.get_version(wtxn)?.unwrap(),
index.vector_store,
infos.embedder_id,
was_quantized,
);
let Some(dimensions) = arroy.dimensions(wtxn)? else {
continue;
};

View File

@ -120,8 +120,13 @@ impl<'t> VectorDocumentFromDb<'t> {
config: &IndexEmbeddingConfig,
status: &EmbeddingStatus,
) -> Result<VectorEntry<'t>> {
let reader =
VectorStore::new(self.index.vector_store, embedder_id, config.config.quantized());
let index_version = self.index.get_version(self.rtxn)?.unwrap();
let reader = VectorStore::new(
index_version,
self.index.vector_store,
embedder_id,
config.config.quantized(),
);
let vectors = reader.item_vectors(self.rtxn, self.docid)?;
Ok(VectorEntry {

View File

@ -17,11 +17,13 @@ impl UpgradeIndex for Latest_V1_17_To_V1_18_0 {
progress: Progress,
) -> Result<bool> {
let embedding_configs = index.embedding_configs();
let index_version = index.get_version(wtxn)?.unwrap();
for config in embedding_configs.embedding_configs(wtxn)? {
// TODO use the embedder name to display progress
let quantized = config.config.quantized();
let embedder_id = embedding_configs.embedder_id(wtxn, &config.name)?.unwrap();
let vector_store = VectorStore::new(index.vector_store, embedder_id, quantized);
let vector_store =
VectorStore::new(index_version, index.vector_store, embedder_id, quantized);
vector_store.convert_from_arroy(wtxn, progress.clone())?;
}

View File

@ -47,18 +47,20 @@ const HANNOY_M: usize = 16;
const HANNOY_M0: usize = 32;
pub struct VectorStore {
quantized: bool,
embedder_index: u8,
version: (u32, u32, u32),
database: hannoy::Database<Unspecified>,
embedder_index: u8,
quantized: bool,
}
impl VectorStore {
pub fn new(
version: (u32, u32, u32),
database: hannoy::Database<Unspecified>,
embedder_index: u8,
quantized: bool,
) -> Self {
Self { database, embedder_index, quantized }
Self { version, database, embedder_index, quantized }
}
pub fn embedder_index(&self) -> u8 {