Extract or regenerate vectors related to settings changes

This commit is contained in:
ManyTheFish
2025-06-25 14:46:45 +02:00
parent 51a087b764
commit 900be0ccad
5 changed files with 256 additions and 3 deletions

View File

@ -12,6 +12,7 @@ use super::super::steps::IndexingStep;
use super::super::thread_local::{FullySend, ThreadLocal};
use super::super::FacetFieldIdsDelta;
use super::document_changes::{extract, DocumentChanges, IndexingContext};
use super::settings_changes::settings_change_extract;
use crate::documents::FieldIdMapper;
use crate::documents::PrimaryKey;
use crate::index::IndexEmbeddingConfig;
@ -356,6 +357,53 @@ where
extractor_allocs,
)?;
// Vector extraction for a settings change: run the embedding extractor over
// `documents`, then fold the per-thread results into `index_embeddings`.
// The whole stage is a labeled block so it can be skipped with a single `break`.
'vectors: {
// No embedder action recorded in the settings delta: skip the stage entirely.
if settings_delta.embedder_actions().is_empty() {
break 'vectors;
}
let embedding_sender = extractor_sender.embeddings();
// extract the remaining embedders
// The extractor is built from both the new and the old embedder configurations
// plus the per-embedder actions taken from the settings delta.
let extractor = SettingsChangeEmbeddingExtractor::new(
settings_delta.new_embedders(),
settings_delta.old_embedders(),
settings_delta.embedder_actions(),
settings_delta.new_embedder_category_id(),
embedding_sender,
field_distribution,
request_threads(),
);
// Thread-local storage handed to the extraction call, with capacity for one
// entry per thread of the current rayon pool.
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
{
// Extraction phase, traced under the "vectors" debug span.
let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
let _entered = span.enter();
settings_change_extract(
&documents,
&extractor,
indexing_context,
extractor_allocs,
&datastore,
IndexingStep::ExtractingEmbeddings,
)?;
}
{
// Merge phase: for every embedder config, visit each thread-local entry and
// apply the del/add data stored under that embedder's name, if any.
let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
let _entered = span.enter();
for config in &mut index_embeddings {
'data: for data in datastore.iter_mut() {
let data = &mut data.get_mut().0;
// `remove` takes the entry out of the thread-local map; entries without
// data for this embedder are skipped.
let Some(deladd) = data.remove(&config.name) else {
continue 'data;
};
// NOTE(review): presumably updates the set of documents with user-provided
// vectors for this embedder — confirm against `apply_to`.
deladd.apply_to(&mut config.user_provided, modified_docids);
}
}
}
}
indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites);
finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);

View File

@ -170,6 +170,7 @@ where
index_embeddings,
arroy_memory,
&mut arroy_writers,
None,
&indexing_context.must_stop_processing,
)
})

View File

@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::sync::atomic::AtomicBool;
use bstr::ByteSlice as _;
@ -13,6 +14,7 @@ use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress;
use crate::update::settings::InnerIndexSettings;
use crate::vector::settings::EmbedderAction;
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
use crate::{Error, Index, InternalError, Result, UserError};
@ -106,6 +108,7 @@ pub fn build_vectors<MSP>(
index_embeddings: Vec<IndexEmbeddingConfig>,
arroy_memory: Option<usize>,
arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
must_stop_processing: &MSP,
) -> Result<()>
where
@ -117,14 +120,17 @@ where
let seed = rand::random();
let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
for (_index, (_embedder_name, _embedder, writer, dimensions)) in arroy_writers {
for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers {
let dimensions = *dimensions;
let is_being_quantized = embeder_actions
.and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
.unwrap_or(false);
writer.build_and_quantize(
wtxn,
progress,
&mut rng,
dimensions,
false,
is_being_quantized,
arroy_memory,
must_stop_processing,
)?;