Add embedder stats in batches

This commit is contained in:
Mubelotix
2025-06-20 12:42:22 +02:00
parent fc6cc80705
commit 4cadc8113b
26 changed files with 188 additions and 73 deletions

View File

@ -17,6 +17,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::error::FaultSource;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
@ -682,6 +683,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
embedder: Arc<Embedder>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
@ -724,6 +726,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
embedder_name,
possible_embedding_mistakes,
embedder_stats.clone(),
unused_vectors_distribution,
request_threads,
)?;
@ -746,6 +749,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
std::mem::take(&mut chunks),
embedder_name,
possible_embedding_mistakes,
embedder_stats.clone(),
unused_vectors_distribution,
request_threads,
)?;
@ -764,6 +768,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
vec![std::mem::take(&mut current_chunk)],
embedder_name,
possible_embedding_mistakes,
embedder_stats,
unused_vectors_distribution,
request_threads,
)?;
@ -783,10 +788,11 @@ fn embed_chunks(
text_chunks: Vec<Vec<String>>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embedding>>> {
match embedder.embed_index(text_chunks, request_threads) {
match embedder.embed_index(text_chunks, request_threads, embedder_stats) {
Ok(chunks) => Ok(chunks),
Err(error) => {
if let FaultSource::Bug = error.fault {

View File

@ -31,6 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::error::PossibleEmbeddingMistakes;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
@ -49,6 +50,7 @@ pub(crate) fn data_from_obkv_documents(
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: Option<Arc<EmbedderStats>>,
) -> Result<()> {
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
@ -62,6 +64,7 @@ pub(crate) fn data_from_obkv_documents(
embedders_configs.clone(),
settings_diff.clone(),
possible_embedding_mistakes.clone(),
embedder_stats.clone(),
)
})
.collect::<Result<()>>()
@ -231,6 +234,7 @@ fn send_original_documents_data(
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: Option<Arc<EmbedderStats>>,
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@ -270,6 +274,7 @@ fn send_original_documents_data(
embedder.clone(),
&embedder_name,
&possible_embedding_mistakes,
embedder_stats.clone(),
&unused_vectors_distribution,
request_threads(),
) {

View File

@ -32,7 +32,7 @@ use crate::database_stats::DatabaseStats;
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError};
use crate::index::{PrefixSearch, PrefixSettings};
use crate::progress::Progress;
use crate::progress::{EmbedderStats, Progress};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@ -81,6 +81,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
added_documents: u64,
deleted_documents: u64,
embedders: EmbeddingConfigs,
embedder_stats: Option<Arc<EmbedderStats>>,
}
#[derive(Default, Debug, Clone)]
@ -103,6 +104,7 @@ where
config: IndexDocumentsConfig,
progress: FP,
should_abort: FA,
embedder_stats: Option<Arc<EmbedderStats>>,
) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
let transform = Some(Transform::new(
wtxn,
@ -123,6 +125,7 @@ where
added_documents: 0,
deleted_documents: 0,
embedders: Default::default(),
embedder_stats,
})
}
@ -292,6 +295,7 @@ where
// Run extraction pipeline in parallel.
let mut modified_docids = RoaringBitmap::new();
let embedder_stats = self.embedder_stats.clone();
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
@ -326,7 +330,8 @@ where
embedders_configs.clone(),
settings_diff_cloned,
max_positions_per_attributes,
Arc::new(possible_embedding_mistakes)
Arc::new(possible_embedding_mistakes),
embedder_stats.clone()
)
});