Add embedder stats in batches

This commit is contained in:
Mubelotix
2025-06-20 12:42:22 +02:00
parent fc6cc80705
commit 4cadc8113b
26 changed files with 188 additions and 73 deletions

View File

@@ -17,6 +17,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::error::FaultSource;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
@@ -682,6 +683,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
embedder: Arc<Embedder>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
@@ -724,6 +726,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
embedder_name,
possible_embedding_mistakes,
embedder_stats.clone(),
unused_vectors_distribution,
request_threads,
)?;
@@ -746,6 +749,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
std::mem::take(&mut chunks),
embedder_name,
possible_embedding_mistakes,
embedder_stats.clone(),
unused_vectors_distribution,
request_threads,
)?;
@@ -764,6 +768,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
vec![std::mem::take(&mut current_chunk)],
embedder_name,
possible_embedding_mistakes,
embedder_stats,
unused_vectors_distribution,
request_threads,
)?;
@@ -783,10 +788,11 @@ fn embed_chunks(
text_chunks: Vec<Vec<String>>,
embedder_name: &str,
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
embedder_stats: Option<Arc<EmbedderStats>>,
unused_vectors_distribution: &UnusedVectorsDistribution,
request_threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embedding>>> {
- match embedder.embed_index(text_chunks, request_threads) {
+ match embedder.embed_index(text_chunks, request_threads, embedder_stats) {
Ok(chunks) => Ok(chunks),
Err(error) => {
if let FaultSource::Bug = error.fault {

View File

@@ -31,6 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::error::PossibleEmbeddingMistakes;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
@@ -49,6 +50,7 @@ pub(crate) fn data_from_obkv_documents(
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: Option<Arc<EmbedderStats>>,
) -> Result<()> {
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
@@ -62,6 +64,7 @@ pub(crate) fn data_from_obkv_documents(
embedders_configs.clone(),
settings_diff.clone(),
possible_embedding_mistakes.clone(),
embedder_stats.clone(),
)
})
.collect::<Result<()>>()
@@ -231,6 +234,7 @@ fn send_original_documents_data(
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: Option<Arc<EmbedderStats>>,
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -270,6 +274,7 @@ fn send_original_documents_data(
embedder.clone(),
&embedder_name,
&possible_embedding_mistakes,
embedder_stats.clone(),
&unused_vectors_distribution,
request_threads(),
) {