Implement in old settings indexer and old dump import indexer

This commit is contained in:
Louis Dureuil
2025-06-30 00:01:05 +02:00
parent f8232976ed
commit cab5e35ff7
6 changed files with 824 additions and 246 deletions

View File

@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_vector_points::{
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints,
};
use self::extract_word_docids::extract_word_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::progress::EmbedderStats;
use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::db::EmbedderInfo;
use crate::vector::error::PossibleEmbeddingMistakes;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: &Arc<EmbedderStats>,
) -> Result<()> {
@ -61,8 +62,8 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
embedders_configs.clone(),
settings_diff.clone(),
embedder_info.clone(),
possible_embedding_mistakes.clone(),
embedder_stats.clone(),
)
@ -231,8 +232,8 @@ fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
embedder_stats: Arc<EmbedderStats>,
) -> Result<()> {
@ -245,7 +246,6 @@ fn send_original_documents_data(
if index_vectors {
let settings_diff = settings_diff.clone();
let embedders_configs = embedders_configs.clone();
let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone();
@ -253,8 +253,8 @@ fn send_original_documents_data(
match extract_vector_points(
original_documents_chunk.clone(),
indexer,
&embedders_configs,
&settings_diff,
embedder_info.as_slice(),
&possible_embedding_mistakes,
) {
Ok((extracted_vectors, unused_vectors_distribution)) => {
@ -262,16 +262,16 @@ fn send_original_documents_data(
manual_vectors,
remove_vectors,
prompts,
inputs,
embedder_name,
embedder,
add_to_user_provided,
remove_from_user_provided,
runtime,
embedding_status_delta,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
let embeddings_from_prompts = match extract_embeddings_from_prompts(
prompts,
indexer,
embedder.clone(),
runtime.clone(),
&embedder_name,
&possible_embedding_mistakes,
&embedder_stats,
@ -284,18 +284,37 @@ fn send_original_documents_data(
None
}
};
let embeddings_from_fragments = match extract_embeddings_from_fragments(
inputs,
indexer,
runtime.clone(),
&embedder_name,
&possible_embedding_mistakes,
&embedder_stats,
&unused_vectors_distribution,
request_threads(),
) {
Ok(results) => Some(results),
Err(error) => {
let _ = lmdb_writer_sx.send(Err(error));
None
}
};
if !(remove_vectors.is_empty()
&& manual_vectors.is_empty()
&& embeddings.as_ref().is_none_or(|e| e.is_empty()))
&& embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty())
&& embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty()))
{
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension: embedder.dimensions(),
embeddings_from_prompts,
embeddings_from_fragments,
expected_dimension: runtime.embedder.dimensions(),
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
embedding_status_delta,
}));
}
}