mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-27 16:51:01 +00:00
Implement in old settings indexer and old dump import indexer
This commit is contained in:
@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::EmbedderStats;
|
||||
use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::db::EmbedderInfo;
|
||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
embedder_stats: &Arc<EmbedderStats>,
|
||||
) -> Result<()> {
|
||||
@ -61,8 +62,8 @@ pub(crate) fn data_from_obkv_documents(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
embedders_configs.clone(),
|
||||
settings_diff.clone(),
|
||||
embedder_info.clone(),
|
||||
possible_embedding_mistakes.clone(),
|
||||
embedder_stats.clone(),
|
||||
)
|
||||
@ -231,8 +232,8 @@ fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
embedder_stats: Arc<EmbedderStats>,
|
||||
) -> Result<()> {
|
||||
@ -245,7 +246,6 @@ fn send_original_documents_data(
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
let embedders_configs = embedders_configs.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
@ -253,8 +253,8 @@ fn send_original_documents_data(
|
||||
match extract_vector_points(
|
||||
original_documents_chunk.clone(),
|
||||
indexer,
|
||||
&embedders_configs,
|
||||
&settings_diff,
|
||||
embedder_info.as_slice(),
|
||||
&possible_embedding_mistakes,
|
||||
) {
|
||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||
@ -262,16 +262,16 @@ fn send_original_documents_data(
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
inputs,
|
||||
embedder_name,
|
||||
embedder,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
runtime,
|
||||
embedding_status_delta,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
let embeddings_from_prompts = match extract_embeddings_from_prompts(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
runtime.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&embedder_stats,
|
||||
@ -284,18 +284,37 @@ fn send_original_documents_data(
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let embeddings_from_fragments = match extract_embeddings_from_fragments(
|
||||
inputs,
|
||||
indexer,
|
||||
runtime.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&embedder_stats,
|
||||
&unused_vectors_distribution,
|
||||
request_threads(),
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().is_none_or(|e| e.is_empty()))
|
||||
&& embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty())
|
||||
&& embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
embeddings_from_prompts,
|
||||
embeddings_from_fragments,
|
||||
expected_dimension: runtime.embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
embedding_status_delta,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user