mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-06 12:46:31 +00:00
Implement in new document indexer
This commit is contained in:
@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal};
|
||||
use super::super::FacetFieldIdsDelta;
|
||||
use super::document_changes::{extract, DocumentChanges, IndexingContext};
|
||||
use super::settings_changes::settings_change_extract;
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::EmbedderStats;
|
||||
use crate::progress::MergingWordCache;
|
||||
use crate::documents::{FieldIdMapper, PrimaryKey};
|
||||
use crate::progress::{EmbedderStats, MergingWordCache};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::new::extract::EmbeddingExtractor;
|
||||
use crate::update::new::indexer::settings_changes::DocumentsIndentifiers;
|
||||
use crate::update::new::merger::merge_and_send_rtree;
|
||||
use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases};
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::Index;
|
||||
use crate::InternalError;
|
||||
use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
use crate::vector::db::IndexEmbeddingConfig;
|
||||
use crate::vector::RuntimeEmbedders;
|
||||
use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
indexing_context: IndexingContext<MSP>,
|
||||
indexer_span: Span,
|
||||
extractor_sender: ExtractorBbqueueSender,
|
||||
embedders: &EmbeddingConfigs,
|
||||
embedders: &RuntimeEmbedders,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_extraction: &AtomicBool,
|
||||
field_distribution: &mut BTreeMap<String, u64>,
|
||||
@ -275,14 +271,19 @@ where
|
||||
let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
let embedder_configs = index.embedding_configs();
|
||||
for config in &mut index_embeddings {
|
||||
let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap();
|
||||
|
||||
'data: for data in datastore.iter_mut() {
|
||||
let data = &mut data.get_mut().0;
|
||||
let Some(deladd) = data.remove(&config.name) else {
|
||||
let Some(delta) = data.remove(&config.name) else {
|
||||
continue 'data;
|
||||
};
|
||||
deladd.apply_to(&mut config.user_provided, modified_docids);
|
||||
delta.apply_to(&mut infos.embedding_status);
|
||||
}
|
||||
|
||||
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress};
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
|
||||
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs};
|
||||
use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};
|
||||
|
||||
pub(crate) mod de;
|
||||
@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
|
||||
new_fields_ids_map: FieldsIdsMap,
|
||||
new_primary_key: Option<PrimaryKey<'pl>>,
|
||||
document_changes: &DC,
|
||||
embedders: EmbeddingConfigs,
|
||||
embedders: RuntimeEmbedders,
|
||||
must_stop_processing: &'indexer MSP,
|
||||
progress: &'indexer Progress,
|
||||
embedder_stats: &'indexer EmbedderStats,
|
||||
@ -93,7 +93,7 @@ where
|
||||
grenad_parameters: &grenad_parameters,
|
||||
};
|
||||
|
||||
let index_embeddings = index.embedding_configs(wtxn)?;
|
||||
let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?;
|
||||
let mut field_distribution = index.field_distribution(wtxn)?;
|
||||
let mut document_ids = index.documents_ids(wtxn)?;
|
||||
let mut modified_docids = roaring::RoaringBitmap::new();
|
||||
@ -133,20 +133,21 @@ where
|
||||
let arroy_writers: Result<HashMap<_, _>> = embedders
|
||||
.inner_as_ref()
|
||||
.iter()
|
||||
.map(|(embedder_name, (embedder, _, was_quantized))| {
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry {
|
||||
.map(|(embedder_name, runtime)| {
|
||||
let embedder_index = index
|
||||
.embedding_configs()
|
||||
.embedder_id(wtxn, embedder_name)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
db_name: "embedder_category_id",
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
})?;
|
||||
|
||||
let dimensions = embedder.dimensions();
|
||||
let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized);
|
||||
let dimensions = runtime.embedder.dimensions();
|
||||
let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);
|
||||
|
||||
Ok((
|
||||
embedder_index,
|
||||
(embedder_name.as_str(), embedder.as_ref(), writer, dimensions),
|
||||
(embedder_name.as_str(), &*runtime.embedder, writer, dimensions),
|
||||
))
|
||||
})
|
||||
.collect();
|
||||
|
@ -11,11 +11,11 @@ use super::super::channel::*;
|
||||
use crate::database_stats::DatabaseStats;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::Progress;
|
||||
use crate::update::settings::InnerIndexSettings;
|
||||
use crate::vector::db::IndexEmbeddingConfig;
|
||||
use crate::vector::settings::EmbedderAction;
|
||||
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
|
||||
use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
|
||||
use crate::{Error, Index, InternalError, Result, UserError};
|
||||
|
||||
pub fn write_to_db(
|
||||
@ -64,6 +64,14 @@ pub fn write_to_db(
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
ReceiverAction::LargeVector(
|
||||
large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
|
||||
) => {
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let embedding = large_vector.read_embedding(*dimensions);
|
||||
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Every time the is a message in the channel we search
|
||||
@ -137,7 +145,7 @@ where
|
||||
)?;
|
||||
}
|
||||
|
||||
index.put_embedding_configs(wtxn, index_embeddings)?;
|
||||
index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -147,7 +155,7 @@ pub(super) fn update_index(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
new_fields_ids_map: FieldIdMapWithMetadata,
|
||||
new_primary_key: Option<PrimaryKey<'_>>,
|
||||
embedders: EmbeddingConfigs,
|
||||
embedders: RuntimeEmbedders,
|
||||
field_distribution: std::collections::BTreeMap<String, u64>,
|
||||
document_ids: roaring::RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
@ -226,14 +234,36 @@ pub fn write_from_bbqueue(
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let mut embeddings = Embeddings::new(*dimensions);
|
||||
let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||
if embeddings.append(all_embeddings.to_vec()).is_err() {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: all_embeddings.len(),
|
||||
}));
|
||||
}
|
||||
writer.del_items(wtxn, *dimensions, docid)?;
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
if !all_embeddings.is_empty() {
|
||||
if embeddings.append(all_embeddings.to_vec()).is_err() {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: all_embeddings.len(),
|
||||
}));
|
||||
}
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
}
|
||||
EntryHeader::ArroySetVector(
|
||||
asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
|
||||
) => {
|
||||
let frame = frame_with_header.frame();
|
||||
let (_, _, writer, dimensions) =
|
||||
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
|
||||
let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);
|
||||
|
||||
if embedding.is_empty() {
|
||||
writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?;
|
||||
} else {
|
||||
if embedding.len() != *dimensions {
|
||||
return Err(Error::UserError(UserError::InvalidVectorDimensions {
|
||||
expected: *dimensions,
|
||||
found: embedding.len(),
|
||||
}));
|
||||
}
|
||||
writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user