Implement in new document indexer

This commit is contained in:
Louis Dureuil
2025-06-30 00:00:22 +02:00
parent 22d363c05a
commit f8232976ed
10 changed files with 886 additions and 391 deletions

View File

@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions};
use super::indexer::de::DeserrRawValue;
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::documents::FieldIdMapper;
use crate::index::IndexEmbeddingConfig;
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs};
use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
use crate::{DocumentId, Index, InternalError, Result, UserError};
#[derive(Serialize)]
@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> {
None => None,
};
let embedding_config = index.embedding_configs(rtxn)?;
let embedding_config = index.embedding_configs().embedding_configs(rtxn)?;
Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }))
}
@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> {
&self,
embedder_id: u8,
config: &IndexEmbeddingConfig,
status: &EmbeddingStatus,
) -> Result<VectorEntry<'t>> {
let reader =
ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> {
Ok(VectorEntry {
has_configured_embedder: true,
embeddings: Some(Embeddings::FromDb(vectors)),
regenerate: !config.user_provided.contains(self.docid),
regenerate: status.must_regenerate(self.docid),
implicit: false,
})
}
@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
self.embedding_config
.iter()
.map(|config| {
let embedder_id =
self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap();
let entry = self.entry_from_db(embedder_id, config)?;
let info =
self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap();
let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?;
let config_name = self.doc_alloc.alloc_str(config.name.as_str());
Ok((&*config_name, entry))
})
@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
}
fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'t>>> {
Ok(match self.index.embedder_category_id.get(self.rtxn, key)? {
Some(embedder_id) => {
Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? {
Some(info) => {
let config =
self.embedding_config.iter().find(|config| config.name == key).unwrap();
Some(self.entry_from_db(embedder_id, config)?)
Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?)
}
None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) {
Some(embedding_from_doc) => {
@ -222,7 +223,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str,
vectors: RawMap<'doc, FxBuildHasher>,
embedders: &'doc EmbeddingConfigs,
embedders: &'doc RuntimeEmbedders,
}
impl<'doc> VectorDocumentFromVersions<'doc> {
@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str,
versions: &Versions<'doc>,
bump: &'doc Bump,
embedders: &'doc EmbeddingConfigs,
embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> {
let document = DocumentFromVersions::new(versions);
if let Some(vectors_field) = document.vectors_field()? {
@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> {
db_fields_ids_map: &'doc Mapper,
versions: &Versions<'doc>,
doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs,
embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> {
let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?;
let new_doc =
@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> {
external_document_id: &'doc str,
versions: &Versions<'doc>,
doc_alloc: &'doc Bump,
embedders: &'doc EmbeddingConfigs,
embedders: &'doc RuntimeEmbedders,
) -> Result<Option<Self>> {
let Some(new_doc) =
VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?