Remove useless Transform methods

Clément Renault
2024-11-18 11:59:18 +01:00
parent 41dbdd2d18
commit 670aff5553
2 changed files with 11 additions and 1170 deletions


@@ -5,27 +5,16 @@ mod parallel;
mod transform;
mod typed_chunk;
use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek};
use std::iter;
use std::collections::HashSet;
use std::num::NonZeroU32;
use std::result::Result as StdResult;
use std::sync::Arc;
use crossbeam_channel::{Receiver, Sender};
use grenad::{Merger, MergerBuilder};
use grenad::Merger;
use heed::types::Str;
use heed::Database;
use rand::SeedableRng;
use rayon::iter::{ParallelBridge, ParallelIterator};
use rhai::{Dynamic, Engine, OptimizationLevel, Scope};
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use slice_group_by::GroupBy;
use tracing::debug;
use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
use typed_chunk::TypedChunk;
use self::enrich::enrich_documents_batch;
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
pub use self::helpers::*;
pub use self::transform::{Transform, TransformOutput};
@@ -128,602 +117,11 @@ where
})
}
/// Adds a batch of documents to the current builder.
///
/// Since the documents are progressively added to the writer, a failure will only
/// return an error and not the `IndexDocuments` struct, as it is invalid to use it afterward.
///
/// Returns the number of documents added to the builder.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
pub fn add_documents<R: Read + Seek>(
mut self,
reader: DocumentsBatchReader<R>,
) -> Result<(Self, StdResult<u64, UserError>)> {
// Early return when there is no document to add
if reader.is_empty() {
return Ok((self, Ok(0)));
}
// We check for user errors in this validator and if there is one, we can return
// the `IndexDocuments` struct as it is valid to send more documents into it.
// However, if there is an internal error we throw it away!
let enriched_documents_reader = match enrich_documents_batch(
self.wtxn,
self.index,
self.config.autogenerate_docids,
reader,
)? {
Ok(reader) => reader,
Err(user_error) => return Ok((self, Err(user_error))),
};
let indexed_documents =
self.transform.as_mut().expect("Invalid document addition state").read_documents(
enriched_documents_reader,
self.wtxn,
&self.progress,
&self.should_abort,
)? as u64;
self.added_documents += indexed_documents;
Ok((self, Ok(indexed_documents)))
}
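// A hedged illustration, not part of this commit: how a caller consumes the nested
// result above. The outer `Result` carries internal errors and consumes the builder,
// while the inner `StdResult` reports user errors and hands the builder back so
// further batches can still be added to it.
//
//     let (builder, user_result) = builder.add_documents(reader)?;
//     if let Err(user_error) = user_result {
//         // the builder is still valid here; surface the error to the caller
//     }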
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
pub fn edit_documents(
self,
documents: &RoaringBitmap,
context: Option<Object>,
code: &str,
) -> Result<(Self, StdResult<(u64, u64), UserError>)> {
// Early return when there is no document to edit
if documents.is_empty() {
return Ok((self, Ok((0, 0))));
}
fn rhaimap_to_object(map: rhai::Map) -> Object {
let mut output = Object::new();
for (key, value) in map {
let value = serde_json::to_value(&value).unwrap();
output.insert(key.into(), value);
}
output
}
// Setup the security and limits of the Engine
let mut engine = Engine::new();
engine.set_optimization_level(OptimizationLevel::Full);
engine.set_max_call_levels(1000);
// It is an arbitrary value. We need to let users define this in the settings.
engine.set_max_operations(1_000_000);
engine.set_max_variables(1000);
engine.set_max_functions(30);
engine.set_max_expr_depths(100, 1000);
engine.set_max_string_size(1024 * 1024 * 1024); // 1 GiB
engine.set_max_array_size(10_000);
engine.set_max_map_size(10_000);
let ast = engine.compile(code).map_err(UserError::DocumentEditionCompilationError)?;
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
let mut documents_batch_builder = tempfile::tempfile().map(DocumentsBatchBuilder::new)?;
let mut documents_to_remove = RoaringBitmap::new();
let context: Option<Dynamic> = match context {
Some(context) => {
Some(serde_json::from_value(context.into()).map_err(InternalError::SerdeJson)?)
}
None => None,
};
enum DocumentEdition {
Deleted(crate::DocumentId),
Edited(Object),
Nothing,
}
let immutable_obkvs = ImmutableObkvs::new(
self.wtxn,
self.index.documents,
fields_ids_map.clone(),
documents.clone(),
)?;
let processing = documents.into_iter().par_bridge().map(|docid| {
// safety: Both documents *must* exist in the database as
// their IDs come from the list of document ids.
let rhai_document = immutable_obkvs.rhai_map(docid)?.unwrap();
let json_document = immutable_obkvs.json_map(docid)?.unwrap();
let document_id = &json_document[primary_key];
let mut scope = Scope::new();
if let Some(context) = context.as_ref().cloned() {
scope.push_constant_dynamic("context", context.clone());
}
scope.push("doc", rhai_document);
// That's where the magic happens. We run the user script,
// which edits the "doc" scope variable representing the document,
// and ignore the output and even its type, i.e., Dynamic.
let _ = engine
.eval_ast_with_scope::<Dynamic>(&mut scope, &ast)
.map_err(UserError::DocumentEditionRuntimeError)?;
match scope.remove::<Dynamic>("doc") {
// If the "doc" variable has set to (), we effectively delete the document.
Some(doc) if doc.is_unit() => Ok(DocumentEdition::Deleted(docid)),
None => unreachable!("missing doc variable from the Rhai scope"),
Some(document) => match document.try_cast() {
Some(document) => {
let new_document = rhaimap_to_object(document);
// Note: This condition is not perfect. Sometimes it detects changes,
// e.g., with floating-point numbers, and considers updating
// the document even if nothing actually changed.
if json_document != new_document {
if Some(document_id) != new_document.get(primary_key) {
Err(Error::UserError(
UserError::DocumentEditionCannotModifyPrimaryKey,
))
} else {
Ok(DocumentEdition::Edited(new_document))
}
} else {
Ok(DocumentEdition::Nothing)
}
}
None => Err(Error::UserError(UserError::DocumentEditionDocumentMustBeObject)),
},
}
});
rayon_par_bridge::par_bridge(100, processing, |iterator| {
for result in iterator {
if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match result? {
DocumentEdition::Deleted(docid) => {
documents_to_remove.insert(docid);
}
DocumentEdition::Edited(new_document) => {
documents_batch_builder.append_json_object(&new_document)?;
}
DocumentEdition::Nothing => (),
}
}
Ok(())
})?;
let file = documents_batch_builder.into_inner()?;
let reader = DocumentsBatchReader::from_reader(file)?;
let (this, removed) = self.remove_documents_from_db_no_batch(&documents_to_remove)?;
let (this, result) = this.add_documents(reader)?;
Ok((this, result.map(|added| (removed, added))))
}
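// A hedged illustration, not part of this commit: the kind of Rhai script the engine
// above evaluates. The script mutates the `doc` scope variable pushed before the call,
// and setting `doc` to the unit value `()` marks the document for deletion
// (the field names below are hypothetical).
//
//     doc.price = doc.price * 1.1;
//     doc.edited = true;
//     // or delete the document entirely:
//     // doc = ();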
pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
self.embedders = embedders;
self
}
/// Remove a batch of documents from the current builder.
///
/// Returns the number of documents deleted from the builder.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
pub fn remove_documents(
mut self,
to_delete: Vec<String>,
) -> Result<(Self, StdResult<u64, UserError>)> {
// Early return when there is no document to delete
if to_delete.is_empty() {
// Maintains Invariant: remove documents actually always returns Ok for the inner result
return Ok((self, Ok(0)));
}
let deleted_documents = self
.transform
.as_mut()
.expect("Invalid document deletion state")
.remove_documents(to_delete, self.wtxn, &self.should_abort)?
as u64;
self.deleted_documents += deleted_documents;
// Maintains Invariant: remove documents actually always returns Ok for the inner result
Ok((self, Ok(deleted_documents)))
}
/// Removes documents from db using their internal document ids.
///
/// # Warning
///
/// This function is dangerous and will only work correctly if:
///
/// - All the passed ids currently exist in the database
/// - No batching using the standard `remove_documents` and `add_documents` took place
///
/// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::details")]
pub fn remove_documents_from_db_no_batch(
mut self,
to_delete: &RoaringBitmap,
) -> Result<(Self, u64)> {
// Early return when there is no document to delete
if to_delete.is_empty() {
return Ok((self, 0));
}
let deleted_documents = self
.transform
.as_mut()
.expect("Invalid document deletion state")
.remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
as u64;
self.deleted_documents += deleted_documents;
Ok((self, deleted_documents))
}
#[tracing::instrument(
level = "trace"
skip_all,
target = "indexing::documents",
name = "index_documents"
)]
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
if self.added_documents == 0 && self.deleted_documents == 0 {
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
}
let output = self
.transform
.take()
.expect("Invalid document addition state")
.output_from_sorter(self.wtxn, &self.progress)?;
let indexed_documents = output.documents_count as u64;
let number_of_documents = self.execute_raw(output)?;
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
}
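// A hedged sketch, not part of this commit, of the call sequence this builder expects:
// queue one or more batches, then call `execute` exactly once.
//
//     let (builder, _user_result) = builder.add_documents(reader)?;
//     let DocumentAdditionResult { indexed_documents, number_of_documents } =
//         builder.execute()?;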
/// Returns the total number of documents in the index after the update.
#[tracing::instrument(
level = "trace",
skip_all,
target = "indexing::details",
name = "index_documents_raw"
)]
pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
where
FP: Fn(UpdateIndexingStep) + Sync,
FA: Fn() -> bool + Sync,
{
let TransformOutput {
primary_key,
mut settings_diff,
field_distribution,
documents_count,
original_documents,
flattened_documents,
} = output;
// update the internal facet and searchable list,
// because they might have changed due to the nested documents flattening.
settings_diff.new.recompute_facets(self.wtxn, self.index)?;
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff);
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
let possible_embedding_mistakes =
crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
let backup_pool;
let pool = match self.indexer_config.thread_pool {
Some(ref pool) => pool,
None => {
// We initialize a backup pool with the default
// settings if none have already been set.
#[allow(unused_mut)]
let mut pool_builder = ThreadPoolNoAbortBuilder::new();
#[cfg(test)]
{
pool_builder = pool_builder.num_threads(1);
}
backup_pool = pool_builder.build()?;
&backup_pool
}
};
// create LMDB writer channel
let (lmdb_writer_sx, lmdb_writer_rx): (
Sender<Result<TypedChunk>>,
Receiver<Result<TypedChunk>>,
) = crossbeam_channel::unbounded();
// get the primary key field id
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level,
max_memory: self.indexer_config.max_memory,
max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
};
let documents_chunk_size = match self.indexer_config.documents_chunk_size {
Some(chunk_size) => chunk_size,
None => {
let default_chunk_size = 1024 * 1024 * 4; // 4MiB
let min_chunk_size = 1024 * 512; // 512KiB
// compute the chunk size from the number of available threads and the input data size.
let total_size = match flattened_documents.as_ref() {
Some(flattened_documents) => flattened_documents.metadata().map(|m| m.len()),
None => Ok(default_chunk_size as u64),
};
let current_num_threads = pool.current_num_threads();
// if we have more than 2 threads, create a number of chunks equal to 3/4 of the thread count
let chunk_count = if current_num_threads > 2 {
(current_num_threads * 3 / 4).max(2)
} else {
current_num_threads
};
total_size
.map_or(default_chunk_size, |size| (size as usize) / chunk_count)
.max(min_chunk_size)
}
};
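// Worked example of the sizing above (illustrative numbers, not from this commit):
// with 8 threads, chunk_count = (8 * 3 / 4).max(2) = 6, so a 600 MiB flattened file
// yields 100 MiB chunks; any result below the 512 KiB floor is clamped up to it.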
let original_documents = match original_documents {
Some(original_documents) => Some(grenad::Reader::new(original_documents)?),
None => None,
};
let flattened_documents = match flattened_documents {
Some(flattened_documents) => Some(grenad::Reader::new(flattened_documents)?),
None => None,
};
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0;
let mut word_position_docids = None;
let mut word_fid_docids = None;
let mut word_docids = None;
let mut exact_word_docids = None;
let mut chunk_accumulator = ChunkAccumulator::default();
let mut dimension = HashMap::new();
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks");
let _enter = child_span.enter();
// split obkv file into several chunks
let original_chunk_iter = match original_documents {
Some(original_documents) => {
grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size).map(either::Left)
},
None => Ok(either::Right(iter::empty())),
};
// split obkv file into several chunks
let flattened_chunk_iter = match flattened_documents {
Some(flattened_documents) => {
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size).map(either::Left)
},
None => Ok(either::Right(iter::empty())),
};
let result = original_chunk_iter.and_then(|original_chunk| {
let flattened_chunk = flattened_chunk_iter?;
// extract all databases from the chunked obkv documents
extract::data_from_obkv_documents(
original_chunk,
flattened_chunk,
pool_params,
lmdb_writer_sx.clone(),
primary_key_id,
embedders_configs.clone(),
settings_diff_cloned,
max_positions_per_attributes,
Arc::new(possible_embedding_mistakes)
)
});
if let Err(e) = result {
let _ = lmdb_writer_sx.send(Err(e));
}
// the sender needs to be dropped so the receiver does not wait on the channel forever.
drop(lmdb_writer_sx);
});
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
loop {
if (self.should_abort)() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match lmdb_writer_rx.clone().recv_timeout(std::time::Duration::from_millis(500)) {
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
(self.progress)(UpdateIndexingStep::IndexDocuments {
documents_seen: documents_seen_count as usize,
total_documents: documents_count,
});
debug!(documents = documents_seen_count, total = documents_count, "Seen");
}
if is_merged_database {
databases_seen += 1;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
}
// If no more chunks remain in the chunk accumulator and the channel is disconnected, break.
} else if status == crossbeam_channel::RecvTimeoutError::Disconnected {
break;
} else {
rayon::yield_now();
}
}
Ok(result) => {
let typed_chunk = match result? {
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
} => {
let cloneable_chunk =
unsafe { as_cloneable_grenad(&word_docids_reader)? };
let word_docids = word_docids.get_or_insert_with(|| {
MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
});
word_docids.push(cloneable_chunk.into_cursor()?);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
let exact_word_docids =
exact_word_docids.get_or_insert_with(|| {
MergerBuilder::new(
MergeDeladdCboRoaringBitmaps,
)
});
exact_word_docids.push(cloneable_chunk.into_cursor()?);
let cloneable_chunk =
unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
});
word_fid_docids.push(cloneable_chunk.into_cursor()?);
TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
word_fid_docids_reader,
}
}
TypedChunk::WordPositionDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
let word_position_docids =
word_position_docids.get_or_insert_with(|| {
MergerBuilder::new(
MergeDeladdCboRoaringBitmaps,
)
});
word_position_docids.push(cloneable_chunk.into_cursor()?);
TypedChunk::WordPositionDocids(chunk)
}
TypedChunk::VectorPoints {
expected_dimension,
remove_vectors,
embeddings,
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension,
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
}
}
otherwise => otherwise,
};
chunk_accumulator.insert(typed_chunk);
}
}
}
Ok(())
}).map_err(InternalError::from)??;
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
// We write the primary key into the main database
self.index.put_primary_key(self.wtxn, &primary_key)?;
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
// If an embedder wasn't used in the typed chunk but must be binary quantized,
// we should insert it in `dimension`.
for (name, action) in settings_diff.embedding_config_updates.iter() {
if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: "embedder_category_id",
key: None,
},
)?;
let reader =
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
let dim = reader.dimensions(self.wtxn)?;
dimension.insert(name.to_string(), dim);
}
}
for (embedder_name, dimension) in dimension {
let wtxn = &mut *self.wtxn;
let vector_arroy = self.index.vector_arroy;
let cancel = &self.should_abort;
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;
let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
let was_quantized = settings_diff
.old
.embedding_configs
.get(&embedder_name)
.map_or(false, |conf| conf.2);
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
pool.install(|| {
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?;
Result::Ok(())
})
.map_err(InternalError::from)??;
}
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
)?;
Ok(number_of_documents)
}
#[tracing::instrument(
level = "trace",
skip_all,