Implement incremental computation of the documents database stats

This commit is contained in:
ManyTheFish
2025-02-17 16:36:33 +01:00
committed by Kerollmops
parent d9642ec916
commit 9f3663e768
9 changed files with 116 additions and 53 deletions

View File

@ -307,6 +307,7 @@ where
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
let mut modified_docids = RoaringBitmap::new();
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
@ -367,7 +368,7 @@ where
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
@ -467,6 +468,10 @@ where
Ok(())
}).map_err(InternalError::from)??;
if !settings_diff.settings_update_only {
// Update the stats of the documents database when there is a document update.
self.index.update_documents_stats(self.wtxn, modified_docids)?;
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;

View File

@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index(
index: &Index,
settings_diff: &InnerIndexSettingsDiff,
typed_chunks: Vec<TypedChunk>,
modified_docids: &mut RoaringBitmap,
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunks[0] {
@ -214,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Create,
});
docids.insert(docid);
modified_docids.insert(docid);
} else {
db.delete(wtxn, &docid)?;
operations.push(DocumentOperation {
@ -222,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Delete,
});
docids.remove(docid);
modified_docids.insert(docid);
}
}
let external_documents_docids = index.external_documents_ids();