5325: Documents database stats r=irevoire a=ManyTheFish

# Pull Request

## Related issue
Fixes #5319

## List

- Create a DatabaseStats struct
- Compute and store the documents database stats in the IndexStats
- Force dumpless upgrade to update the index stats
- when a document addition/modification/deletion is made, we only recompute the database stats on the added/modified/deleted documents

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2025-02-26 10:03:45 +00:00
committed by GitHub
20 changed files with 295 additions and 16 deletions

View File

@ -0,0 +1,96 @@
use heed::types::Bytes;
use heed::Database;
use heed::RoTxn;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
/// The stats of a database.
pub struct DatabaseStats {
/// The number of entries in the database.
number_of_entries: u64,
/// The total size of the keys in the database.
total_key_size: u64,
/// The total size of the values in the database.
total_value_size: u64,
}
impl DatabaseStats {
/// Returns the stats of the database.
///
/// This function iterates over the whole database and computes the stats.
/// It is not efficient and should be cached somewhere.
pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> {
let mut database_stats =
Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 };
let mut iter = database.iter(rtxn)?;
while let Some((key, value)) = iter.next().transpose()? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
database_stats.total_key_size += key_size;
database_stats.total_value_size += value_size;
}
database_stats.number_of_entries = database.len(rtxn)?;
Ok(database_stats)
}
/// Recomputes the stats of the database and returns the new stats.
///
/// This function is used to update the stats of the database when some keys are modified.
/// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states.
pub(crate) fn recompute<I, K>(
mut stats: Self,
database: Database<Bytes, Bytes>,
before_rtxn: &RoTxn<'_>,
after_rtxn: &RoTxn<'_>,
modified_keys: I,
) -> heed::Result<Self>
where
I: IntoIterator<Item = K>,
K: AsRef<[u8]>,
{
for key in modified_keys {
let key = key.as_ref();
if let Some(value) = database.get(after_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_add(key_size);
stats.total_value_size = stats.total_value_size.saturating_add(value_size);
}
if let Some(value) = database.get(before_rtxn, key)? {
let key_size = key.len() as u64;
let value_size = value.len() as u64;
stats.total_key_size = stats.total_key_size.saturating_sub(key_size);
stats.total_value_size = stats.total_value_size.saturating_sub(value_size);
}
}
stats.number_of_entries = database.len(after_rtxn)?;
Ok(stats)
}
pub fn average_key_size(&self) -> u64 {
self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
}
pub fn average_value_size(&self) -> u64 {
self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
}
pub fn number_of_entries(&self) -> u64 {
self.number_of_entries
}
pub fn total_key_size(&self) -> u64 {
self.total_key_size
}
pub fn total_value_size(&self) -> u64 {
self.total_value_size
}
}

View File

@ -11,6 +11,7 @@ use rstar::RTree;
use serde::{Deserialize, Serialize};
use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME};
use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
use crate::fields_ids_map::FieldsIdsMap;
@ -74,6 +75,7 @@ pub mod main_key {
pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules";
pub const FACET_SEARCH: &str = "facet_search";
pub const PREFIX_SEARCH: &str = "prefix_search";
pub const DOCUMENTS_STATS: &str = "documents_stats";
}
pub mod db_name {
@ -403,6 +405,58 @@ impl Index {
Ok(count.unwrap_or_default())
}
/// Updates the stats of the documents database based on the previous stats and the modified docids.
pub fn update_documents_stats(
&self,
wtxn: &mut RwTxn<'_>,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
let before_rtxn = self.read_txn()?;
let document_stats = match self.documents_stats(&before_rtxn)? {
Some(before_stats) => DatabaseStats::recompute(
before_stats,
self.documents.remap_types(),
&before_rtxn,
wtxn,
modified_docids.iter().map(|docid| docid.to_be_bytes()),
)?,
None => {
// This should never happen when there are already documents in the index, the documents stats should be present.
// If it happens, it means that the index was not properly initialized/upgraded.
debug_assert_eq!(
self.documents.len(&before_rtxn)?,
0,
"The documents stats should be present when there are documents in the index"
);
tracing::warn!("No documents stats found, creating new ones");
DatabaseStats::new(self.documents.remap_types(), &*wtxn)?
}
};
self.put_documents_stats(wtxn, document_stats)?;
Ok(())
}
/// Writes the stats of the documents database.
pub fn put_documents_stats(
&self,
wtxn: &mut RwTxn<'_>,
stats: DatabaseStats,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<DatabaseStats>>().put(
wtxn,
main_key::DOCUMENTS_STATS,
&stats,
)
}
/// Returns the stats of the documents database.
pub fn documents_stats(&self, rtxn: &RoTxn<'_>) -> heed::Result<Option<DatabaseStats>> {
self.main
.remap_types::<Str, SerdeJson<DatabaseStats>>()
.get(rtxn, main_key::DOCUMENTS_STATS)
}
/* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id.

View File

@ -10,6 +10,7 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod database_stats;
mod error;
mod external_documents_ids;
pub mod facet;

View File

@ -307,6 +307,7 @@ where
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
let mut modified_docids = RoaringBitmap::new();
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
@ -367,7 +368,7 @@ where
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
@ -467,6 +468,10 @@ where
Ok(())
}).map_err(InternalError::from)??;
if !settings_diff.settings_update_only {
// Update the stats of the documents database when there is a document update.
self.index.update_documents_stats(self.wtxn, modified_docids)?;
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;

View File

@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index(
index: &Index,
settings_diff: &InnerIndexSettingsDiff,
typed_chunks: Vec<TypedChunk>,
modified_docids: &mut RoaringBitmap,
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunks[0] {
@ -214,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Create,
});
docids.insert(docid);
modified_docids.insert(docid);
} else {
db.delete(wtxn, &docid)?;
operations.push(DocumentOperation {
@ -222,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Delete,
});
docids.remove(docid);
modified_docids.insert(docid);
}
}
let external_documents_docids = index.external_documents_ids();

View File

@ -711,15 +711,17 @@ impl DelAddRoaringBitmap {
DelAddRoaringBitmap { del, add }
}
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) {
let DelAddRoaringBitmap { del, add } = self;
if let Some(del) = del {
*documents_ids -= del;
*modified_docids |= del;
}
if let Some(add) = add {
*documents_ids |= add;
*modified_docids |= add;
}
}
}

View File

@ -32,6 +32,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
field_distribution: &mut BTreeMap<String, u64>,
mut index_embeddings: Vec<IndexEmbeddingConfig>,
document_ids: &mut RoaringBitmap,
modified_docids: &mut RoaringBitmap,
) -> Result<(FacetFieldIdsDelta, Vec<IndexEmbeddingConfig>)>
where
DC: DocumentChanges<'pl>,
@ -70,7 +71,7 @@ where
// adding the delta should never cause a negative result, as we are removing fields that previously existed.
*current = current.saturating_add_signed(delta);
}
document_extractor_data.docids_delta.apply_to(document_ids);
document_extractor_data.docids_delta.apply_to(document_ids, modified_docids);
}
field_distribution.retain(|_, v| *v != 0);
@ -256,7 +257,7 @@ where
let Some(deladd) = data.remove(&config.name) else {
continue 'data;
};
deladd.apply_to(&mut config.user_provided);
deladd.apply_to(&mut config.user_provided, modified_docids);
}
}
}

View File

@ -129,6 +129,7 @@ where
let index_embeddings = index.embedding_configs(wtxn)?;
let mut field_distribution = index.field_distribution(wtxn)?;
let mut document_ids = index.documents_ids(wtxn)?;
let mut modified_docids = roaring::RoaringBitmap::new();
thread::scope(|s| -> Result<()> {
let indexer_span = tracing::Span::current();
@ -137,6 +138,7 @@ where
// prevent moving the field_distribution and document_ids in the inner closure...
let field_distribution = &mut field_distribution;
let document_ids = &mut document_ids;
let modified_docids = &mut modified_docids;
let extractor_handle =
Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.install(move || {
@ -151,6 +153,7 @@ where
field_distribution,
index_embeddings,
document_ids,
modified_docids,
)
})
.unwrap()
@ -225,6 +228,7 @@ where
embedders,
field_distribution,
document_ids,
modified_docids,
)?;
Ok(())

View File

@ -113,6 +113,7 @@ where
Ok(())
}
#[allow(clippy::too_many_arguments)]
pub(super) fn update_index(
index: &Index,
wtxn: &mut RwTxn<'_>,
@ -121,6 +122,7 @@ pub(super) fn update_index(
embedders: EmbeddingConfigs,
field_distribution: std::collections::BTreeMap<String, u64>,
document_ids: roaring::RoaringBitmap,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
if let Some(new_primary_key) = new_primary_key {
@ -132,6 +134,7 @@ pub(super) fn update_index(
index.put_field_distribution(wtxn, &field_distribution)?;
index.put_documents_ids(wtxn, &document_ids)?;
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
index.update_documents_stats(wtxn, modified_docids)?;
Ok(())
}

View File

@ -3,7 +3,7 @@ mod v1_13;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::V1_13_0_To_Current;
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Current};
use crate::progress::{Progress, VariableNameStep};
use crate::{Index, InternalError, Result};
@ -28,13 +28,18 @@ pub fn upgrade(
progress: Progress,
) -> Result<bool> {
let from = index.get_version(wtxn)?.unwrap_or(db_version);
let upgrade_functions: &[&dyn UpgradeIndex] =
&[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()];
let upgrade_functions: &[&dyn UpgradeIndex] = &[
&V1_12_To_V1_12_3 {},
&V1_12_3_To_V1_13_0 {},
&V1_13_0_To_V1_13_1 {},
&V1_13_1_To_Current {},
];
let start = match from {
(1, 12, 0..=2) => 0,
(1, 12, 3..) => 1,
(1, 13, 0) => 2,
(1, 13, 1) => 3,
// We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
(1, 13, _) => return Ok(false),
(major, minor, patch) => {

View File

@ -2,13 +2,44 @@ use heed::RwTxn;
use super::UpgradeIndex;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::database_stats::DatabaseStats;
use crate::progress::Progress;
use crate::{Index, Result};
use crate::{make_enum_progress, Index, Result};
#[allow(non_camel_case_types)]
pub(super) struct V1_13_0_To_Current();
pub(super) struct V1_13_0_To_V1_13_1();
impl UpgradeIndex for V1_13_0_To_Current {
impl UpgradeIndex for V1_13_0_To_V1_13_1 {
fn upgrade(
&self,
wtxn: &mut RwTxn,
index: &Index,
_original: (u32, u32, u32),
progress: Progress,
) -> Result<bool> {
make_enum_progress! {
enum DocumentsStats {
CreatingDocumentsStats,
}
};
// Create the new documents stats.
progress.update_progress(DocumentsStats::CreatingDocumentsStats);
let stats = DatabaseStats::new(index.documents.remap_types(), wtxn)?;
index.put_documents_stats(wtxn, stats)?;
Ok(true)
}
fn target_version(&self) -> (u32, u32, u32) {
(1, 13, 1)
}
}
#[allow(non_camel_case_types)]
pub(super) struct V1_13_1_To_Current();
impl UpgradeIndex for V1_13_1_To_Current {
fn upgrade(
&self,
_wtxn: &mut RwTxn,