mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Improve the performance of computing the size of the documents database
This commit is contained in:
		| @@ -1,8 +1,13 @@ | ||||
| use heed::types::Bytes; | ||||
| use std::mem; | ||||
|  | ||||
| use heed::Database; | ||||
| use heed::DatabaseStat; | ||||
| use heed::RoTxn; | ||||
| use heed::Unspecified; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::BEU32; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| /// The stats of a database. | ||||
| @@ -20,58 +25,24 @@ impl DatabaseStats { | ||||
|     /// | ||||
|     /// This function reads the page and entry counts reported by `database.stat` and derives | ||||
|     /// estimated key and value sizes from them, without iterating over the entries. | ||||
|     pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> { | ||||
|         let mut database_stats = | ||||
|             Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; | ||||
|     pub(crate) fn new( | ||||
|         database: Database<BEU32, Unspecified>, | ||||
|         rtxn: &RoTxn<'_>, | ||||
|     ) -> heed::Result<Self> { | ||||
|         let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } = | ||||
|             database.stat(rtxn)?; | ||||
|  | ||||
|         let mut iter = database.iter(rtxn)?; | ||||
|         while let Some((key, value)) = iter.next().transpose()? { | ||||
|             let key_size = key.len() as u64; | ||||
|             let value_size = value.len() as u64; | ||||
|             database_stats.total_key_size += key_size; | ||||
|             database_stats.total_value_size += value_size; | ||||
|         } | ||||
|         // We take the total size of all the pages: branch, leaf, and overflow pages (the overflow pages contain only values). | ||||
|         let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize; | ||||
|         // We compute an estimated size for the keys. | ||||
|         let total_key_size = entries * (mem::size_of::<u32>() + 4); | ||||
|         let total_value_size = total_size - total_key_size; | ||||
|  | ||||
|         database_stats.number_of_entries = database.len(rtxn)?; | ||||
|  | ||||
|         Ok(database_stats) | ||||
|     } | ||||
|  | ||||
|     /// Recomputes the stats of the database and returns the new stats. | ||||
|     /// | ||||
|     /// This function is used to update the stats of the database when some keys are modified. | ||||
|     /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. | ||||
|     pub(crate) fn recompute<I, K>( | ||||
|         mut stats: Self, | ||||
|         database: Database<Bytes, Bytes>, | ||||
|         before_rtxn: &RoTxn<'_>, | ||||
|         after_rtxn: &RoTxn<'_>, | ||||
|         modified_keys: I, | ||||
|     ) -> heed::Result<Self> | ||||
|     where | ||||
|         I: IntoIterator<Item = K>, | ||||
|         K: AsRef<[u8]>, | ||||
|     { | ||||
|         for key in modified_keys { | ||||
|             let key = key.as_ref(); | ||||
|             if let Some(value) = database.get(after_rtxn, key)? { | ||||
|                 let key_size = key.len() as u64; | ||||
|                 let value_size = value.len() as u64; | ||||
|                 stats.total_key_size = stats.total_key_size.saturating_add(key_size); | ||||
|                 stats.total_value_size = stats.total_value_size.saturating_add(value_size); | ||||
|             } | ||||
|  | ||||
|             if let Some(value) = database.get(before_rtxn, key)? { | ||||
|                 let key_size = key.len() as u64; | ||||
|                 let value_size = value.len() as u64; | ||||
|                 stats.total_key_size = stats.total_key_size.saturating_sub(key_size); | ||||
|                 stats.total_value_size = stats.total_value_size.saturating_sub(value_size); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         stats.number_of_entries = database.len(after_rtxn)?; | ||||
|  | ||||
|         Ok(stats) | ||||
|         Ok(Self { | ||||
|             number_of_entries: entries as u64, | ||||
|             total_key_size: total_key_size as u64, | ||||
|             total_value_size: total_value_size as u64, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn average_key_size(&self) -> u64 { | ||||
| @@ -86,6 +57,10 @@ impl DatabaseStats { | ||||
|         self.number_of_entries | ||||
|     } | ||||
|  | ||||
|     pub fn total_size(&self) -> u64 { | ||||
|         self.total_key_size + self.total_value_size | ||||
|     } | ||||
|  | ||||
|     pub fn total_key_size(&self) -> u64 { | ||||
|         self.total_key_size | ||||
|     } | ||||
|   | ||||
| @@ -411,38 +411,6 @@ impl Index { | ||||
|         Ok(count.unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     /// Updates the stats of the documents database based on the previous stats and the modified docids. | ||||
|     pub fn update_documents_stats( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn<'_>, | ||||
|         modified_docids: roaring::RoaringBitmap, | ||||
|     ) -> Result<()> { | ||||
|         let before_rtxn = self.read_txn()?; | ||||
|         let document_stats = match self.documents_stats(&before_rtxn)? { | ||||
|             Some(before_stats) => DatabaseStats::recompute( | ||||
|                 before_stats, | ||||
|                 self.documents.remap_types(), | ||||
|                 &before_rtxn, | ||||
|                 wtxn, | ||||
|                 modified_docids.iter().map(|docid| docid.to_be_bytes()), | ||||
|             )?, | ||||
|             None => { | ||||
|                 // This should never happen when there are already documents in the index, the documents stats should be present. | ||||
|                 // If it happens, it means that the index was not properly initialized/upgraded. | ||||
|                 debug_assert_eq!( | ||||
|                     self.documents.len(&before_rtxn)?, | ||||
|                     0, | ||||
|                     "The documents stats should be present when there are documents in the index" | ||||
|                 ); | ||||
|                 tracing::warn!("No documents stats found, creating new ones"); | ||||
|                 DatabaseStats::new(self.documents.remap_types(), &*wtxn)? | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         self.put_documents_stats(wtxn, document_stats)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Writes the stats of the documents database. | ||||
|     pub fn put_documents_stats( | ||||
|         &self, | ||||
|   | ||||
| @@ -28,6 +28,7 @@ pub use self::helpers::*; | ||||
| pub use self::transform::{Transform, TransformOutput}; | ||||
| use super::facet::clear_facet_levels_based_on_settings_diff; | ||||
| use super::new::StdResult; | ||||
| use crate::database_stats::DatabaseStats; | ||||
| use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||
| use crate::error::{Error, InternalError}; | ||||
| use crate::index::{PrefixSearch, PrefixSettings}; | ||||
| @@ -476,7 +477,8 @@ where | ||||
|  | ||||
|         if !settings_diff.settings_update_only { | ||||
|             // Update the stats of the documents database when there is a document update. | ||||
|             self.index.update_documents_stats(self.wtxn, modified_docids)?; | ||||
|             let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?; | ||||
|             self.index.put_documents_stats(self.wtxn, stats)?; | ||||
|         } | ||||
|         // We write the field distribution into the main database | ||||
|         self.index.put_field_distribution(self.wtxn, &field_distribution)?; | ||||
|   | ||||
| @@ -234,7 +234,6 @@ where | ||||
|         embedders, | ||||
|         field_distribution, | ||||
|         document_ids, | ||||
|         modified_docids, | ||||
|     )?; | ||||
|  | ||||
|     Ok(congestion) | ||||
|   | ||||
| @@ -7,6 +7,7 @@ use rand::SeedableRng as _; | ||||
| use time::OffsetDateTime; | ||||
|  | ||||
| use super::super::channel::*; | ||||
| use crate::database_stats::DatabaseStats; | ||||
| use crate::documents::PrimaryKey; | ||||
| use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; | ||||
| use crate::index::IndexEmbeddingConfig; | ||||
| @@ -142,7 +143,6 @@ pub(super) fn update_index( | ||||
|     embedders: EmbeddingConfigs, | ||||
|     field_distribution: std::collections::BTreeMap<String, u64>, | ||||
|     document_ids: roaring::RoaringBitmap, | ||||
|     modified_docids: roaring::RoaringBitmap, | ||||
| ) -> Result<()> { | ||||
|     index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; | ||||
|     if let Some(new_primary_key) = new_primary_key { | ||||
| @@ -153,7 +153,8 @@ pub(super) fn update_index( | ||||
|     index.put_field_distribution(wtxn, &field_distribution)?; | ||||
|     index.put_documents_ids(wtxn, &document_ids)?; | ||||
|     index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; | ||||
|     index.update_documents_stats(wtxn, modified_docids)?; | ||||
|     let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?; | ||||
|     index.put_documents_stats(wtxn, stats)?; | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user