mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Add document database stats
This commit is contained in:
		| @@ -6,6 +6,7 @@ use std::{fs, thread}; | ||||
| use meilisearch_types::heed::types::{SerdeJson, Str}; | ||||
| use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; | ||||
| use meilisearch_types::milli; | ||||
| use meilisearch_types::milli::database_stats::DatabaseStats; | ||||
| use meilisearch_types::milli::update::IndexerConfig; | ||||
| use meilisearch_types::milli::{FieldDistribution, Index}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
| @@ -98,8 +99,9 @@ pub enum IndexStatus { | ||||
| /// The statistics that can be computed from an `Index` object. | ||||
| #[derive(Serialize, Deserialize, Debug)] | ||||
| pub struct IndexStats { | ||||
|     /// Number of documents in the index. | ||||
|     pub number_of_documents: u64, | ||||
|     /// Stats of the documents database. | ||||
|     #[serde(default)] | ||||
|     pub documents_database_stats: DatabaseStats, | ||||
|     /// Size taken up by the index' DB, in bytes. | ||||
|     /// | ||||
|     /// This includes the size taken by both the used and free pages of the DB, and as the free pages | ||||
| @@ -138,9 +140,9 @@ impl IndexStats { | ||||
|     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> { | ||||
|         let arroy_stats = index.arroy_stats(rtxn)?; | ||||
|         Ok(IndexStats { | ||||
|             number_of_documents: index.number_of_documents(rtxn)?, | ||||
|             number_of_embeddings: Some(arroy_stats.number_of_embeddings), | ||||
|             number_of_embedded_documents: Some(arroy_stats.documents.len()), | ||||
|             documents_database_stats: index.documents_database_stats(rtxn)?, | ||||
|             database_size: index.on_disk_size()?, | ||||
|             used_database_size: index.used_size()?, | ||||
|             primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), | ||||
|   | ||||
| @@ -370,7 +370,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { | ||||
|         let stats = mapper.stats_of(rtxn, &name).unwrap(); | ||||
|         s.push_str(&format!( | ||||
|             "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", | ||||
|             stats.number_of_documents, stats.field_distribution | ||||
|             stats.documents_database_stats.number_of_entries(), | ||||
|             stats.field_distribution | ||||
|         )); | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -910,7 +910,15 @@ fn create_and_list_index() { | ||||
|         [ | ||||
|           "kefir", | ||||
|           { | ||||
|             "number_of_documents": 0, | ||||
|             "documents_database_stats": { | ||||
|               "numberOfEntries": 0, | ||||
|               "totalKeySize": 0, | ||||
|               "totalValueSize": 0, | ||||
|               "maxKeySize": 0, | ||||
|               "maxValueSize": 0, | ||||
|               "minKeySize": 0, | ||||
|               "minValueSize": 0 | ||||
|             }, | ||||
|             "database_size": "[bytes]", | ||||
|             "number_of_embeddings": 0, | ||||
|             "number_of_embedded_documents": 0, | ||||
|   | ||||
| @@ -494,6 +494,12 @@ pub async fn delete_index( | ||||
| pub struct IndexStats { | ||||
|     /// Number of documents in the index | ||||
|     pub number_of_documents: u64, | ||||
|     /// Size of the documents database, in bytes. | ||||
|     pub raw_document_db_size: u64, | ||||
|     /// Maximum size of a document in the documents database. | ||||
|     pub max_document_size: u64, | ||||
|     /// Average size of a document in the documents database. | ||||
|     pub avg_document_size: u64, | ||||
|     /// Whether or not the index is currently ingesting document | ||||
|     pub is_indexing: bool, | ||||
|     /// Number of embeddings in the index | ||||
| @@ -510,7 +516,10 @@ pub struct IndexStats { | ||||
| impl From<index_scheduler::IndexStats> for IndexStats { | ||||
|     fn from(stats: index_scheduler::IndexStats) -> Self { | ||||
|         IndexStats { | ||||
|             number_of_documents: stats.inner_stats.number_of_documents, | ||||
|             number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), | ||||
|             raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), | ||||
|             max_document_size: stats.inner_stats.documents_database_stats.max_value_size(), | ||||
|             avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), | ||||
|             is_indexing: stats.is_indexing, | ||||
|             number_of_embeddings: stats.inner_stats.number_of_embeddings, | ||||
|             number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, | ||||
|   | ||||
| @@ -160,6 +160,9 @@ async fn delete_document_by_filter() { | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 4, | ||||
|       "rawDocumentDbSize": 42, | ||||
|       "maxDocumentSize": 13, | ||||
|       "avgDocumentSize": 10, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
| @@ -209,6 +212,9 @@ async fn delete_document_by_filter() { | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "rawDocumentDbSize": 16, | ||||
|       "maxDocumentSize": 12, | ||||
|       "avgDocumentSize": 8, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
| @@ -277,6 +283,9 @@ async fn delete_document_by_filter() { | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 1, | ||||
|       "rawDocumentDbSize": 12, | ||||
|       "maxDocumentSize": 12, | ||||
|       "avgDocumentSize": 12, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|   | ||||
| @@ -187,6 +187,9 @@ async fn import_dump_v1_movie_with_settings() { | ||||
|         @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "rawDocumentDbSize": 21965, | ||||
|       "maxDocumentSize": 743, | ||||
|       "avgDocumentSize": 414, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|   | ||||
							
								
								
									
										100
									
								
								crates/milli/src/database_stats.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								crates/milli/src/database_stats.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,100 @@ | ||||
| use heed::types::Bytes; | ||||
| use heed::Database; | ||||
| use heed::RoTxn; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::Result; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| /// The stats of a database. | ||||
| /// | ||||
| /// All sizes are byte lengths, measured on the raw keys and values while | ||||
| /// scanning the database. Fields are serialized with camelCase names, and the | ||||
| /// derived `Default` value has every field set to `0`. | ||||
| pub struct DatabaseStats { | ||||
|     /// The number of entries in the database. | ||||
|     number_of_entries: u64, | ||||
|     /// The total size of the keys in the database, in bytes. | ||||
|     total_key_size: u64, | ||||
|     /// The total size of the values in the database, in bytes. | ||||
|     total_value_size: u64, | ||||
|     /// The maximum size of a key in the database, in bytes. | ||||
|     max_key_size: u64, | ||||
|     /// The maximum size of a value in the database, in bytes. | ||||
|     max_value_size: u64, | ||||
|     /// The minimum size of a key in the database, in bytes (`0` when the database is empty). | ||||
|     min_key_size: u64, | ||||
|     /// The minimum size of a value in the database, in bytes (`0` when the database is empty). | ||||
|     min_value_size: u64, | ||||
| } | ||||
|  | ||||
| impl DatabaseStats { | ||||
|     /// Computes the stats of `database` by walking every entry visible through `rtxn`. | ||||
|     /// | ||||
|     /// Every call performs a full scan of the database, so the result is costly to | ||||
|     /// produce and should be cached by the caller rather than recomputed on demand. | ||||
|     /// | ||||
|     /// # Errors | ||||
|     /// | ||||
|     /// Fails if the iterator cannot be opened or if reading an entry fails. | ||||
|     pub(crate) fn new<'a>(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'a>) -> Result<Self> { | ||||
|         let mut entries = 0u64; | ||||
|         let mut key_bytes = 0u64; | ||||
|         let mut value_bytes = 0u64; | ||||
|         let mut largest_key = 0u64; | ||||
|         let mut largest_value = 0u64; | ||||
|         // The minimums stay `None` until a first entry is seen, so that an empty | ||||
|         // database reports `0` for both of them. | ||||
|         let mut smallest_key: Option<u64> = None; | ||||
|         let mut smallest_value: Option<u64> = None; | ||||
|  | ||||
|         for entry in database.iter(rtxn)? { | ||||
|             let (key, value) = entry?; | ||||
|             let key_len = key.len() as u64; | ||||
|             let value_len = value.len() as u64; | ||||
|  | ||||
|             entries += 1; | ||||
|             key_bytes += key_len; | ||||
|             value_bytes += value_len; | ||||
|             largest_key = largest_key.max(key_len); | ||||
|             largest_value = largest_value.max(value_len); | ||||
|             smallest_key = Some(smallest_key.map_or(key_len, |m| m.min(key_len))); | ||||
|             smallest_value = Some(smallest_value.map_or(value_len, |m| m.min(value_len))); | ||||
|         } | ||||
|  | ||||
|         Ok(Self { | ||||
|             number_of_entries: entries, | ||||
|             total_key_size: key_bytes, | ||||
|             total_value_size: value_bytes, | ||||
|             max_key_size: largest_key, | ||||
|             max_value_size: largest_value, | ||||
|             min_key_size: smallest_key.unwrap_or(0), | ||||
|             min_value_size: smallest_value.unwrap_or(0), | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     /// Returns the average size of a key in the database, in bytes. | ||||
|     /// | ||||
|     /// The result is truncated by integer division. An empty database has no | ||||
|     /// entry to average over, so `0` is returned instead of panicking on a | ||||
|     /// division by zero. | ||||
|     pub fn average_key_size(&self) -> u64 { | ||||
|         self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0) | ||||
|     } | ||||
|  | ||||
|     /// Returns the average size of a value in the database, in bytes. | ||||
|     /// | ||||
|     /// The result is truncated by integer division. An empty database has no | ||||
|     /// entry to average over, so `0` is returned instead of panicking on a | ||||
|     /// division by zero. | ||||
|     pub fn average_value_size(&self) -> u64 { | ||||
|         self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0) | ||||
|     } | ||||
|  | ||||
|     /// Returns the number of entries in the database. | ||||
|     pub fn number_of_entries(&self) -> u64 { | ||||
|         self.number_of_entries | ||||
|     } | ||||
|  | ||||
|     /// Returns the total size of the keys in the database, in bytes. | ||||
|     pub fn total_key_size(&self) -> u64 { | ||||
|         self.total_key_size | ||||
|     } | ||||
|  | ||||
|     /// Returns the total size of the values in the database, in bytes. | ||||
|     pub fn total_value_size(&self) -> u64 { | ||||
|         self.total_value_size | ||||
|     } | ||||
|  | ||||
|     /// Returns the size of the largest key in the database, in bytes. | ||||
|     pub fn max_key_size(&self) -> u64 { | ||||
|         self.max_key_size | ||||
|     } | ||||
|  | ||||
|     /// Returns the size of the largest value in the database, in bytes. | ||||
|     pub fn max_value_size(&self) -> u64 { | ||||
|         self.max_value_size | ||||
|     } | ||||
|  | ||||
|     /// Returns the size of the smallest key in the database, in bytes. | ||||
|     pub fn min_key_size(&self) -> u64 { | ||||
|         self.min_key_size | ||||
|     } | ||||
|  | ||||
|     /// Returns the size of the smallest value in the database, in bytes. | ||||
|     pub fn min_value_size(&self) -> u64 { | ||||
|         self.min_value_size | ||||
|     } | ||||
| } | ||||
| @@ -11,6 +11,7 @@ use rstar::RTree; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; | ||||
| use crate::database_stats::DatabaseStats; | ||||
| use crate::documents::PrimaryKey; | ||||
| use crate::error::{InternalError, UserError}; | ||||
| use crate::fields_ids_map::FieldsIdsMap; | ||||
| @@ -403,6 +404,11 @@ impl Index { | ||||
|         Ok(count.unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     /// Returns the stats of the documents database. | ||||
|     /// | ||||
|     /// The stats are computed by `DatabaseStats::new`, which iterates over the whole | ||||
|     /// database: the call is not cheap and its result is worth caching. | ||||
|     pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result<DatabaseStats> { | ||||
|         Ok(DatabaseStats::new(self.documents.remap_types::<Bytes, Bytes>(), rtxn)?) | ||||
|     } | ||||
|  | ||||
|     /* primary key */ | ||||
|  | ||||
|     /// Writes the documents primary key, this is the field name that is used to store the id. | ||||
|   | ||||
| @@ -10,6 +10,7 @@ pub mod documents; | ||||
|  | ||||
| mod asc_desc; | ||||
| mod criterion; | ||||
| pub mod database_stats; | ||||
| mod error; | ||||
| mod external_documents_ids; | ||||
| pub mod facet; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user