mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 01:46:28 +00:00 
			
		
		
		
	Add embedders stats
This commit is contained in:
		@@ -106,6 +106,12 @@ pub struct IndexStats {
 | 
			
		||||
    /// are not returned to the disk after a deletion, this number is typically larger than
 | 
			
		||||
    /// `used_database_size` that only includes the size of the used pages.
 | 
			
		||||
    pub database_size: u64,
 | 
			
		||||
    /// Number of embeddings in the index.
 | 
			
		||||
    /// Optional for backward compatibility with the stats stored by pre-v1.13.0 versions of Meilisearch.
 | 
			
		||||
    pub number_of_embeddings: Option<u64>,
 | 
			
		||||
    /// Number of embedded documents in the index.
 | 
			
		||||
    /// Optional for backward compatibility with the stats stored by pre-v1.13.0 versions of Meilisearch.
 | 
			
		||||
    pub number_of_embedded_documents: Option<u64>,
 | 
			
		||||
    /// Size taken by the used pages of the index' DB, in bytes.
 | 
			
		||||
    ///
 | 
			
		||||
    /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
 | 
			
		||||
@@ -130,8 +136,11 @@ impl IndexStats {
 | 
			
		||||
    ///
 | 
			
		||||
    /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
 | 
			
		||||
    pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
 | 
			
		||||
        let arroy_stats = index.arroy_stats(rtxn)?;
 | 
			
		||||
        Ok(IndexStats {
 | 
			
		||||
            number_of_documents: index.number_of_documents(rtxn)?,
 | 
			
		||||
            number_of_embeddings: Some(arroy_stats.number_of_embeddings),
 | 
			
		||||
            number_of_embedded_documents: Some(arroy_stats.documents.len()),
 | 
			
		||||
            database_size: index.on_disk_size()?,
 | 
			
		||||
            used_database_size: index.used_size()?,
 | 
			
		||||
            primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
 | 
			
		||||
 
 | 
			
		||||
@@ -496,6 +496,12 @@ pub struct IndexStats {
 | 
			
		||||
    pub number_of_documents: u64,
 | 
			
		||||
    /// Whether or not the index is currently ingesting documents
 | 
			
		||||
    pub is_indexing: bool,
 | 
			
		||||
    /// Number of embeddings in the index
 | 
			
		||||
    #[serde(skip_serializing_if = "Option::is_none")]
 | 
			
		||||
    pub number_of_embeddings: Option<u64>,
 | 
			
		||||
    /// Number of embedded documents in the index
 | 
			
		||||
    #[serde(skip_serializing_if = "Option::is_none")]
 | 
			
		||||
    pub number_of_embedded_documents: Option<u64>,
 | 
			
		||||
    /// Association of every field name with the number of times it occurs in the documents.
 | 
			
		||||
    #[schema(value_type = HashMap<String, u64>)]
 | 
			
		||||
    pub field_distribution: FieldDistribution,
 | 
			
		||||
@@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
 | 
			
		||||
        IndexStats {
 | 
			
		||||
            number_of_documents: stats.inner_stats.number_of_documents,
 | 
			
		||||
            is_indexing: stats.is_indexing,
 | 
			
		||||
            number_of_embeddings: stats.inner_stats.number_of_embeddings,
 | 
			
		||||
            number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,
 | 
			
		||||
            field_distribution: stats.inner_stats.field_distribution,
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
@@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
 | 
			
		||||
        (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!(
 | 
			
		||||
            {
 | 
			
		||||
                "numberOfDocuments": 10,
 | 
			
		||||
                "numberOfEmbeddings": 10,
 | 
			
		||||
                "numberOfEmbeddedDocuments": 10,
 | 
			
		||||
                "isIndexing": true,
 | 
			
		||||
                "fieldDistribution": {
 | 
			
		||||
                    "genre": 10,
 | 
			
		||||
 
 | 
			
		||||
@@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
 | 
			
		||||
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
 | 
			
		||||
use crate::order_by_map::OrderByMap;
 | 
			
		||||
use crate::proximity::ProximityPrecision;
 | 
			
		||||
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
 | 
			
		||||
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
 | 
			
		||||
use crate::{
 | 
			
		||||
    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
 | 
			
		||||
    FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
 | 
			
		||||
@@ -1731,6 +1731,18 @@ impl Index {
 | 
			
		||||
        let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
 | 
			
		||||
        Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
 | 
			
		||||
        let mut stats = ArroyStats::default();
 | 
			
		||||
        let embedding_configs = self.embedding_configs(rtxn)?;
 | 
			
		||||
        for config in embedding_configs {
 | 
			
		||||
            let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
 | 
			
		||||
            let reader =
 | 
			
		||||
                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
 | 
			
		||||
            reader.aggregate_stats(rtxn, &mut stats)?;
 | 
			
		||||
        }
 | 
			
		||||
        Ok(stats)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Deserialize, Serialize)]
 | 
			
		||||
 
 | 
			
		||||
@@ -410,8 +410,43 @@ impl ArroyWrapper {
 | 
			
		||||
    fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
 | 
			
		||||
        self.database.remap_data_type()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    pub fn aggregate_stats(
 | 
			
		||||
        &self,
 | 
			
		||||
        rtxn: &RoTxn,
 | 
			
		||||
        stats: &mut ArroyStats,
 | 
			
		||||
    ) -> Result<(), arroy::Error> {
 | 
			
		||||
        if self.quantized {
 | 
			
		||||
            for reader in self.readers(rtxn, self.quantized_db()) {
 | 
			
		||||
                let reader = reader?;
 | 
			
		||||
                let documents = reader.item_ids();
 | 
			
		||||
                if documents.is_empty() {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
                stats.documents |= documents;
 | 
			
		||||
                stats.number_of_embeddings += documents.len() as u64;
 | 
			
		||||
            }
 | 
			
		||||
        } else {
 | 
			
		||||
            for reader in self.readers(rtxn, self.angular_db()) {
 | 
			
		||||
                let reader = reader?;
 | 
			
		||||
                let documents = reader.item_ids();
 | 
			
		||||
                if documents.is_empty() {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
                stats.documents |= documents;
 | 
			
		||||
                stats.number_of_embeddings += documents.len() as u64;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Default, Clone)]
 | 
			
		||||
pub struct ArroyStats {
 | 
			
		||||
    pub number_of_embeddings: u64,
 | 
			
		||||
    pub documents: RoaringBitmap,
 | 
			
		||||
}
 | 
			
		||||
/// One or multiple embeddings stored consecutively in a flat vector.
 | 
			
		||||
pub struct Embeddings<F> {
 | 
			
		||||
    data: Vec<F>,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user