mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Merge #5341
5341: Embeddings stats r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #5321

## What does this PR do?
- Add embedding stats
- force dumpless upgrade to recompute stats
- add tests

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
		| @@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec; | ||||
| use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; | ||||
| use crate::order_by_map::OrderByMap; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; | ||||
| use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; | ||||
| use crate::{ | ||||
|     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, | ||||
|     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, | ||||
| @@ -1731,6 +1731,18 @@ impl Index { | ||||
|         let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); | ||||
|         Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) | ||||
|     } | ||||
|  | ||||
|     pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> { | ||||
|         let mut stats = ArroyStats::default(); | ||||
|         let embedding_configs = self.embedding_configs(rtxn)?; | ||||
|         for config in embedding_configs { | ||||
|             let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); | ||||
|             let reader = | ||||
|                 ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); | ||||
|             reader.aggregate_stats(rtxn, &mut stats)?; | ||||
|         } | ||||
|         Ok(stats) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Deserialize, Serialize)] | ||||
|   | ||||
| @@ -1,7 +1,9 @@ | ||||
| mod v1_12; | ||||
| mod v1_13; | ||||
|  | ||||
| use heed::RwTxn; | ||||
| use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3}; | ||||
| use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; | ||||
| use v1_13::V1_13_0_To_Current; | ||||
|  | ||||
| use crate::progress::{Progress, VariableNameStep}; | ||||
| use crate::{Index, InternalError, Result}; | ||||
| @@ -26,11 +28,13 @@ pub fn upgrade( | ||||
|     progress: Progress, | ||||
| ) -> Result<bool> { | ||||
|     let from = index.get_version(wtxn)?.unwrap_or(db_version); | ||||
|     let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()]; | ||||
|     let upgrade_functions: &[&dyn UpgradeIndex] = | ||||
|         &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; | ||||
|  | ||||
|     let start = match from { | ||||
|         (1, 12, 0..=2) => 0, | ||||
|         (1, 12, 3..) => 1, | ||||
|         (1, 13, 0) => 2, | ||||
|         // We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others. | ||||
|         (1, 13, _) => return Ok(false), | ||||
|         (major, minor, patch) => { | ||||
|   | ||||
| @@ -1,7 +1,6 @@ | ||||
| use heed::RwTxn; | ||||
|  | ||||
| use super::UpgradeIndex; | ||||
| use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; | ||||
| use crate::progress::Progress; | ||||
| use crate::{make_enum_progress, Index, Result}; | ||||
|  | ||||
| @@ -32,9 +31,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 { | ||||
| } | ||||
|  | ||||
| #[allow(non_camel_case_types)] | ||||
| pub(super) struct V1_12_3_To_Current(); | ||||
| pub(super) struct V1_12_3_To_V1_13_0 {} | ||||
|  | ||||
| impl UpgradeIndex for V1_12_3_To_Current { | ||||
| impl UpgradeIndex for V1_12_3_To_V1_13_0 { | ||||
|     fn upgrade( | ||||
|         &self, | ||||
|         _wtxn: &mut RwTxn, | ||||
| @@ -42,14 +41,11 @@ impl UpgradeIndex for V1_12_3_To_Current { | ||||
|         _original: (u32, u32, u32), | ||||
|         _progress: Progress, | ||||
|     ) -> Result<bool> { | ||||
|         Ok(false) | ||||
|         // recompute the indexes stats | ||||
|         Ok(true) | ||||
|     } | ||||
|  | ||||
|     fn target_version(&self) -> (u32, u32, u32) { | ||||
|         ( | ||||
|             VERSION_MAJOR.parse().unwrap(), | ||||
|             VERSION_MINOR.parse().unwrap(), | ||||
|             VERSION_PATCH.parse().unwrap(), | ||||
|         ) | ||||
|         (1, 13, 0) | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										29
									
								
								crates/milli/src/update/upgrade/v1_13.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								crates/milli/src/update/upgrade/v1_13.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| use heed::RwTxn; | ||||
|  | ||||
| use super::UpgradeIndex; | ||||
| use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; | ||||
| use crate::progress::Progress; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| #[allow(non_camel_case_types)] | ||||
| pub(super) struct V1_13_0_To_Current(); | ||||
|  | ||||
| impl UpgradeIndex for V1_13_0_To_Current { | ||||
|     fn upgrade( | ||||
|         &self, | ||||
|         _wtxn: &mut RwTxn, | ||||
|         _index: &Index, | ||||
|         _original: (u32, u32, u32), | ||||
|         _progress: Progress, | ||||
|     ) -> Result<bool> { | ||||
|         Ok(false) | ||||
|     } | ||||
|  | ||||
|     fn target_version(&self) -> (u32, u32, u32) { | ||||
|         ( | ||||
|             VERSION_MAJOR.parse().unwrap(), | ||||
|             VERSION_MINOR.parse().unwrap(), | ||||
|             VERSION_PATCH.parse().unwrap(), | ||||
|         ) | ||||
|     } | ||||
| } | ||||
| @@ -410,8 +410,43 @@ impl ArroyWrapper { | ||||
|     fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> { | ||||
|         self.database.remap_data_type() | ||||
|     } | ||||
|  | ||||
|     pub fn aggregate_stats( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         stats: &mut ArroyStats, | ||||
|     ) -> Result<(), arroy::Error> { | ||||
|         if self.quantized { | ||||
|             for reader in self.readers(rtxn, self.quantized_db()) { | ||||
|                 let reader = reader?; | ||||
|                 let documents = reader.item_ids(); | ||||
|                 if documents.is_empty() { | ||||
|                     break; | ||||
|                 } | ||||
|                 stats.documents |= documents; | ||||
|                 stats.number_of_embeddings += documents.len(); | ||||
|             } | ||||
|         } else { | ||||
|             for reader in self.readers(rtxn, self.angular_db()) { | ||||
|                 let reader = reader?; | ||||
|                 let documents = reader.item_ids(); | ||||
|                 if documents.is_empty() { | ||||
|                     break; | ||||
|                 } | ||||
|                 stats.documents |= documents; | ||||
|                 stats.number_of_embeddings += documents.len(); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Default, Clone)] | ||||
| pub struct ArroyStats { | ||||
|     pub number_of_embeddings: u64, | ||||
|     pub documents: RoaringBitmap, | ||||
| } | ||||
| /// One or multiple embeddings stored consecutively in a flat vector. | ||||
| pub struct Embeddings<F> { | ||||
|     data: Vec<F>, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user