mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	Merge #5341
5341: Embeddings stats r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #5321

## What does this PR do?
- Add embedding stats
- Force dumpless upgrade to recompute stats
- Add tests

Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
		| @@ -106,6 +106,12 @@ pub struct IndexStats { | ||||
|     /// are not returned to the disk after a deletion, this number is typically larger than | ||||
|     /// `used_database_size` that only includes the size of the used pages. | ||||
|     pub database_size: u64, | ||||
|     /// Number of embeddings in the index. | ||||
|     /// Optional for backward compatibility with the stats of pre-v1.13.0 versions of Meilisearch. | ||||
|     pub number_of_embeddings: Option<u64>, | ||||
|     /// Number of embedded documents in the index. | ||||
|     /// Optional for backward compatibility with the stats of pre-v1.13.0 versions of Meilisearch. | ||||
|     pub number_of_embedded_documents: Option<u64>, | ||||
|     /// Size taken by the used pages of the index' DB, in bytes. | ||||
|     /// | ||||
|     /// As the DB backend does not return to the disk the pages that are not currently used by the DB, | ||||
| @@ -130,8 +136,11 @@ impl IndexStats { | ||||
|     /// | ||||
|     /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. | ||||
|     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> { | ||||
|         let arroy_stats = index.arroy_stats(rtxn)?; | ||||
|         Ok(IndexStats { | ||||
|             number_of_documents: index.number_of_documents(rtxn)?, | ||||
|             number_of_embeddings: Some(arroy_stats.number_of_embeddings), | ||||
|             number_of_embedded_documents: Some(arroy_stats.documents.len()), | ||||
|             database_size: index.on_disk_size()?, | ||||
|             used_database_size: index.used_size()?, | ||||
|             primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), | ||||
|   | ||||
| @@ -903,7 +903,7 @@ fn create_and_list_index() { | ||||
|  | ||||
|     index_scheduler.index("kefir").unwrap(); | ||||
|     let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap(); | ||||
|     snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#" | ||||
|     snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###" | ||||
|     [ | ||||
|       1, | ||||
|       [ | ||||
| @@ -912,6 +912,8 @@ fn create_and_list_index() { | ||||
|           { | ||||
|             "number_of_documents": 0, | ||||
|             "database_size": "[bytes]", | ||||
|             "number_of_embeddings": 0, | ||||
|             "number_of_embedded_documents": 0, | ||||
|             "used_database_size": "[bytes]", | ||||
|             "primary_key": null, | ||||
|             "field_distribution": {}, | ||||
| @@ -921,5 +923,5 @@ fn create_and_list_index() { | ||||
|         ] | ||||
|       ] | ||||
|     ] | ||||
|     "#); | ||||
|     "###); | ||||
| } | ||||
|   | ||||
| @@ -496,6 +496,12 @@ pub struct IndexStats { | ||||
|     pub number_of_documents: u64, | ||||
|     /// Whether or not the index is currently ingesting documents | ||||
|     pub is_indexing: bool, | ||||
|     /// Number of embeddings in the index | ||||
|     #[serde(skip_serializing_if = "Option::is_none")] | ||||
|     pub number_of_embeddings: Option<u64>, | ||||
|     /// Number of embedded documents in the index | ||||
|     #[serde(skip_serializing_if = "Option::is_none")] | ||||
|     pub number_of_embedded_documents: Option<u64>, | ||||
|     /// Association of every field name with the number of times it occurs in the documents. | ||||
|     #[schema(value_type = HashMap<String, u64>)] | ||||
|     pub field_distribution: FieldDistribution, | ||||
| @@ -506,6 +512,8 @@ impl From<index_scheduler::IndexStats> for IndexStats { | ||||
|         IndexStats { | ||||
|             number_of_documents: stats.inner_stats.number_of_documents, | ||||
|             is_indexing: stats.is_indexing, | ||||
|             number_of_embeddings: stats.inner_stats.number_of_embeddings, | ||||
|             number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, | ||||
|             field_distribution: stats.inner_stats.field_distribution, | ||||
|         } | ||||
|     } | ||||
| @@ -524,6 +532,8 @@ impl From<index_scheduler::IndexStats> for IndexStats { | ||||
|         (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( | ||||
|             { | ||||
|                 "numberOfDocuments": 10, | ||||
|                 "numberOfEmbeddings": 10, | ||||
|                 "numberOfEmbeddedDocuments": 10, | ||||
|                 "isIndexing": true, | ||||
|                 "fieldDistribution": { | ||||
|                     "genre": 10, | ||||
|   | ||||
| @@ -161,6 +161,8 @@ async fn delete_document_by_filter() { | ||||
|     { | ||||
|       "numberOfDocuments": 4, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "color": 3, | ||||
|         "id": 4 | ||||
| @@ -208,6 +210,8 @@ async fn delete_document_by_filter() { | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "color": 1, | ||||
|         "id": 2 | ||||
| @@ -274,6 +278,8 @@ async fn delete_document_by_filter() { | ||||
|     { | ||||
|       "numberOfDocuments": 1, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "color": 1, | ||||
|         "id": 1 | ||||
|   | ||||
| @@ -27,9 +27,24 @@ async fn import_dump_v1_movie_raw() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -173,6 +188,8 @@ async fn import_dump_v1_movie_with_settings() { | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
| @@ -333,9 +350,24 @@ async fn import_dump_v1_rubygems_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "description": 53, | ||||
|         "id": 53, | ||||
|         "name": 53, | ||||
|         "summary": 53, | ||||
|         "total_downloads": 53, | ||||
|         "version": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -483,9 +515,24 @@ async fn import_dump_v2_movie_raw() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -623,9 +670,24 @@ async fn import_dump_v2_movie_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -773,9 +835,24 @@ async fn import_dump_v2_rubygems_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "description": 53, | ||||
|         "id": 53, | ||||
|         "name": 53, | ||||
|         "summary": 53, | ||||
|         "total_downloads": 53, | ||||
|         "version": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -920,9 +997,24 @@ async fn import_dump_v3_movie_raw() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1060,9 +1152,24 @@ async fn import_dump_v3_movie_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1210,9 +1317,24 @@ async fn import_dump_v3_rubygems_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "description": 53, | ||||
|         "id": 53, | ||||
|         "name": 53, | ||||
|         "summary": 53, | ||||
|         "total_downloads": 53, | ||||
|         "version": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1357,9 +1479,24 @@ async fn import_dump_v4_movie_raw() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1497,9 +1634,24 @@ async fn import_dump_v4_movie_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "genres": 53, | ||||
|         "id": 53, | ||||
|         "overview": 53, | ||||
|         "poster": 53, | ||||
|         "release_date": 53, | ||||
|         "title": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1647,9 +1799,24 @@ async fn import_dump_v4_rubygems_with_settings() { | ||||
|  | ||||
|     let (stats, code) = index.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!( | ||||
|         stats, | ||||
|         json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 53, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "description": 53, | ||||
|         "id": 53, | ||||
|         "name": 53, | ||||
|         "summary": 53, | ||||
|         "total_downloads": 53, | ||||
|         "version": 53 | ||||
|       } | ||||
|     } | ||||
|     "### | ||||
|     ); | ||||
|  | ||||
|     let (settings, code) = index.settings().await; | ||||
| @@ -1798,33 +1965,35 @@ async fn import_dump_v5() { | ||||
|         server.wait_task(task["uid"].as_u64().unwrap()).await; | ||||
|     } | ||||
|  | ||||
|     let expected_stats = json!({ | ||||
|         "numberOfDocuments": 10, | ||||
|         "isIndexing": false, | ||||
|         "fieldDistribution": { | ||||
|             "cast": 10, | ||||
|             "director": 10, | ||||
|             "genres": 10, | ||||
|             "id": 10, | ||||
|             "overview": 10, | ||||
|             "popularity": 10, | ||||
|             "poster_path": 10, | ||||
|             "producer": 10, | ||||
|             "production_companies": 10, | ||||
|             "release_date": 10, | ||||
|             "tagline": 10, | ||||
|             "title": 10, | ||||
|             "vote_average": 10, | ||||
|             "vote_count": 10 | ||||
|         } | ||||
|     }); | ||||
|  | ||||
|     let index1 = server.index("test"); | ||||
|     let index2 = server.index("test2"); | ||||
|  | ||||
|     let (stats, code) = index1.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!(stats, expected_stats); | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 10, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "cast": 10, | ||||
|         "director": 10, | ||||
|         "genres": 10, | ||||
|         "id": 10, | ||||
|         "overview": 10, | ||||
|         "popularity": 10, | ||||
|         "poster_path": 10, | ||||
|         "producer": 10, | ||||
|         "production_companies": 10, | ||||
|         "release_date": 10, | ||||
|         "tagline": 10, | ||||
|         "title": 10, | ||||
|         "vote_average": 10, | ||||
|         "vote_count": 10 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     let (docs, code) = index2.get_all_documents(GetAllDocumentsOptions::default()).await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
| @@ -1835,7 +2004,32 @@ async fn import_dump_v5() { | ||||
|  | ||||
|     let (stats, code) = index2.stats().await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|     assert_eq!(stats, expected_stats); | ||||
|     snapshot!( | ||||
|       json_string!(stats), | ||||
|       @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 10, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "cast": 10, | ||||
|         "director": 10, | ||||
|         "genres": 10, | ||||
|         "id": 10, | ||||
|         "overview": 10, | ||||
|         "popularity": 10, | ||||
|         "poster_path": 10, | ||||
|         "producer": 10, | ||||
|         "production_companies": 10, | ||||
|         "release_date": 10, | ||||
|         "tagline": 10, | ||||
|         "title": 10, | ||||
|         "vote_average": 10, | ||||
|         "vote_count": 10 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     let (keys, code) = server.list_api_keys("").await; | ||||
|     snapshot!(code, @"200 OK"); | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| use meili_snap::{json_string, snapshot}; | ||||
| use time::format_description::well_known::Rfc3339; | ||||
| use time::OffsetDateTime; | ||||
|  | ||||
| @@ -74,3 +75,253 @@ async fn stats() { | ||||
|     assert_eq!(response["indexes"]["test"]["fieldDistribution"]["name"], 1); | ||||
|     assert_eq!(response["indexes"]["test"]["fieldDistribution"]["age"], 1); | ||||
| } | ||||
|  | ||||
| #[actix_rt::test] | ||||
| async fn add_remove_embeddings() { | ||||
|     let server = Server::new().await; | ||||
|     let index = server.index("doggo"); | ||||
|  | ||||
|     let (response, code) = index | ||||
|         .update_settings(json!({ | ||||
|           "embedders": { | ||||
|             "manual": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|             "handcrafted": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|  | ||||
|           }, | ||||
|         })) | ||||
|         .await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     server.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     // 2 embedded documents for 5 embeddings in total | ||||
|     let documents = json!([ | ||||
|       {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, | ||||
|       {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.add_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 5, | ||||
|       "numberOfEmbeddedDocuments": 2, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     // 2 embedded documents for 3 embeddings in total | ||||
|     let documents = json!([ | ||||
|       {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.update_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 3, | ||||
|       "numberOfEmbeddedDocuments": 2, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     // 2 embedded documents for 2 embeddings in total | ||||
|     let documents = json!([ | ||||
|         {"id": 0, "name": "kefir", "_vectors": { "manual": null, "handcrafted": [0, 0, 0] }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.update_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 2, | ||||
|       "numberOfEmbeddedDocuments": 2, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     // 1 embedded document for 2 embeddings in total | ||||
|     let documents = json!([ | ||||
|         {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, | ||||
|         {"id": 1, "name": "echo", "_vectors": { "manual": null, "handcrafted": null }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.update_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 2, | ||||
|       "numberOfEmbeddedDocuments": 1, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
| } | ||||
|  | ||||
| #[actix_rt::test] | ||||
| async fn add_remove_embedded_documents() { | ||||
|     let server = Server::new().await; | ||||
|     let index = server.index("doggo"); | ||||
|  | ||||
|     let (response, code) = index | ||||
|         .update_settings(json!({ | ||||
|           "embedders": { | ||||
|             "manual": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|             "handcrafted": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|  | ||||
|           }, | ||||
|         })) | ||||
|         .await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     server.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     // 2 embedded documents for 5 embeddings in total | ||||
|     let documents = json!([ | ||||
|       {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, | ||||
|       {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.add_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 5, | ||||
|       "numberOfEmbeddedDocuments": 2, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     // delete one embedded document, leaving 1 embedded document with 3 embeddings in total | ||||
|     let (response, code) = index.delete_document(0).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 1, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 3, | ||||
|       "numberOfEmbeddedDocuments": 1, | ||||
|       "fieldDistribution": { | ||||
|         "id": 1, | ||||
|         "name": 1 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
| } | ||||
|  | ||||
| #[actix_rt::test] | ||||
| async fn update_embedder_settings() { | ||||
|     let server = Server::new().await; | ||||
|     let index = server.index("doggo"); | ||||
|  | ||||
|     // the payload describes 2 embedded documents with 3 embeddings in total, | ||||
|     // but no embedders have been added to the settings yet, so we expect 0 embedded documents and 0 embeddings | ||||
|     let documents = json!([ | ||||
|       {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, | ||||
|       {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, | ||||
|     ]); | ||||
|  | ||||
|     let (response, code) = index.add_documents(documents, None).await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     index.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
|  | ||||
|     // add embedders to the settings | ||||
|     // 2 embedded documents for 3 embeddings in total | ||||
|     let (response, code) = index | ||||
|         .update_settings(json!({ | ||||
|           "embedders": { | ||||
|             "manual": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|             "handcrafted": { | ||||
|                 "source": "userProvided", | ||||
|                 "dimensions": 3, | ||||
|             }, | ||||
|  | ||||
|           }, | ||||
|         })) | ||||
|         .await; | ||||
|     snapshot!(code, @"202 Accepted"); | ||||
|     server.wait_task(response.uid()).await.succeeded(); | ||||
|  | ||||
|     let (stats, _code) = index.stats().await; | ||||
|     snapshot!(json_string!(stats), @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 2, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 3, | ||||
|       "numberOfEmbeddedDocuments": 2, | ||||
|       "fieldDistribution": { | ||||
|         "id": 2, | ||||
|         "name": 2 | ||||
|       } | ||||
|     } | ||||
|     "###); | ||||
| } | ||||
|   | ||||
| @@ -135,6 +135,8 @@ async fn check_the_index_scheduler(server: &Server) { | ||||
|         "kefir": { | ||||
|           "numberOfDocuments": 1, | ||||
|           "isIndexing": false, | ||||
|           "numberOfEmbeddings": 0, | ||||
|           "numberOfEmbeddedDocuments": 0, | ||||
|           "fieldDistribution": { | ||||
|             "age": 1, | ||||
|             "description": 1, | ||||
| @@ -215,6 +217,8 @@ async fn check_the_index_scheduler(server: &Server) { | ||||
|         "kefir": { | ||||
|           "numberOfDocuments": 1, | ||||
|           "isIndexing": false, | ||||
|           "numberOfEmbeddings": 0, | ||||
|           "numberOfEmbeddedDocuments": 0, | ||||
|           "fieldDistribution": { | ||||
|             "age": 1, | ||||
|             "description": 1, | ||||
| @@ -228,10 +232,12 @@ async fn check_the_index_scheduler(server: &Server) { | ||||
|     "###); | ||||
|     let index = server.index("kefir"); | ||||
|     let (stats, _) = index.stats().await; | ||||
|     snapshot!(stats, @r#" | ||||
|     snapshot!(stats, @r###" | ||||
|     { | ||||
|       "numberOfDocuments": 1, | ||||
|       "isIndexing": false, | ||||
|       "numberOfEmbeddings": 0, | ||||
|       "numberOfEmbeddedDocuments": 0, | ||||
|       "fieldDistribution": { | ||||
|         "age": 1, | ||||
|         "description": 1, | ||||
| @@ -240,7 +246,7 @@ async fn check_the_index_scheduler(server: &Server) { | ||||
|         "surname": 1 | ||||
|       } | ||||
|     } | ||||
|     "#); | ||||
|     "###); | ||||
|  | ||||
|     // Delete all the tasks of a specific batch | ||||
|     let (task, _) = server.delete_tasks("batchUids=10").await; | ||||
|   | ||||
| @@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec; | ||||
| use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; | ||||
| use crate::order_by_map::OrderByMap; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; | ||||
| use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; | ||||
| use crate::{ | ||||
|     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, | ||||
|     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, | ||||
| @@ -1731,6 +1731,18 @@ impl Index { | ||||
|         let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); | ||||
|         Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) | ||||
|     } | ||||
|  | ||||
|     /// Aggregates the arroy statistics of every embedder configured on this index. | ||||
|     /// | ||||
|     /// Starts from `ArroyStats::default()` and, for each entry returned by | ||||
|     /// `embedding_configs`, builds an `ArroyWrapper` over `vector_arroy` and lets it | ||||
|     /// add its numbers to the shared accumulator through `aggregate_stats`. | ||||
|     /// | ||||
|     /// # Errors | ||||
|     /// | ||||
|     /// Propagates any error raised while reading the embedding configs, looking up | ||||
|     /// the embedder id, or aggregating the stats of a reader. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// Panics when the lookup of a configured embedder name in `embedder_category_id` | ||||
|     /// yields no value (see the `unwrap` below). | ||||
|     pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> { | ||||
|         let mut stats = ArroyStats::default(); | ||||
|         let embedding_configs = self.embedding_configs(rtxn)?; | ||||
|         for config in embedding_configs { | ||||
|             // NOTE(review): presumably every configured embedder always has a category id, | ||||
|             // which is why a missing entry is treated as a bug rather than an error; confirm. | ||||
|             let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); | ||||
|             let reader = | ||||
|                 ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); | ||||
|             // All embedders accumulate into the same `stats` value. | ||||
|             reader.aggregate_stats(rtxn, &mut stats)?; | ||||
|         } | ||||
|         Ok(stats) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Deserialize, Serialize)] | ||||
|   | ||||
| @@ -1,7 +1,9 @@ | ||||
| mod v1_12; | ||||
| mod v1_13; | ||||
|  | ||||
| use heed::RwTxn; | ||||
| use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3}; | ||||
| use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; | ||||
| use v1_13::V1_13_0_To_Current; | ||||
|  | ||||
| use crate::progress::{Progress, VariableNameStep}; | ||||
| use crate::{Index, InternalError, Result}; | ||||
| @@ -26,11 +28,13 @@ pub fn upgrade( | ||||
|     progress: Progress, | ||||
| ) -> Result<bool> { | ||||
|     let from = index.get_version(wtxn)?.unwrap_or(db_version); | ||||
|     let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()]; | ||||
|     let upgrade_functions: &[&dyn UpgradeIndex] = | ||||
|         &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; | ||||
|  | ||||
|     let start = match from { | ||||
|         (1, 12, 0..=2) => 0, | ||||
|         (1, 12, 3..) => 1, | ||||
|         (1, 13, 0) => 2, | ||||
|         // We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others. | ||||
|         (1, 13, _) => return Ok(false), | ||||
|         (major, minor, patch) => { | ||||
|   | ||||
| @@ -1,7 +1,6 @@ | ||||
| use heed::RwTxn; | ||||
|  | ||||
| use super::UpgradeIndex; | ||||
| use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; | ||||
| use crate::progress::Progress; | ||||
| use crate::{make_enum_progress, Index, Result}; | ||||
|  | ||||
| @@ -32,9 +31,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 { | ||||
| } | ||||
|  | ||||
| #[allow(non_camel_case_types)] | ||||
| pub(super) struct V1_12_3_To_Current(); | ||||
| pub(super) struct V1_12_3_To_V1_13_0 {} | ||||
|  | ||||
| impl UpgradeIndex for V1_12_3_To_Current { | ||||
| impl UpgradeIndex for V1_12_3_To_V1_13_0 { | ||||
|     fn upgrade( | ||||
|         &self, | ||||
|         _wtxn: &mut RwTxn, | ||||
| @@ -42,14 +41,11 @@ impl UpgradeIndex for V1_12_3_To_Current { | ||||
|         _original: (u32, u32, u32), | ||||
|         _progress: Progress, | ||||
|     ) -> Result<bool> { | ||||
|         Ok(false) | ||||
|         // recompute the indexes stats | ||||
|         Ok(true) | ||||
|     } | ||||
|  | ||||
|     fn target_version(&self) -> (u32, u32, u32) { | ||||
|         ( | ||||
|             VERSION_MAJOR.parse().unwrap(), | ||||
|             VERSION_MINOR.parse().unwrap(), | ||||
|             VERSION_PATCH.parse().unwrap(), | ||||
|         ) | ||||
|         (1, 13, 0) | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										29
									
								
								crates/milli/src/update/upgrade/v1_13.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								crates/milli/src/update/upgrade/v1_13.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | ||||
| use heed::RwTxn; | ||||
|  | ||||
| use super::UpgradeIndex; | ||||
| use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; | ||||
| use crate::progress::Progress; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| /// No-op upgrade step whose target is the version this crate was built as. | ||||
| /// | ||||
| /// `upgrade` touches neither the transaction nor the index and always returns `Ok(false)`. | ||||
| // The name spells out version numbers with underscores, hence the lint exemption. | ||||
| #[allow(non_camel_case_types)] | ||||
| pub(super) struct V1_13_0_To_Current(); | ||||
|  | ||||
| impl UpgradeIndex for V1_13_0_To_Current { | ||||
|     /// Does nothing: every argument is ignored and `Ok(false)` is returned. | ||||
|     /// | ||||
|     /// NOTE(review): the returned bool appears to tell the caller whether the index stats | ||||
|     /// must be recomputed; confirm against the `UpgradeIndex` trait definition. | ||||
|     fn upgrade( | ||||
|         &self, | ||||
|         _wtxn: &mut RwTxn, | ||||
|         _index: &Index, | ||||
|         _original: (u32, u32, u32), | ||||
|         _progress: Progress, | ||||
|     ) -> Result<bool> { | ||||
|         Ok(false) | ||||
|     } | ||||
|  | ||||
|     /// Returns the `(major, minor, patch)` triple parsed from the `VERSION_MAJOR`, | ||||
|     /// `VERSION_MINOR` and `VERSION_PATCH` constants. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// Panics if any of the three constants fails to parse as a `u32`. | ||||
|     fn target_version(&self) -> (u32, u32, u32) { | ||||
|         ( | ||||
|             VERSION_MAJOR.parse().unwrap(), | ||||
|             VERSION_MINOR.parse().unwrap(), | ||||
|             VERSION_PATCH.parse().unwrap(), | ||||
|         ) | ||||
|     } | ||||
| } | ||||
| @@ -410,8 +410,43 @@ impl ArroyWrapper { | ||||
|     fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> { | ||||
|         self.database.remap_data_type() | ||||
|     } | ||||
|  | ||||
|     /// Adds the statistics of this wrapper's arroy readers to `stats`. | ||||
|     /// | ||||
|     /// For each reader, its item ids are merged (bitwise OR) into `stats.documents` and | ||||
|     /// their count is added to `stats.number_of_embeddings`. The scan stops at the first | ||||
|     /// reader that holds no item. | ||||
|     /// | ||||
|     /// The `quantized` flag selects the database view the readers are opened on: | ||||
|     /// `quantized_db()` when set, `angular_db()` otherwise. Both branches perform the | ||||
|     /// same aggregation. | ||||
|     /// | ||||
|     /// # Errors | ||||
|     /// | ||||
|     /// Returns the first `arroy::Error` yielded while obtaining a reader. | ||||
|     pub fn aggregate_stats( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         stats: &mut ArroyStats, | ||||
|     ) -> Result<(), arroy::Error> { | ||||
|         if self.quantized { | ||||
|             for reader in self.readers(rtxn, self.quantized_db()) { | ||||
|                 let reader = reader?; | ||||
|                 let documents = reader.item_ids(); | ||||
|                 // NOTE(review): presumably readers are filled in order, so an empty one | ||||
|                 // means the following ones are empty too; confirm against `readers`. | ||||
|                 if documents.is_empty() { | ||||
|                     break; | ||||
|                 } | ||||
|                 stats.documents |= documents; | ||||
|                 // Counted per reader: an id present in several readers adds one each time. | ||||
|                 stats.number_of_embeddings += documents.len(); | ||||
|             } | ||||
|         } else { | ||||
|             for reader in self.readers(rtxn, self.angular_db()) { | ||||
|                 let reader = reader?; | ||||
|                 let documents = reader.item_ids(); | ||||
|                 // Same early exit as in the quantized branch. | ||||
|                 if documents.is_empty() { | ||||
|                     break; | ||||
|                 } | ||||
|                 stats.documents |= documents; | ||||
|                 stats.number_of_embeddings += documents.len(); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Statistics accumulated over arroy readers by `ArroyWrapper::aggregate_stats`. | ||||
| #[derive(Debug, Default, Clone)] | ||||
| pub struct ArroyStats { | ||||
|     /// Sum of the item counts of every visited reader; an id stored in several readers | ||||
|     /// is counted once per reader. | ||||
|     pub number_of_embeddings: u64, | ||||
|     /// Union of the item ids found in the visited readers. | ||||
|     pub documents: RoaringBitmap, | ||||
| } | ||||
| /// One or multiple embeddings stored consecutively in a flat vector. | ||||
| pub struct Embeddings<F> { | ||||
|     data: Vec<F>, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user