From 3ed43f9097c51eb6ceedced7ec0be54c0569179f Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 3 Mar 2025 11:11:53 +0100 Subject: [PATCH 01/37] add a failing test reproducing the bug --- .../tests/documents/add_documents.rs | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index ad8bae19f..8c05cd177 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1897,11 +1897,11 @@ async fn update_documents_with_geo_field() { }, { "id": "3", - "_geo": { "lat": 1, "lng": 1 }, + "_geo": { "lat": 3, "lng": 0 }, }, { "id": "4", - "_geo": { "lat": "1", "lng": "1" }, + "_geo": { "lat": "4", "lng": "0" }, }, ]); @@ -1928,9 +1928,7 @@ async fn update_documents_with_geo_field() { } "###); - let (response, code) = index - .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) - .await; + let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await; snapshot!(code, @"200 OK"); // we are expecting docs 4 and 3 first as they have geo snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @@ -1940,18 +1938,18 @@ async fn update_documents_with_geo_field() { { "id": "4", "_geo": { - "lat": "1", - "lng": "1" + "lat": "4", + "lng": "0" }, - "_geoDistance": 5522018 + "_geoDistance": 667170 }, { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 3, + "lng": 0 }, - "_geoDistance": 5522018 + "_geoDistance": 778364 }, { "id": "1" @@ -1969,10 +1967,13 @@ async fn update_documents_with_geo_field() { } "###); - let updated_documents = json!([{ - "id": "3", - "doggo": "kefir", - }]); + let updated_documents = json!([ + { + "id": "3", + "doggo": "kefir", + "_geo": { "lat": 5, "lng": 0 }, + } + ]); let (task, _status_code) = index.update_documents(updated_documents, None).await; let response = 
index.wait_task(task.uid()).await; snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @@ -2012,16 +2013,16 @@ async fn update_documents_with_geo_field() { { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 5, + "lng": 0 }, "doggo": "kefir" }, { "id": "4", "_geo": { - "lat": "1", - "lng": "1" + "lat": "4", + "lng": "0" } } ], @@ -2031,31 +2032,29 @@ async fn update_documents_with_geo_field() { } "###); - let (response, code) = index - .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) - .await; + let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await; snapshot!(code, @"200 OK"); // the search response should not have changed: we are expecting docs 4 and 3 first as they have geo snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" { "hits": [ - { - "id": "4", - "_geo": { - "lat": "1", - "lng": "1" - }, - "_geoDistance": 5522018 - }, { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 5, + "lng": 0 }, "doggo": "kefir", - "_geoDistance": 5522018 + "_geoDistance": 555975 + }, + { + "id": "4", + "_geo": { + "lat": "4", + "lng": "0" + }, + "_geoDistance": 667170 }, { "id": "1" From d3cd5ea68924430c0bd0a10f0059dd6c4bd2cf4e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 3 Mar 2025 14:45:57 +0100 Subject: [PATCH 02/37] Check if the geo fields changed additionally to the other faceted fields when reindexing facets --- .../milli/src/update/new/document_change.rs | 26 ++++++++++++++++++- .../new/extract/faceted/extract_facets.rs | 7 +++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 38369a4d7..8a8ac4bb3 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,5 +1,6 @@ use bumpalo::Bump; use 
heed::RoTxn; +use serde_json::Value; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, @@ -10,7 +11,7 @@ use super::vector_document::{ use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; -use crate::{DocumentId, Index, Result}; +use crate::{DocumentId, Index, InternalError, Result}; pub enum DocumentChange<'doc> { Deletion(Deletion<'doc>), @@ -243,6 +244,29 @@ impl<'doc> Update<'doc> { Ok(has_deleted_fields) } + /// Returns `true` if the geo fields have changed. + pub fn has_changed_for_geo_fields<'t, Mapper: FieldIdMapper>( + &self, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let current = self.current(rtxn, index, mapper)?; + let current_geo = current.geo_field()?; + let updated_geo = self.only_changed_fields().geo_field()?; + match (current_geo, updated_geo) { + (Some(current_geo), Some(updated_geo)) => { + let current: Value = + serde_json::from_str(current_geo.get()).map_err(InternalError::SerdeJson)?; + let updated: Value = + serde_json::from_str(updated_geo.get()).map_err(InternalError::SerdeJson)?; + Ok(current != updated) + } + (None, None) => Ok(false), + _ => Ok(true), + } + } + pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index b3aa8f984..1b08307a2 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -117,7 +117,7 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { - if !inner.has_changed_for_fields( + let has_changed = inner.has_changed_for_fields( &mut |field_name| { match_faceted_field( field_name, @@ -130,7 +130,10 @@ impl FacetedDocidsExtractor { rtxn, index, context.db_fields_ids_map, - )? 
{ + )?; + let has_changed_for_geo_fields = + inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?; + if !has_changed && !has_changed_for_geo_fields { return Ok(()); } From d3e4b2dfe77df6ac1cbff6346867c76866dea074 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 14 Mar 2025 13:07:51 +0100 Subject: [PATCH 03/37] Accept total batch size in human size --- crates/meilisearch/src/analytics/segment_analytics.rs | 3 ++- crates/meilisearch/src/lib.rs | 2 +- crates/meilisearch/src/option.rs | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index a681e9e29..c428aa2b8 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -326,7 +326,8 @@ impl Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, - experimental_limit_batched_tasks_total_size, + experimental_limit_batched_tasks_total_size: + experimental_limit_batched_tasks_total_size.into(), task_queue_webhook: task_webhook_url.is_some(), task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 1841d5556..7dd1b58b4 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -228,7 +228,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< cleanup_enabled: !opt.experimental_replication_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, - batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size, + batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(), index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize, index_count: 
DEFAULT_INDEX_COUNT, instance_features: opt.to_instance_features(), diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index acf4393d3..6364f49d8 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -444,7 +444,7 @@ pub struct Opt { /// see: #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())] #[serde(default = "default_limit_batched_tasks_total_size")] - pub experimental_limit_batched_tasks_total_size: u64, + pub experimental_limit_batched_tasks_total_size: Byte, #[serde(flatten)] #[clap(flatten)] @@ -944,8 +944,8 @@ fn default_limit_batched_tasks() -> usize { usize::MAX } -fn default_limit_batched_tasks_total_size() -> u64 { - u64::MAX +fn default_limit_batched_tasks_total_size() -> Byte { + Byte::from_u64(u64::MAX) } fn default_snapshot_dir() -> PathBuf { From cb16baab18cbd8abce27207fe065f9f8b3b70687 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 16 Mar 2025 19:15:39 +0100 Subject: [PATCH 04/37] Add more progress levels to measure merging --- crates/milli/src/progress.rs | 14 ++++++++++++-- crates/milli/src/update/new/indexer/extract.rs | 16 +++++++++++++++- crates/milli/src/update/new/merger.rs | 11 ++--------- crates/milli/src/update/new/steps.rs | 6 ++++++ 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 7eb0cbd6b..75dafa8ec 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -190,8 +190,18 @@ macro_rules! make_atomic_progress { }; } -make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); -make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); +make_atomic_progress!(Document alias AtomicDocumentStep => "document"); +make_atomic_progress!(Payload alias AtomicPayloadStep => "payload"); + +make_enum_progress! 
{ + pub enum MergingWordCache { + WordDocids, + WordFieldIdDocids, + ExactWordDocids, + WordPositionDocids, + FieldIdWordCountDocids, + } +} #[derive(Debug, Serialize, Clone, ToSchema)] #[serde(rename_all = "camelCase")] diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 907a4d1df..bb36ddc37 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -13,6 +13,7 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use crate::index::IndexEmbeddingConfig; +use crate::progress::MergingWordCache; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::merger::merge_and_send_rtree; @@ -96,6 +97,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); let _entered = span.enter(); + indexing_context.progress.update_progress(IndexingStep::MergingFacetCaches); facet_field_ids_delta = merge_and_send_facet_docids( caches, @@ -117,7 +119,6 @@ where } = { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); - WordDocidsExtractors::run_extraction( document_changes, indexing_context, @@ -126,9 +127,13 @@ where )? 
}; + indexing_context.progress.update_progress(IndexingStep::MergingWordCaches); + { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordDocids); + merge_and_send_docids( word_docids, index.word_docids.remap_types(), @@ -142,6 +147,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids); + merge_and_send_docids( word_fid_docids, index.word_fid_docids.remap_types(), @@ -155,6 +162,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids); + merge_and_send_docids( exact_word_docids, index.exact_word_docids.remap_types(), @@ -168,6 +177,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids); + merge_and_send_docids( word_position_docids, index.word_position_docids.remap_types(), @@ -181,6 +192,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids); + merge_and_send_docids( fid_word_count_docids, index.field_id_word_count_docids.remap_types(), @@ -210,6 +223,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(IndexingStep::MergingWordProximity); merge_and_send_docids( caches, diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 
090add6bd..15f06c67d 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -82,14 +82,8 @@ where merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { - Operation::Write(bitmap) => { - docids_sender.write(key, &bitmap)?; - Ok(()) - } - Operation::Delete => { - docids_sender.delete(key)?; - Ok(()) - } + Operation::Write(bitmap) => docids_sender.write(key, &bitmap), + Operation::Delete => docids_sender.delete(key), Operation::Ignore => Ok(()), } }) @@ -130,7 +124,6 @@ pub fn merge_and_send_facet_docids<'extractor>( Operation::Ignore => Ok(()), } })?; - Ok(facet_field_ids_delta) }) .reduce( diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index ad8fe9cb1..e026b4d0d 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -13,6 +13,9 @@ pub enum IndexingStep { ExtractingWords, ExtractingWordProximity, ExtractingEmbeddings, + MergingFacetCaches, + MergingWordCaches, + MergingWordProximity, WritingGeoPoints, WaitingForDatabaseWrites, WaitingForExtractors, @@ -31,6 +34,9 @@ impl Step for IndexingStep { IndexingStep::ExtractingWords => "extracting words", IndexingStep::ExtractingWordProximity => "extracting word proximity", IndexingStep::ExtractingEmbeddings => "extracting embeddings", + IndexingStep::MergingFacetCaches => "merging facet caches", + IndexingStep::MergingWordCaches => "merging word caches", + IndexingStep::MergingWordProximity => "merging word proximity", IndexingStep::WritingGeoPoints => "writing geo points", IndexingStep::WaitingForDatabaseWrites => "waiting for database writes", IndexingStep::WaitingForExtractors => "waiting for extractors", From 49dd50dab2ec70155f781da47a111a543faf5e3c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 17 Mar 2025 11:29:17 +0100 Subject: [PATCH 05/37] Bump ring to v0.17.14 to compile on old 
aarch64 --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 293d17045..59718aca4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3018,7 +3018,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.1", ] [[package]] @@ -4886,9 +4886,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.13" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", From e2156ddfc78f9dc11bcb06357323959a1431b06b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 17 Mar 2025 11:40:50 +0100 Subject: [PATCH 06/37] Simplify the IndexingStep progress enum --- crates/milli/src/update/new/steps.rs | 74 +++++++--------------------- 1 file changed, 19 insertions(+), 55 deletions(-) diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index e026b4d0d..da71819c6 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -1,58 +1,22 @@ -use std::borrow::Cow; +use crate::make_enum_progress; -use enum_iterator::Sequence; - -use crate::progress::Step; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] -#[repr(u8)] -pub enum IndexingStep { - PreparingPayloads, - ExtractingDocuments, - ExtractingFacets, - ExtractingWords, - ExtractingWordProximity, - ExtractingEmbeddings, - MergingFacetCaches, - MergingWordCaches, - MergingWordProximity, - WritingGeoPoints, - WaitingForDatabaseWrites, - WaitingForExtractors, - WritingEmbeddingsToDatabase, - PostProcessingFacets, - PostProcessingWords, - Finalizing, -} - -impl Step for IndexingStep { - fn name(&self) -> Cow<'static, 
str> { - match self { - IndexingStep::PreparingPayloads => "preparing update file", - IndexingStep::ExtractingDocuments => "extracting documents", - IndexingStep::ExtractingFacets => "extracting facets", - IndexingStep::ExtractingWords => "extracting words", - IndexingStep::ExtractingWordProximity => "extracting word proximity", - IndexingStep::ExtractingEmbeddings => "extracting embeddings", - IndexingStep::MergingFacetCaches => "merging facet caches", - IndexingStep::MergingWordCaches => "merging word caches", - IndexingStep::MergingWordProximity => "merging word proximity", - IndexingStep::WritingGeoPoints => "writing geo points", - IndexingStep::WaitingForDatabaseWrites => "waiting for database writes", - IndexingStep::WaitingForExtractors => "waiting for extractors", - IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", - IndexingStep::PostProcessingFacets => "post-processing facets", - IndexingStep::PostProcessingWords => "post-processing words", - IndexingStep::Finalizing => "finalizing", - } - .into() - } - - fn current(&self) -> u32 { - *self as u32 - } - - fn total(&self) -> u32 { - Self::CARDINALITY as u32 +make_enum_progress! 
{ + pub enum IndexingStep { + PreparingPayloads, + ExtractingDocuments, + ExtractingFacets, + ExtractingWords, + ExtractingWordProximity, + ExtractingEmbeddings, + MergingFacetCaches, + MergingWordCaches, + MergingWordProximity, + WritingGeoPoints, + WaitingForDatabaseWrites, + WaitingForExtractors, + WritingEmbeddingsToDatabase, + PostProcessingFacets, + PostProcessingWords, + Finalizing, } } From b0b1888ef9052fe5dd049945b7ea5e8427510fcc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Mar 2025 15:57:35 +0100 Subject: [PATCH 07/37] Add test --- crates/meilisearch/tests/search/mod.rs | 140 +++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index d7a09b58e..f6e79dbb9 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -1783,6 +1783,146 @@ async fn test_nested_fields() { .await; } +#[actix_rt::test] +async fn test_typo_settings() { + let documents = json!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "typoTolerance": { + "enabled": true, + "disableOnAttributes": ["title"] + } + }), + &json!({"q": "document"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 
2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + ] + "###); + }, + ) + .await; + + // Test prefix search + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "typoTolerance": { + "enabled": true, + "disableOnAttributes": ["title"] + } + }), + &json!({"q": "docume"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + ] + "###); + }, + ) + .await; +} + /// Modifying facets with different casing should work correctly #[actix_rt::test] async fn change_facet_casing() { From bf144a94d8416eed26bfe15d1627834550f72139 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Mar 2025 15:44:41 +0100 Subject: [PATCH 08/37] No more use FST to find a word without any typo --- crates/milli/src/index.rs | 13 ++ .../new/query_term/compute_derivations.rs | 116 ++++++++---------- 2 files changed, 61 insertions(+), 68 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 771d32175..fcb8962d2 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1755,6 +1755,19 @@ impl Index { } Ok(stats) } + + /// Check if the word is indexed in the index. + /// + /// This function checks if the word is indexed in the index by looking at the word_docids and exact_word_docids. 
+ /// + /// # Arguments + /// + /// * `rtxn`: The read transaction. + /// * `word`: The word to check. + pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result { + Ok(self.word_docids.remap_data_type::().get(rtxn, word)?.is_some() + || self.exact_word_docids.remap_data_type::().get(rtxn, word)?.is_some()) + } } #[derive(Debug, Deserialize, Serialize)] diff --git a/crates/milli/src/search/new/query_term/compute_derivations.rs b/crates/milli/src/search/new/query_term/compute_derivations.rs index 79cd830ca..3caecb69e 100644 --- a/crates/milli/src/search/new/query_term/compute_derivations.rs +++ b/crates/milli/src/search/new/query_term/compute_derivations.rs @@ -1,10 +1,12 @@ use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::BTreeSet; use std::ops::ControlFlow; use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; +use fst::{IntoStreamer, Streamer}; use heed::types::DecodeIgnore; +use itertools::{merge_join_by, EitherOrBoth}; use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; @@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum NumberOfTypos { - Zero, One, Two, } -pub enum ZeroOrOneTypo { - Zero, - One, -} - impl Interned { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> { let s = ctx.term_interner.get_mut(self); @@ -47,34 +43,45 @@ impl Interned { } fn find_zero_typo_prefix_derivations( + ctx: &mut SearchContext<'_>, word_interned: Interned, - fst: fst::Set>, - word_interner: &mut DedupInterner, mut visit: impl FnMut(Interned) -> Result>, ) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); + let word = ctx.word_interner.get(word_interned).to_owned(); let word = word.as_str(); - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - while let Some(derived_word) = 
stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = word_interner.insert(derived_word); - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned)?; - if cf.is_break() { - break; + let words = + ctx.index.word_docids.remap_data_type::().prefix_iter(ctx.txn, word)?; + let exact_words = + ctx.index.exact_word_docids.remap_data_type::().prefix_iter(ctx.txn, word)?; + + for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) { + (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }) { + match eob { + EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => { + let (derived_word, _) = kv?; + let derived_word = derived_word.to_string(); + let derived_word_interned = ctx.word_interner.insert(derived_word); + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } + } } } } + Ok(()) } -fn find_zero_one_typo_derivations( +fn find_one_typo_derivations( ctx: &mut SearchContext<'_>, word_interned: Interned, is_prefix: bool, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, + mut visit: impl FnMut(Interned) -> Result>, ) -> Result<()> { let fst = ctx.get_words_fst()?; let word = ctx.word_interner.get(word_interned).to_owned(); @@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations( let derived_word = ctx.word_interner.insert(derived_word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { - 0 => { - if derived_word != word_interned { - let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; - if cf.is_break() { - break; - } - } - } + 0 => (), 1 => { - let cf = visit(derived_word, ZeroOrOneTypo::One)?; + let cf = visit(derived_word)?; if cf.is_break() { break; } @@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations( Ok(()) } -fn find_zero_one_two_typo_derivations( +fn find_one_two_typo_derivations( word_interned: 
Interned, is_prefix: bool, fst: fst::Set>, @@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations( // correct distance let d = second_dfa.distance((state.1).0); match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; - if cf.is_break() { - break; - } - } - } + 0 => (), 1 => { let cf = visit(derived_word_interned, NumberOfTypos::One)?; if cf.is_break() { @@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word( }); } - let fst = ctx.index.words_fst(ctx.txn)?; - let use_prefix_db = is_prefix && (ctx .index @@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word( let mut zero_typo = None; let mut prefix_of = BTreeSet::new(); - if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() { + if ctx.index.contains_word(ctx.txn, word)? { zero_typo = Some(word_interned); } if is_prefix && use_prefix_db.is_none() { - find_zero_typo_prefix_derivations( - word_interned, - fst, - &mut ctx.word_interner, - |derived_word| { - if prefix_of.len() < limits::MAX_PREFIX_COUNT { - prefix_of.insert(derived_word); - Ok(ControlFlow::Continue(())) - } else { - Ok(ControlFlow::Break(())) - } - }, - )?; + find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| { + if prefix_of.len() < limits::MAX_PREFIX_COUNT { + prefix_of.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) + } + })?; } let synonyms = ctx.index.synonyms(ctx.txn)?; let mut synonym_word_count = 0; @@ -295,18 +281,13 @@ impl Interned { let mut one_typo_words = BTreeSet::new(); if *max_nbr_typos > 0 { - find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } - } + find_one_typo_derivations(ctx, original, 
is_prefix, |derived_word| { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) } - Ok(ControlFlow::Continue(())) })?; } @@ -357,7 +338,7 @@ impl Interned { let mut two_typo_words = BTreeSet::new(); if *max_nbr_typos > 0 { - find_zero_one_two_typo_derivations( + find_one_two_typo_derivations( *original, *is_prefix, ctx.index.words_fst(ctx.txn)?, @@ -370,7 +351,6 @@ impl Interned { return Ok(ControlFlow::Break(())); } match nbr_typos { - NumberOfTypos::Zero => {} NumberOfTypos::One => { if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { one_typo_words.insert(derived_word); From 69678ed8e17d62f365b65d0923e2ebaf122c33b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Mar 2025 00:19:49 +0000 Subject: [PATCH 09/37] Bump zip from 2.2.2 to 2.3.0 Bumps [zip](https://github.com/zip-rs/zip2) from 2.2.2 to 2.3.0. - [Release notes](https://github.com/zip-rs/zip2/releases) - [Changelog](https://github.com/zip-rs/zip2/blob/master/CHANGELOG.md) - [Commits](https://github.com/zip-rs/zip2/compare/v2.2.2...v2.3.0) --- updated-dependencies: - dependency-name: zip dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 128 ++++++++++++++++++++++++---------- crates/meilisearch/Cargo.toml | 2 +- 2 files changed, 93 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 293d17045..431a3c534 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,7 +258,7 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", ] @@ -271,7 +271,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -790,22 +790,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.4.4" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] @@ -1143,7 +1141,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -2216,10 +2214,24 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "wasm-bindgen", + "windows-targets 0.52.6", +] + [[package]] name = "gimli" version = "0.27.3" @@ -2923,10 +2935,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -3518,6 +3531,17 @@ dependencies = [ "crc", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "macro_rules_attribute" version = "0.2.0" @@ -3656,7 +3680,7 @@ dependencies = [ "uuid", "wiremock", "yaup", - "zip 2.2.2", + "zip 2.3.0", ] [[package]] @@ -3882,7 +3906,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.48.0", ] @@ -3893,7 +3917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -4670,7 +4694,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -4762,7 +4786,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "redox_syscall 0.2.16", "thiserror 1.0.69", ] @@ -4892,7 +4916,7 @@ checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "untrusted", "windows-sys 0.52.0", @@ -5576,7 +5600,7 @@ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.2.15", "once_cell", "rustix", "windows-sys 0.52.0", @@ -5751,7 +5775,7 @@ dependencies = [ "aho-corasick", "derive_builder 0.12.0", "esaxx-rs", - "getrandom", + "getrandom 0.2.15", "itertools 0.12.1", "lazy_static", "log", @@ -6238,7 +6262,7 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ - "getrandom", + "getrandom 0.2.15", "serde", ] @@ -6335,24 +6359,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.92" +name = "wasi" +version = "0.13.3+wasi-0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", + "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn 2.0.87", @@ -6373,9 +6407,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6383,9 +6417,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", @@ -6396,9 +6430,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-streams" @@ -6803,6 +6840,15 @@ dependencies = [ "url", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.9.0", +] + [[package]] name = "write16" version = "1.0.0" @@ -6858,6 +6904,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yada" version = "0.5.1" @@ -6999,9 +7054,9 @@ dependencies = [ [[package]] name = "zip" -version = "2.2.2" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45" +checksum = "84e9a772a54b54236b9b744aaaf8d7be01b4d6e99725523cb82cb32d1c81b1d7" dependencies = [ "aes", "arbitrary", @@ -7012,15 +7067,16 @@ dependencies = [ "deflate64", "displaydoc", "flate2", + "getrandom 0.3.1", "hmac", "indexmap", "lzma-rs", "memchr", "pbkdf2", - "rand", "sha1", "thiserror 2.0.9", "time", + "xz2", "zeroize", "zopfli", "zstd", diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index e25fd9400..4cfc5c2ac 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -140,7 +140,7 @@ reqwest = { version = "0.12.12", features = [ sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.4", optional = true } tempfile = { version = "3.15.0", optional = true } -zip = { version = "2.2.2", optional = true } +zip = { version = "2.3.0", optional = true } [features] default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] From 71f7456748201cd2d48005794f63d769b78ff5b4 Mon Sep 17 00:00:00 2001 From: curquiza Date: Tue, 18 Mar 2025 12:48:38 +0100 Subject: [PATCH 10/37] Update mini-dashboard to v0.2.19 version --- crates/meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 4cfc5c2ac..428f13c10 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -170,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = 
"https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.18/build.zip" -sha1 = "b408a30dcb6e20cddb0c153c23385bcac4c8e912" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.19/build.zip" +sha1 = "7974430d5277c97f67cf6e95eec6faaac2788834" From f540a69ac3d3b954b53734492e7b8ef3158ddc4d Mon Sep 17 00:00:00 2001 From: Tee Jun hui Date: Wed, 5 Feb 2025 16:19:05 +0800 Subject: [PATCH 11/37] add 1 to index so it points to correct position --- crates/milli/src/search/new/bucket_sort.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index 8f1deb265..d0b7d258c 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -178,6 +178,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; back!(); + cur_ranking_rule_index += 1; continue; } } @@ -213,6 +214,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( continue; } + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket( ctx, logger, @@ -242,7 +244,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + back!(); + cur_ranking_rule_index += 1; continue; } } From 8c8cc59a6c1a53a80eb63ee0637a402be20449e9 Mon Sep 17 00:00:00 2001 From: Tee Jun hui Date: Wed, 5 Feb 2025 16:41:24 +0800 Subject: [PATCH 12/37] remove new line added by accident --- crates/milli/src/search/new/bucket_sort.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index d0b7d258c..172bdb3f9 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -214,7 
+214,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket( ctx, logger, @@ -244,7 +243,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); cur_ranking_rule_index += 1; continue; From f9807ba32ef36fb3980299e07dec91df49b58bff Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Mar 2025 11:33:44 +0100 Subject: [PATCH 13/37] Fix logic when results are below the threshold --- crates/milli/src/search/new/bucket_sort.rs | 41 +++++++++++----------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index 172bdb3f9..a659dd226 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -173,17 +173,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_scores.push(ScoreDetails::Skipped); // remove candidates from the universe without adding them to result if their score is below the threshold - if let Some(ranking_score_threshold) = ranking_score_threshold { - let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); - if current_score < ranking_score_threshold { - all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); - cur_ranking_rule_index += 1; - continue; - } - } + let is_below_threshold = + ranking_score_threshold.is_some_and(|ranking_score_threshold| { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + current_score < ranking_score_threshold + }); - maybe_add_to_results!(bucket); + if is_below_threshold { + all_candidates -= &bucket; + all_candidates -= &ranking_rule_universes[cur_ranking_rule_index]; + } else { + maybe_add_to_results!(bucket); + } ranking_rule_scores.pop(); @@ 
-238,24 +239,24 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ); // remove candidates from the universe without adding them to result if their score is below the threshold - if let Some(ranking_score_threshold) = ranking_score_threshold { + let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| { let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); - if current_score < ranking_score_threshold { - all_candidates -= - next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); - cur_ranking_rule_index += 1; - continue; - } - } + current_score < ranking_score_threshold + }); ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1) || cur_offset + (next_bucket.candidates.len() as usize) < from + || is_below_threshold { - maybe_add_to_results!(next_bucket.candidates); + if is_below_threshold { + all_candidates -= + next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + } else { + maybe_add_to_results!(next_bucket.candidates); + } ranking_rule_scores.pop(); continue; } From 2e6aa63efc251124ed3565278de7beef36d2e182 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 24 Mar 2025 14:32:21 +0100 Subject: [PATCH 14/37] Update Charabia v0.9.3 --- Cargo.lock | 90 ++++++++++++++++++++--------------------- crates/milli/Cargo.toml | 2 +- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53ab34da6..65b85cbcc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -976,9 +976,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf8921fe4d53ab8f9e8f9b72ce6f91726cfc40fffab1243d27db406b5e2e9cc2" +checksum = "650d52f87a36472ea1c803dee49d6bfd23d426efa9363e2f4c4a0e6a236d3407" 
dependencies = [ "aho-corasick", "csv", @@ -3031,7 +3031,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.1", + "windows-targets 0.52.6", ] [[package]] @@ -3075,9 +3075,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cbc1aad631a7da0a7e9bc4b8669fa92ac9ca8eeb7b35a807376dd3034443ff" +checksum = "832c220475557e3b44a46cad1862b57f010f0c6e93d771d0e628e08689c068b1" dependencies = [ "lindera-analyzer", "lindera-core", @@ -3088,9 +3088,9 @@ dependencies = [ [[package]] name = "lindera-analyzer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74508ffbb24e36905d1718b261460e378a748029b07bcd7e06f0d18500b8194c" +checksum = "a8e26651714abf5167e6b6a80f5cdaa0cad41c5fcb84d8ba96bebafcb9029339" dependencies = [ "anyhow", "bincode", @@ -3118,9 +3118,9 @@ dependencies = [ [[package]] name = "lindera-assets" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a677c371ecb3bd02b751be306ea09876cd47cf426303ad5f10a3fd6f9a4ded6" +checksum = "ebb01f1ca53c1e642234c6c7fdb9ac664ad0c1ab9502f33e4200201bac7e6ce7" dependencies = [ "encoding", "flate2", @@ -3131,9 +3131,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35944000d05a177e981f037b5f0805f283b32f05a0c35713003bef136ca8cb4" +checksum = "5f7618d9aa947fdd7c38eae2b79f0fd237ecb5067608f1363610ba20d20ab5a8" dependencies = [ "bincode", "byteorder", @@ -3145,9 +3145,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.32.2" +version = "0.32.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b8f642bc9c9130682569975772a17336c6aab26d11fc0f823f3e663167ace6" +checksum = "efdbcb809d81428935d601a78c94bfb39500749213f7320705f427a7a1d31aec" dependencies = [ "anyhow", "lindera-core", @@ -3157,9 +3157,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7825d8d63592aa5727d67bd209170ac82df56c369533efbf0ddbac277bb68ec" +checksum = "eac178afa2456dac469d3b1a2d7fbaf3e1ea796a1f52321e8ac29545a53c239c" dependencies = [ "anyhow", "flate2", @@ -3168,9 +3168,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c28191456debc98af6aa5f7db77872471983e9fa2a737b1c232b6ef543aed62" +checksum = "649777465f48147ce593ab6db347e235e3af8f693a23f4437be94a1cdbdf5fdf" dependencies = [ "anyhow", "bincode", @@ -3185,9 +3185,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4788a1ead2f63f3fc2888109272921dedd86a87b7d0bf05e9daab46600daac51" +checksum = "9e3faaceb85e43ac250021866c6db3cdc9997b44b3d3ea498594d04edc91fc45" dependencies = [ "anyhow", "flate2", @@ -3196,9 +3196,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdf5f91725e32b9a21b1656baa7030766c9bafc4de4b4ddeb8ffdde7224dd2f6" +checksum = "31e15b2d2d8a4ad45f2e373a084931cf3dfbde15f124044e2436bb920af3366c" dependencies = [ "anyhow", "bincode", @@ -3221,9 +3221,9 @@ dependencies = [ [[package]] name = "lindera-dictionary-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e41f00ba7ac541b0ffd8c30e7a73f2dd197546cc5780462ec4f2e4782945a780" +checksum = "59802949110545b59b663917ed3fd55dc3b3a8cde6bd20137d7fe24372cfb9aa" dependencies = [ "anyhow", "bincode", @@ -3243,9 +3243,9 @@ dependencies = [ [[package]] name = "lindera-filter" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273d27e01e1377e2647314a4a5b9bdca4b52a867b319069ebae8c10191146eca" +checksum = "1320f118c3fc9e897f4ebfc16864e5ef8c0b06ba769c0a50e53f193f9d682bf8" dependencies = [ "anyhow", "csv", @@ -3268,9 +3268,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97a52ff0af5acb700093badaf7078051ab9ffd9071859724445a60193995f1f" +checksum = "5b4731bf3730f1f38266d7ee9bca7d460cd336645c9dfd4e6a1082e58ab1e993" dependencies = [ "bincode", "byteorder", @@ -3282,9 +3282,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf5031c52686128db13f774b2c5a8abfd52b4cc1f904041d8411aa19d630ce4d" +checksum = "309966c12e682f67205c3cd3c8dc55bbdcd1eb3b5c7c5cb41fb8acd18906d340" dependencies = [ "anyhow", "lindera-core", @@ -3294,9 +3294,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b36764b27b169aa11d24888141f206a6c246a5b195c1e67127485bac512fb6" +checksum = "e90e919b4cfb9962d24ee1e1d50a7c163bbf356376495ad66d1996e20b9f9e44" dependencies = [ "bincode", "byteorder", @@ -3308,9 +3308,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf36e40ace904741efdd883ed5c4dba6425f65156a0fb5d3f73a386335950dc" +checksum = 
"7e517df0d501f9f8bf3126da20fc8cb9a5e37921e0eec1824d7a62f096463e02" dependencies = [ "anyhow", "lindera-core", @@ -3320,9 +3320,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c92a1a3564b531953f0238cbcea392f2905f7b27b449978cf9e702a80e1086d" +checksum = "e9c6da4e68bc8b452a54b96d65361ebdceb4b6f36ecf262425c0e1f77960ae82" dependencies = [ "bincode", "byteorder", @@ -3335,9 +3335,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f2c60425abc1548570c2568858f74a1f042105ecd89faa39c651b4315350fd9" +checksum = "afc95884cc8f6dfb176caf5991043a4acf94c359215bbd039ea765e00454f271" dependencies = [ "anyhow", "lindera-core", @@ -3347,9 +3347,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "903e558981bcb6f59870aa7d6b4bcb09e8f7db778886a6a70f67fd74c9fa2ca3" +checksum = "d122042e1232a55c3604692445952a134e523822e9b4b9ab32a53ff890037ad4" dependencies = [ "bincode", "lindera-core", @@ -3361,9 +3361,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d227c3ce9cbd905f865c46c65a0470fd04e89b71104d7f92baa71a212ffe1d4b" +checksum = "cbffae1fb2f2614abdcb50f99b138476dbac19862ffa57bfdc9c7b5d5b22a90c" dependencies = [ "bincode", "byteorder", @@ -3376,9 +3376,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e2c50015c242e02c451acb6748667ac6fd1d3d667cd7db48cd89e2f2d2377e" +checksum = "fe50055327712ebd1bcc74b657cf78c728a78b9586e3f99d5dd0b6a0be221c5d" 
dependencies = [ "anyhow", "lindera-core", @@ -6118,9 +6118,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index e3b9b077a..a2a020587 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -18,7 +18,7 @@ bincode = "1.3.3" bstr = "1.11.3" bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.9.2", default-features = false } +charabia = { version = "0.9.3", default-features = false } concat-arrays = "0.1.2" convert_case = "0.6.0" crossbeam-channel = "0.5.14" From a09d08c7b6ed4fc3dd114d8306a1035c2cc0c0e5 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 10 Mar 2025 14:51:23 +0100 Subject: [PATCH 15/37] Avoid reindexing searchable order changes Update settings.rs Update settings.rs --- crates/milli/src/update/settings.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 325a9f15c..9cab74444 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1331,8 +1331,21 @@ impl InnerIndexSettingsDiff { let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes - != new_settings.user_defined_searchable_attributes; + // Check if any searchable field has been added or removed from the list, + // Changing the order should not be considered as a change for reindexing. 
+ let cache_user_defined_searchables = match ( + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, + ) { + (Some(old), Some(new)) => { + let old: BTreeSet<_> = old.iter().collect(); + let new: BTreeSet<_> = new.iter().collect(); + + old != new + } + (None, None) => false, + _otherwise => true, + }; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { From c0fe70c5f081605baf7f904d6469030aa2312f2a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 12:29:08 +0100 Subject: [PATCH 16/37] Make the CI work with merge queue grouping --- .github/workflows/test-suite.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index feb95d8ad..8904b6c75 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -6,6 +6,7 @@ on: # Everyday at 5:00am - cron: "0 5 * * *" pull_request: + merge_group: push: # trying and staging branches are for Bors config branches: From de6c7e551e04fe0cec72685dffc9a7da464b3122 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 15:57:05 +0100 Subject: [PATCH 17/37] Remove bors references from the repository --- .github/workflows/test-suite.yml | 5 ----- CONTRIBUTING.md | 5 ++--- README.md | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 8904b6c75..a13d51086 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -7,11 +7,6 @@ on: - cron: "0 5 * * *" pull_request: merge_group: - push: - # trying and staging branches are for Bors config - branches: - - trying - - staging env: CARGO_TERM_COLOR: always diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 26d5b74b4..e129e5600 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ Some notes on GitHub PRs: - The PR title should be 
accurate and descriptive of the changes. - [Convert your PR as a draft](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request) if your changes are a work in progress: no one will review it until you pass your PR as ready for review.
The draft PRs are recommended when you want to show that you are working on something and make your work visible. -- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [Bors](https://github.com/bors-ng/bors-ng) to automatically enforce this requirement without the PR author having to rebase manually. +- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [GitHub Merge Queues](https://github.blog/news-insights/product-news/github-merge-queue-is-generally-available/) to automatically enforce this requirement without the PR author having to rebase manually. ## Release Process (for internal team only) @@ -158,8 +158,7 @@ Meilisearch tools follow the [Semantic Versioning Convention](https://semver.org ### Automation to rebase and Merge the PRs -This project integrates a bot that helps us manage pull requests merging.
-_[Read more about this](https://github.com/meilisearch/integration-guides/blob/main/resources/bors.md)._ +This project uses GitHub Merge Queues that helps us manage pull requests merging. ### How to Publish a new Release diff --git a/README.md b/README.md index 42062781a..508efb14b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@

Dependency status License - Bors enabled + Merge Queues enabled

⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍

From 1ad4235beb12d5093ba2440968694686a26270b9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 16:28:08 +0100 Subject: [PATCH 18/37] Remove the bors file --- bors.toml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 bors.toml diff --git a/bors.toml b/bors.toml deleted file mode 100644 index 3d04b834c..000000000 --- a/bors.toml +++ /dev/null @@ -1,10 +0,0 @@ -status = [ - 'Tests on ubuntu-22.04', - 'Tests on macos-13', - 'Tests on windows-2022', - 'Run Clippy', - 'Run Rustfmt', - 'Run tests in debug', -] -# 3 hours timeout -timeout-sec = 10800 From 182e5d56321509fc672a664feb5690c907f98459 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 11:12:25 +0100 Subject: [PATCH 19/37] Add database sizes stats to the batches --- crates/dump/src/lib.rs | 1 + crates/meilisearch-types/src/batches.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 4e2d6ac2f..ee63f7048 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -326,6 +326,7 @@ pub(crate) mod test { index_uids: maplit::btreemap! 
{ "doggo".to_string() => 1 }, progress_trace: Default::default(), write_channel_congestion: None, + internal_database_sizes: Default::default(), }, enqueued_at: Some(BatchEnqueuedAt { earliest: datetime!(2022-11-11 0:00 UTC), diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 904682585..c7b9d6cfa 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -64,4 +64,6 @@ pub struct BatchStats { pub progress_trace: serde_json::Map, #[serde(default, skip_serializing_if = "Option::is_none")] pub write_channel_congestion: Option>, + #[serde(default, skip_serializing_if = "serde_json::Map::is_empty")] + pub internal_database_sizes: serde_json::Map, } From fd079c6757c619d67dbcfea8fe956bf2190241b7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 11:40:20 +0100 Subject: [PATCH 20/37] Add an index method to get the database sizes --- crates/milli/src/index.rs | 105 +++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index fcb8962d2..e0c124859 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; use std::path::Path; -use heed::{types::*, WithoutTls}; +use heed::{types::*, DatabaseStat, WithoutTls}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; @@ -1768,6 +1768,109 @@ impl Index { Ok(self.word_docids.remap_data_type::().get(rtxn, word)?.is_some() || self.exact_word_docids.remap_data_type::().get(rtxn, word)?.is_some()) } + + /// Returns the sizes in bytes of each of the index database at the given rtxn. 
+ pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> Result> { + let Self { + env: _, + main, + external_documents_ids, + word_docids, + exact_word_docids, + word_prefix_docids, + exact_word_prefix_docids, + word_pair_proximity_docids, + word_position_docids, + word_fid_docids, + word_prefix_position_docids, + word_prefix_fid_docids, + field_id_word_count_docids, + facet_id_f64_docids, + facet_id_string_docids, + facet_id_normalized_string_strings, + facet_id_string_fst, + facet_id_exists_docids, + facet_id_is_null_docids, + facet_id_is_empty_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, + vector_arroy, + embedder_category_id, + documents, + } = self; + + fn compute_size(stats: DatabaseStat) -> usize { + let DatabaseStat { + page_size, + depth: _, + branch_pages, + leaf_pages, + overflow_pages, + entries: _, + } = stats; + + (branch_pages + leaf_pages + overflow_pages) * page_size as usize + } + + let mut sizes = HashMap::new(); + sizes.insert("main", main.stat(rtxn).map(compute_size)?); + sizes + .insert("external_documents_ids", external_documents_ids.stat(rtxn).map(compute_size)?); + sizes.insert("word_docids", word_docids.stat(rtxn).map(compute_size)?); + sizes.insert("exact_word_docids", exact_word_docids.stat(rtxn).map(compute_size)?); + sizes.insert("word_prefix_docids", word_prefix_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "exact_word_prefix_docids", + exact_word_prefix_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "word_pair_proximity_docids", + word_pair_proximity_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert("word_position_docids", word_position_docids.stat(rtxn).map(compute_size)?); + sizes.insert("word_fid_docids", word_fid_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "word_prefix_position_docids", + word_prefix_position_docids.stat(rtxn).map(compute_size)?, + ); + sizes + .insert("word_prefix_fid_docids", word_prefix_fid_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + 
"field_id_word_count_docids", + field_id_word_count_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert("facet_id_f64_docids", facet_id_f64_docids.stat(rtxn).map(compute_size)?); + sizes + .insert("facet_id_string_docids", facet_id_string_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "facet_id_normalized_string_strings", + facet_id_normalized_string_strings.stat(rtxn).map(compute_size)?, + ); + sizes.insert("facet_id_string_fst", facet_id_string_fst.stat(rtxn).map(compute_size)?); + sizes + .insert("facet_id_exists_docids", facet_id_exists_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "facet_id_is_null_docids", + facet_id_is_null_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "facet_id_is_empty_docids", + facet_id_is_empty_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "field_id_docid_facet_f64s", + field_id_docid_facet_f64s.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "field_id_docid_facet_strings", + field_id_docid_facet_strings.stat(rtxn).map(compute_size)?, + ); + sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?); + sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?); + sizes.insert("documents", documents.stat(rtxn).map(compute_size)?); + + Ok(sizes) + } } #[derive(Debug, Deserialize, Serialize)] From 637bea0370af5ab727c750eb9ab3445797322615 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 16:52:00 +0100 Subject: [PATCH 21/37] Compute and store the database sizes --- Cargo.lock | 2 + crates/index-scheduler/Cargo.toml | 2 + crates/index-scheduler/src/scheduler/mod.rs | 34 ++++++++++-- .../src/scheduler/process_batch.rs | 52 ++++++++++++++----- crates/meilisearch/Cargo.toml | 6 +-- crates/milli/src/index.rs | 5 +- 6 files changed, 77 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 65b85cbcc..96cfcf76c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2745,6 +2745,7 @@ dependencies = [ "bincode", 
"bumpalo", "bumparaw-collections", + "byte-unit", "convert_case 0.6.0", "crossbeam-channel", "csv", @@ -2753,6 +2754,7 @@ dependencies = [ "enum-iterator", "file-store", "flate2", + "indexmap", "insta", "maplit", "meili-snap", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 37b3ea835..31ff5f7d0 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] anyhow = "1.0.95" bincode = "1.3.3" +byte-unit = "5.1.6" bumpalo = "3.16.0" bumparaw-collections = "0.1.4" convert_case = "0.6.0" @@ -22,6 +23,7 @@ dump = { path = "../dump" } enum-iterator = "2.1.0" file-store = { path = "../file-store" } flate2 = "1.0.35" +indexmap = "2.7.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } memmap2 = "0.9.5" diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index 1cbfece34..fe3084034 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -24,6 +24,7 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::milli; use meilisearch_types::tasks::Status; +use process_batch::ProcessBatchInfo; use rayon::current_num_threads; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -223,16 +224,16 @@ impl IndexScheduler { let mut stop_scheduler_forever = false; let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; let mut canceled = RoaringBitmap::new(); - let mut congestion = None; + let mut process_batch_info = ProcessBatchInfo::default(); match res { - Ok((tasks, cong)) => { + Ok((tasks, info)) => { #[cfg(test)] self.breakpoint(crate::test_utils::Breakpoint::ProcessBatchSucceeded); let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32); 
progress.update_progress(task_progress_obj); - congestion = cong; + process_batch_info = info; let mut success = 0; let mut failure = 0; let mut canceled_by = None; @@ -350,6 +351,9 @@ impl IndexScheduler { // We must re-add the canceled task so they're part of the same batch. ids |= canceled; + let ProcessBatchInfo { congestion, pre_commit_dabases_sizes, post_commit_dabases_sizes } = + process_batch_info; + processing_batch.stats.progress_trace = progress.accumulated_durations().into_iter().map(|(k, v)| (k, v.into())).collect(); processing_batch.stats.write_channel_congestion = congestion.map(|congestion| { @@ -359,6 +363,30 @@ impl IndexScheduler { congestion_info.insert("blocking_ratio".into(), congestion.congestion_ratio().into()); congestion_info }); + processing_batch.stats.internal_database_sizes = pre_commit_dabases_sizes + .iter() + .flat_map(|(dbname, pre_size)| { + post_commit_dabases_sizes + .get(dbname) + .map(|post_size| { + use byte_unit::{Byte, UnitType::Binary}; + use std::cmp::Ordering::{Equal, Greater, Less}; + + let post = Byte::from_u64(*post_size as u64).get_appropriate_unit(Binary); + let diff_size = post_size.abs_diff(*pre_size) as u64; + let diff = Byte::from_u64(diff_size).get_appropriate_unit(Binary); + let sign = match post_size.cmp(pre_size) { + Equal => return None, + Greater => "+", + Less => "-", + }; + + Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into())) + }) + .into_iter() + .flatten() + }) + .collect(); if let Some(congestion) = congestion { tracing::debug!( diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 8f3987bf6..996b548c2 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -22,6 +22,16 @@ use crate::utils::{ }; use crate::{Error, IndexScheduler, Result, TaskId}; +#[derive(Debug, Default)] +pub struct ProcessBatchInfo { + /// The write channel 
congestion. None when unavailable: settings update. + pub congestion: Option, + /// The sizes of the different databases before starting the indexation. + pub pre_commit_dabases_sizes: indexmap::IndexMap<&'static str, usize>, + /// The sizes of the different databases after committing the indexation. + pub post_commit_dabases_sizes: indexmap::IndexMap<&'static str, usize>, + } + impl IndexScheduler { /// Apply the operation associated with the given batch. /// @@ -35,7 +45,7 @@ impl IndexScheduler { batch: Batch, current_batch: &mut ProcessingBatch, progress: Progress, - ) -> Result<(Vec, Option)> { + ) -> Result<(Vec, ProcessBatchInfo)> { #[cfg(test)] { self.maybe_fail(crate::test_utils::FailureLocation::InsideProcessBatch)?; @@ -76,7 +86,7 @@ impl IndexScheduler { canceled_tasks.push(task); - Ok((canceled_tasks, None)) + Ok((canceled_tasks, ProcessBatchInfo::default())) } Batch::TaskDeletions(mut tasks) => { // 1. Retrieve the tasks that matched the query at enqueue-time. @@ -115,14 +125,14 @@ impl IndexScheduler { _ => unreachable!(), } } - Ok((tasks, None)) - } - Batch::SnapshotCreation(tasks) => { - self.process_snapshot(progress, tasks).map(|tasks| (tasks, None)) - } - Batch::Dump(task) => { - self.process_dump_creation(progress, task).map(|tasks| (tasks, None)) + Ok((tasks, ProcessBatchInfo::default())) } + Batch::SnapshotCreation(tasks) => self + .process_snapshot(progress, tasks) + .map(|tasks| (tasks, ProcessBatchInfo::default())), + Batch::Dump(task) => self + .process_dump_creation(progress, task) + .map(|tasks| (tasks, ProcessBatchInfo::default())), Batch::IndexOperation { op, must_create_index } => { let index_uid = op.index_uid().to_string(); let index = if must_create_index { @@ -139,6 +149,7 @@ impl IndexScheduler { .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; + let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = 
self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; @@ -153,12 +164,14 @@ impl IndexScheduler { // stats of the index. Since the tasks have already been processed and // this is a non-critical operation. If it fails, we should not fail // the entire batch. + let mut post_commit_dabases_sizes = None; let res = || -> Result<()> { let index_rtxn = index.read_txn()?; let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; let mut wtxn = self.env.write_txn()?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; + post_commit_dabases_sizes = Some(index.database_sizes(&index_rtxn)?); wtxn.commit()?; Ok(()) }(); @@ -171,7 +184,16 @@ impl IndexScheduler { ), } - Ok((tasks, congestion)) + let info = ProcessBatchInfo { + congestion, + // In case we fail to get the post-commit sizes we decide + // that nothing changed and use the pre-commit sizes. + post_commit_dabases_sizes: post_commit_dabases_sizes + .unwrap_or_else(|| pre_commit_dabases_sizes.clone()), + pre_commit_dabases_sizes, + }; + + Ok((tasks, info)) } Batch::IndexCreation { index_uid, primary_key, task } => { progress.update_progress(CreateIndexProgress::CreatingTheIndex); @@ -239,7 +261,7 @@ impl IndexScheduler { ), } - Ok((vec![task], None)) + Ok((vec![task], ProcessBatchInfo::default())) } Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { progress.update_progress(DeleteIndexProgress::DeletingTheIndex); @@ -273,7 +295,9 @@ impl IndexScheduler { }; } - Ok((tasks, None)) + // Here we could also show that all the internal database sizes go to 0 + // but it would mean opening the index and that's costly. 
+ Ok((tasks, ProcessBatchInfo::default())) } Batch::IndexSwap { mut task } => { progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap); @@ -321,7 +345,7 @@ impl IndexScheduler { } wtxn.commit()?; task.status = Status::Succeeded; - Ok((vec![task], None)) + Ok((vec![task], ProcessBatchInfo::default())) } Batch::UpgradeDatabase { mut tasks } => { let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { @@ -351,7 +375,7 @@ impl IndexScheduler { task.error = None; } - Ok((tasks, None)) + Ok((tasks, ProcessBatchInfo::default())) } } } diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 428f13c10..6360cdbde 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -30,11 +30,7 @@ actix-web = { version = "4.9.0", default-features = false, features = [ anyhow = { version = "1.0.95", features = ["backtrace"] } async-trait = "0.1.85" bstr = "1.11.3" -byte-unit = { version = "5.1.6", default-features = false, features = [ - "std", - "byte", - "serde", -] } +byte-unit = { version = "5.1.6", features = ["serde"] } bytes = "1.9.0" clap = { version = "4.5.24", features = ["derive", "env"] } crossbeam-channel = "0.5.14" diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index e0c124859..a2d839d03 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -5,6 +5,7 @@ use std::path::Path; use heed::{types::*, DatabaseStat, WithoutTls}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; +use indexmap::IndexMap; use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; @@ -1770,7 +1771,7 @@ impl Index { } /// Returns the sizes in bytes of each of the index database at the given rtxn. 
- pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> Result> { + pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> heed::Result> { let Self { env: _, main, @@ -1812,7 +1813,7 @@ impl Index { (branch_pages + leaf_pages + overflow_pages) * page_size as usize } - let mut sizes = HashMap::new(); + let mut sizes = IndexMap::new(); sizes.insert("main", main.stat(rtxn).map(compute_size)?); sizes .insert("external_documents_ids", external_documents_ids.stat(rtxn).map(compute_size)?); From 5820d822c8f18846c45f45bc4787a33400a623a3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 16:51:18 +0100 Subject: [PATCH 22/37] Add more details about the finalizing progress step --- crates/index-scheduler/src/insta_snapshot.rs | 1 + crates/index-scheduler/src/processing.rs | 7 + .../src/scheduler/process_batch.rs | 6 +- .../src/scheduler/process_index_operation.rs | 10 +- crates/meilisearch/tests/batches/mod.rs | 126 ++++++++++++------ crates/meilisearch/tests/dumps/mod.rs | 1 + .../batches.snap | 1 + ...rEnqueuedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...rFinishedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...erStartedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...ue_once_everything_has_been_processed.snap | 3 +- .../tests/upgrade/v1_12/v1_12_0.rs | 22 +-- 12 files changed, 122 insertions(+), 64 deletions(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bcc295afd..949edf369 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -344,6 +344,7 @@ pub fn snapshot_batch(batch: &Batch) -> String { let Batch { uid, details, stats, started_at, finished_at, progress: _, enqueued_at } = batch; let stats = BatchStats { progress_trace: Default::default(), + internal_database_sizes: Default::default(), write_channel_congestion: None, ..stats.clone() }; diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 
fed26aeb7..09ce46884 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -64,6 +64,13 @@ make_enum_progress! { } } +make_enum_progress! { + pub enum FinalizingIndexStep { + Committing, + ComputingStats, + } +} + make_enum_progress! { pub enum TaskCancelationProgress { RetrievingTasks, diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 996b548c2..42de1d137 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -12,7 +12,7 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, UpdateIndexProgress, }; @@ -151,9 +151,10 @@ impl IndexScheduler { let mut index_wtxn = index.write_txn()?; let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = - self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; + self.apply_index_operation(&mut index_wtxn, &index, op, &progress)?; { + progress.update_progress(FinalizingIndexStep::Committing); let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); let _entered = span.enter(); @@ -166,6 +167,7 @@ impl IndexScheduler { // the entire batch. 
let mut post_commit_dabases_sizes = None; let res = || -> Result<()> { + progress.update_progress(FinalizingIndexStep::ComputingStats); let index_rtxn = index.read_txn()?; let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 690fe2efd..9b12d61cf 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -32,7 +32,7 @@ impl IndexScheduler { index_wtxn: &mut RwTxn<'i>, index: &'i Index, operation: IndexOperation, - progress: Progress, + progress: &Progress, ) -> Result<(Vec, Option)> { let indexer_alloc = Bump::new(); let started_processing_at = std::time::Instant::now(); @@ -186,7 +186,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?, ); @@ -307,7 +307,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, ); @@ -465,7 +465,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, ); @@ -520,7 +520,7 @@ impl IndexScheduler { index_uid: index_uid.clone(), tasks: cleared_tasks, }, - progress.clone(), + progress, )?; let (settings_tasks, _congestion) = self.apply_index_operation( diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 468963631..e955c6883 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -281,7 +281,8 @@ async fn test_summarized_document_addition_or_update() { 
".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -303,7 +304,8 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]" + "writeChannelCongestion": "[writeChannelCongestion]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -322,7 +324,8 @@ async fn test_summarized_document_addition_or_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -344,7 +347,8 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]" + "writeChannelCongestion": "[writeChannelCongestion]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -367,7 +371,8 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -388,7 +393,8 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", 
"startedAt": "[date]", @@ -407,7 +413,8 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -428,7 +435,8 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -453,7 +461,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -475,7 +484,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -495,7 +505,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -517,7 +528,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": 
"[date]", @@ -537,7 +549,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -559,7 +572,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -583,7 +597,8 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -604,7 +619,8 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -623,7 +639,8 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -644,7 +661,8 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -679,7 +697,8 @@ async fn 
test_summarized_settings_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -709,7 +728,8 @@ async fn test_summarized_settings_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -732,7 +752,8 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -750,7 +771,8 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -768,7 +790,8 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -788,7 +811,8 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -921,7 +945,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", 
".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -939,7 +964,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -957,7 +983,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -977,7 +1004,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -998,7 +1026,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -1016,7 +1045,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1034,7 +1064,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1054,7 +1085,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1080,7 +1112,8 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1105,7 +1138,8 @@ async fn test_summarized_index_swap() { "indexSwap": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1129,7 +1163,8 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1147,7 +1182,8 @@ async fn test_summarized_index_swap() { "indexUids": { "doggos": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1173,7 +1209,8 @@ async fn test_summarized_batch_cancelation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1193,7 +1230,8 @@ async fn test_summarized_batch_cancelation() { "taskCancelation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1219,7 +1257,8 @@ async fn test_summarized_batch_deletion() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1239,7 +1278,8 @@ async fn test_summarized_batch_deletion() { "taskDeletion": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1262,7 +1302,8 @@ async fn test_summarized_dump_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1280,7 +1321,8 @@ async fn test_summarized_dump_creation() { "dumpCreation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index ff0b027cb..fa05d9ec9 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2236,6 +2236,7 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { 
".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", + ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]", }), name: "batches"); diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap index b38340ef6..b2dea1f06 100644 --- a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -22,6 +22,7 @@ source: crates/meilisearch/tests/dumps/mod.rs "kefir": 1 }, "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]", "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", 
"startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index 623c1f778..3cfed9f74 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 11ba2882a..8157f0923 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -166,7 +166,7 @@ async fn check_the_index_scheduler(server: &Server) { let (tasks, _) = server.tasks_filter("limit=1000").await; snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "the_whole_task_queue_once_everything_has_been_processed"); let (batches, _) = server.batches_filter("limit=1000").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: 
"the_whole_batch_queue_once_everything_has_been_processed"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); // Tests all the tasks query parameters let (tasks, _) = server.tasks_filter("uids=10").await; @@ -193,26 +193,26 @@ async fn check_the_index_scheduler(server: &Server) { // Tests all the batches query parameters let (batches, _) = server.batches_filter("uids=10").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_uids_equal_10"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_uids_equal_10"); let (batches, _) = server.batches_filter("batchUids=10").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: 
"batches_filter_batchUids_equal_10"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_batchUids_equal_10"); let (batches, _) = server.batches_filter("statuses=canceled").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_statuses_equal_canceled"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_statuses_equal_canceled"); // types has already been tested above to retrieve the upgrade database let (batches, _) = server.batches_filter("canceledBy=19").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_canceledBy_equal_19"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", 
".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_canceledBy_equal_19"); let (batches, _) = server.batches_filter("beforeEnqueuedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterEnqueuedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", 
".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("beforeStartedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterStartedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterStartedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => 
"[writeChannelCongestion]" }), name: "batches_filter_afterStartedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("beforeFinishedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterFinishedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); let (stats, _) = 
server.stats().await; assert_json_snapshot!(stats, { From 3deb1ef78ff77d59fbb57bd6ba7eaf4c5e4fe0cb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 18:53:32 +0100 Subject: [PATCH 23/37] Fix the snapshots again --- crates/meilisearch/tests/batches/mod.rs | 102 ++++++------------ crates/meilisearch/tests/dumps/mod.rs | 2 +- .../batches.snap | 1 - ...rEnqueuedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...rFinishedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...erStartedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...ue_once_everything_has_been_processed.snap | 3 +- .../tests/upgrade/v1_12/v1_12_0.rs | 2 +- 8 files changed, 40 insertions(+), 79 deletions(-) diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index e955c6883..6c2aa4aaf 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -347,8 +347,7 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[duration]", "startedAt": "[date]", @@ -371,8 +370,7 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -393,8 +391,7 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -435,8 +432,7 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - 
"progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -461,8 +457,7 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -484,8 +479,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -528,8 +522,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -572,8 +565,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -597,8 +589,7 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r#" { @@ -619,8 +610,7 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": 
"[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -661,8 +651,7 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -728,8 +717,7 @@ async fn test_summarized_settings_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -752,8 +740,7 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -771,8 +758,7 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -790,8 +776,7 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -811,8 +796,7 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -945,8 +929,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" 
=> "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -964,8 +947,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -983,8 +965,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1004,8 +985,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1026,8 +1006,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r#" { @@ -1045,8 +1024,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1064,8 +1042,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1085,8 +1062,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1112,8 +1088,7 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1138,8 +1113,7 @@ async fn test_summarized_index_swap() { "indexSwap": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1163,8 +1137,7 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1182,8 +1155,7 @@ async fn test_summarized_index_swap() { "indexUids": { "doggos": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1209,8 +1181,7 @@ async fn test_summarized_batch_cancelation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => 
"[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1230,8 +1201,7 @@ async fn test_summarized_batch_cancelation() { "taskCancelation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1257,8 +1227,7 @@ async fn test_summarized_batch_deletion() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1278,8 +1247,7 @@ async fn test_summarized_batch_deletion() { "taskDeletion": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1302,8 +1270,7 @@ async fn test_summarized_dump_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1321,8 +1288,7 @@ async fn test_summarized_dump_creation() { "dumpCreation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index fa05d9ec9..addcbeeb5 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2236,8 +2236,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { 
".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", - ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", }), name: "batches"); let (indexes, code) = server.list_indexes(None, None).await; diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap index b2dea1f06..b38340ef6 100644 --- a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -22,7 +22,6 @@ source: crates/meilisearch/tests/dumps/mod.rs "kefir": 1 }, "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]", "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": 
"[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": 
"[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index 3cfed9f74..623c1f778 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 8157f0923..9fc4d0e5b 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -166,7 +166,7 @@ async fn check_the_index_scheduler(server: &Server) { let (tasks, _) = server.tasks_filter("limit=1000").await; snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "the_whole_task_queue_once_everything_has_been_processed"); let (batches, _) = server.batches_filter("limit=1000").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", 
".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); // Tests all the tasks query parameters let (tasks, _) = server.tasks_filter("uids=10").await; From 7ed9adde295840170b6cff351c335c30e1e9e1ab Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 26 Mar 2025 16:45:52 +0100 Subject: [PATCH 24/37] Prefer camelCase for internal database sizes db name --- crates/index-scheduler/src/scheduler/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index fe3084034..f0e324a8d 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -20,6 +20,7 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::sync::Arc; +use convert_case::{Case, Casing as _}; use meilisearch_types::error::ResponseError; use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::milli; @@ -381,7 +382,10 @@ impl IndexScheduler { Less => "-", }; - Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into())) + Some(( + dbname.to_case(Case::Camel), + format!("{post:#.2} ({sign}{diff:#.2})").into(), + )) }) .into_iter() .flatten() From db7ce037634989ebce6040c1d291d7022924b395 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 26 Mar 2025 17:13:09 +0100 Subject: [PATCH 25/37] Improve the performances of computing the size of the 
documents database --- crates/meilisearch/src/routes/indexes/mod.rs | 2 +- crates/milli/src/database_stats.rs | 77 +++++++------------ crates/milli/src/index.rs | 32 -------- .../milli/src/update/index_documents/mod.rs | 4 +- crates/milli/src/update/new/indexer/mod.rs | 1 - crates/milli/src/update/new/indexer/write.rs | 5 +- 6 files changed, 33 insertions(+), 88 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 5aebf5cac..48ed1cfb1 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -518,7 +518,7 @@ impl From for IndexStats { .inner_stats .number_of_documents .unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()), - raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_size(), avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index d97dc13ba..7da1fbd2b 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -1,8 +1,13 @@ -use heed::types::Bytes; +use std::mem; + use heed::Database; +use heed::DatabaseStat; use heed::RoTxn; +use heed::Unspecified; use serde::{Deserialize, Serialize}; +use crate::BEU32; + #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] #[serde(rename_all = "camelCase")] /// The stats of a database. @@ -20,58 +25,24 @@ impl DatabaseStats { /// /// This function iterates over the whole database and computes the stats. /// It is not efficient and should be cached somewhere. 
- pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { - let mut database_stats = - Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; + pub(crate) fn new( + database: Database, + rtxn: &RoTxn<'_>, + ) -> heed::Result { + let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } = + database.stat(rtxn)?; - let mut iter = database.iter(rtxn)?; - while let Some((key, value)) = iter.next().transpose()? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - database_stats.total_key_size += key_size; - database_stats.total_value_size += value_size; - } + // We first take the total size without overflow pages as the overflow pages contains the values and only that. + let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize; + // We compute an estimated size for the keys. + let total_key_size = entries * (mem::size_of::() + 4); + let total_value_size = total_size - total_key_size; - database_stats.number_of_entries = database.len(rtxn)?; - - Ok(database_stats) - } - - /// Recomputes the stats of the database and returns the new stats. - /// - /// This function is used to update the stats of the database when some keys are modified. - /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. - pub(crate) fn recompute( - mut stats: Self, - database: Database, - before_rtxn: &RoTxn<'_>, - after_rtxn: &RoTxn<'_>, - modified_keys: I, - ) -> heed::Result - where - I: IntoIterator, - K: AsRef<[u8]>, - { - for key in modified_keys { - let key = key.as_ref(); - if let Some(value) = database.get(after_rtxn, key)? 
{ - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_add(key_size); - stats.total_value_size = stats.total_value_size.saturating_add(value_size); - } - - if let Some(value) = database.get(before_rtxn, key)? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_sub(key_size); - stats.total_value_size = stats.total_value_size.saturating_sub(value_size); - } - } - - stats.number_of_entries = database.len(after_rtxn)?; - - Ok(stats) + Ok(Self { + number_of_entries: entries as u64, + total_key_size: total_key_size as u64, + total_value_size: total_value_size as u64, + }) } pub fn average_key_size(&self) -> u64 { @@ -86,6 +57,10 @@ impl DatabaseStats { self.number_of_entries } + pub fn total_size(&self) -> u64 { + self.total_key_size + self.total_value_size + } + pub fn total_key_size(&self) -> u64 { self.total_key_size } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index a2d839d03..5f74863e8 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -411,38 +411,6 @@ impl Index { Ok(count.unwrap_or_default()) } - /// Updates the stats of the documents database based on the previous stats and the modified docids. - pub fn update_documents_stats( - &self, - wtxn: &mut RwTxn<'_>, - modified_docids: roaring::RoaringBitmap, - ) -> Result<()> { - let before_rtxn = self.read_txn()?; - let document_stats = match self.documents_stats(&before_rtxn)? { - Some(before_stats) => DatabaseStats::recompute( - before_stats, - self.documents.remap_types(), - &before_rtxn, - wtxn, - modified_docids.iter().map(|docid| docid.to_be_bytes()), - )?, - None => { - // This should never happen when there are already documents in the index, the documents stats should be present. - // If it happens, it means that the index was not properly initialized/upgraded. 
- debug_assert_eq!( - self.documents.len(&before_rtxn)?, - 0, - "The documents stats should be present when there are documents in the index" - ); - tracing::warn!("No documents stats found, creating new ones"); - DatabaseStats::new(self.documents.remap_types(), &*wtxn)? - } - }; - - self.put_documents_stats(wtxn, document_stats)?; - Ok(()) - } - /// Writes the stats of the documents database. pub fn put_documents_stats( &self, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 95342054d..5d445d283 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -28,6 +28,7 @@ pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; +use crate::database_stats::DatabaseStats; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; use crate::index::{PrefixSearch, PrefixSettings}; @@ -476,7 +477,8 @@ where if !settings_diff.settings_update_only { // Update the stats of the documents database when there is a document update. 
- self.index.update_documents_stats(self.wtxn, modified_docids)?; + let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?; + self.index.put_documents_stats(self.wtxn, stats)?; } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 4f2dd19c9..d2a88f4ff 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -234,7 +234,6 @@ where embedders, field_distribution, document_ids, - modified_docids, )?; Ok(congestion) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 8618b4b21..7ab7991b2 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -7,6 +7,7 @@ use rand::SeedableRng as _; use time::OffsetDateTime; use super::super::channel::*; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; @@ -142,7 +143,6 @@ pub(super) fn update_index( embedders: EmbeddingConfigs, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, - modified_docids: roaring::RoaringBitmap, ) -> Result<()> { index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { @@ -153,7 +153,8 @@ pub(super) fn update_index( index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - index.update_documents_stats(wtxn, modified_docids)?; + let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; Ok(()) } From c670e9a39bcde2b0e415be7ddb639823410805b6 Mon Sep 17 00:00:00 
2001 From: Kerollmops Date: Wed, 26 Mar 2025 18:08:26 +0100 Subject: [PATCH 26/37] Make sure the snaps are happy --- .../tests/documents/delete_documents.rs | 27 ++-- crates/meilisearch/tests/dumps/mod.rs | 126 ++++++++++++------ crates/meilisearch/tests/stats/mod.rs | 72 ++++++---- .../tests/upgrade/v1_12/v1_12_0.rs | 25 ++-- 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 4dfe2cc79..060f17958 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -157,11 +157,14 @@ async fn delete_document_by_filter() { index.wait_task(task.uid()).await.succeeded(); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 4, - "rawDocumentDbSize": 42, - "avgDocumentSize": 10, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -208,11 +211,14 @@ async fn delete_document_by_filter() { "###); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 16, - "avgDocumentSize": 8, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -278,11 +284,14 @@ async fn delete_document_by_filter() { "###); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 12, - 
"avgDocumentSize": 12, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index addcbeeb5..e5aa52dc6 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -28,12 +28,15 @@ async fn import_dump_v1_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -185,12 +188,15 @@ async fn import_dump_v1_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -355,12 +361,15 @@ async fn import_dump_v1_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -522,12 +531,15 @@ async fn import_dump_v2_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, 
@"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -679,12 +691,15 @@ async fn import_dump_v2_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -846,12 +861,15 @@ async fn import_dump_v2_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1010,12 +1028,15 @@ async fn import_dump_v3_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1167,12 +1188,15 @@ async fn import_dump_v3_movie_with_settings() { let (stats, code) = index.stats().await; 
snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1334,12 +1358,15 @@ async fn import_dump_v3_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1498,12 +1525,15 @@ async fn import_dump_v4_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1655,12 +1685,15 @@ async fn import_dump_v4_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1822,12 +1855,15 @@ async fn import_dump_v4_rubygems_with_settings() { let (stats, code) 
= index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1994,11 +2030,14 @@ async fn import_dump_v5() { let (stats, code) = index1.stats().await; snapshot!(code, @"200 OK"); - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 10, - "rawDocumentDbSize": 6782, - "avgDocumentSize": 678, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -2031,12 +2070,15 @@ async fn import_dump_v5() { let (stats, code) = index2.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 10, - "rawDocumentDbSize": 6782, - "avgDocumentSize": 678, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index 20a8eaef6..aee626460 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -110,11 +110,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 
13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -135,11 +138,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, @@ -160,11 +166,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 2, @@ -186,11 +195,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 1, @@ -236,11 +248,14 @@ async fn add_remove_embedded_documents() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + 
".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -257,11 +272,14 @@ async fn add_remove_embedded_documents() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 13, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 1, @@ -290,11 +308,14 @@ async fn update_embedder_settings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 108, - "avgDocumentSize": 54, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -326,11 +347,14 @@ async fn update_embedder_settings() { server.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 108, - "avgDocumentSize": 54, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 
9fc4d0e5b..1b2ae054c 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -133,7 +133,9 @@ async fn check_the_index_scheduler(server: &Server) { let (stats, _) = server.stats().await; assert_json_snapshot!(stats, { ".databaseSize" => "[bytes]", - ".usedDatabaseSize" => "[bytes]" + ".usedDatabaseSize" => "[bytes]", + ".indexes.kefir.rawDocumentDbSize" => "[bytes]", + ".indexes.kefir.avgDocumentSize" => "[bytes]", }, @r###" { @@ -143,8 +145,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -217,7 +219,9 @@ async fn check_the_index_scheduler(server: &Server) { let (stats, _) = server.stats().await; assert_json_snapshot!(stats, { ".databaseSize" => "[bytes]", - ".usedDatabaseSize" => "[bytes]" + ".usedDatabaseSize" => "[bytes]", + ".indexes.kefir.rawDocumentDbSize" => "[bytes]", + ".indexes.kefir.avgDocumentSize" => "[bytes]", }, @r###" { @@ -227,8 +231,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -245,11 +249,14 @@ async fn check_the_index_scheduler(server: &Server) { "###); let index = server.index("kefir"); let (stats, _) = index.stats().await; - snapshot!(stats, @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[bytes]", + ".avgDocumentSize" => "[bytes]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, 
"numberOfEmbeddedDocuments": 0, From 811143cbe940db843be07954a91c4db49150ac57 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 27 Mar 2025 10:17:28 +0100 Subject: [PATCH 27/37] Add more progress precision when doing post processing --- .../src/update/new/indexer/post_processing.rs | 64 +++++++++++++++---- crates/milli/src/update/new/steps.rs | 20 ++++++ 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/update/new/indexer/post_processing.rs b/crates/milli/src/update/new/indexer/post_processing.rs index 2a01fccf3..aace70cff 100644 --- a/crates/milli/src/update/new/indexer/post_processing.rs +++ b/crates/milli/src/update/new/indexer/post_processing.rs @@ -7,12 +7,13 @@ use itertools::{merge_join_by, EitherOrBoth}; use super::document_changes::IndexingContext; use crate::facet::FacetType; use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::progress::Progress; use crate::update::del_add::DelAdd; use crate::update::facet::new_incremental::FacetsUpdateIncremental; use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::update::new::facet_search_builder::FacetSearchBuilder; use crate::update::new::merger::FacetFieldIdDelta; -use crate::update::new::steps::IndexingStep; +use crate::update::new::steps::{IndexingStep, PostProcessingFacets, PostProcessingWords}; use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use crate::update::new::words_prefix_docids::{ compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids, @@ -33,11 +34,23 @@ where { let index = indexing_context.index; indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); - compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?; - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + compute_facet_level_database( + index, + wtxn, + facet_field_ids_delta, + &mut 
global_fields_ids_map, + indexing_context.progress, + )?; + compute_facet_search_database(index, wtxn, global_fields_ids_map, indexing_context.progress)?; indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); - if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { - compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?; + if let Some(prefix_delta) = compute_word_fst(index, wtxn, indexing_context.progress)? { + compute_prefix_database( + index, + wtxn, + prefix_delta, + indexing_context.grenad_parameters, + indexing_context.progress, + )?; }; Ok(()) } @@ -48,21 +61,32 @@ fn compute_prefix_database( wtxn: &mut RwTxn, prefix_delta: PrefixDelta, grenad_parameters: &GrenadParameters, + progress: &Progress, ) -> Result<()> { let PrefixDelta { modified, deleted } = prefix_delta; - // Compute word prefix docids + + progress.update_progress(PostProcessingWords::WordPrefixDocids); compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute exact word prefix docids + + progress.update_progress(PostProcessingWords::ExactWordPrefixDocids); compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix fid docids + + progress.update_progress(PostProcessingWords::WordPrefixFieldIdDocids); compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix position docids + + progress.update_progress(PostProcessingWords::WordPrefixPositionDocids); compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters) } #[tracing::instrument(level = "trace", skip_all, target = "indexing")] -fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result> { +fn compute_word_fst( + index: &Index, + wtxn: &mut RwTxn, + progress: &Progress, +) -> Result> { let rtxn = index.read_txn()?; + progress.update_progress(PostProcessingWords::WordFst); + let words_fst = 
index.words_fst(&rtxn)?; let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; let prefix_settings = index.prefix_settings(&rtxn)?; @@ -112,8 +136,10 @@ fn compute_facet_search_database( index: &Index, wtxn: &mut RwTxn, global_fields_ids_map: GlobalFieldsIdsMap, + progress: &Progress, ) -> Result<()> { let rtxn = index.read_txn()?; + progress.update_progress(PostProcessingFacets::FacetSearch); // if the facet search is not enabled, we can skip the rest of the function if !index.facet_search(wtxn)? { @@ -171,10 +197,16 @@ fn compute_facet_level_database( wtxn: &mut RwTxn, mut facet_field_ids_delta: FacetFieldIdsDelta, global_fields_ids_map: &mut GlobalFieldsIdsMap, + progress: &Progress, ) -> Result<()> { let rtxn = index.read_txn()?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; - for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() { + let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_string_delta().collect(); + // We move all bulks at the front and incrementals (others) at the end. + deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 }); + + for (fid, delta) in deltas { // skip field ids that should not be facet leveled let Some(metadata) = global_fields_ids_map.metadata(fid) else { continue; @@ -187,11 +219,13 @@ fn compute_facet_level_database( let _entered = span.enter(); match delta { FacetFieldIdDelta::Bulk => { + progress.update_progress(PostProcessingFacets::StringsBulk); tracing::debug!(%fid, "bulk string facet processing"); FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String) .execute(wtxn)? 
} FacetFieldIdDelta::Incremental(delta_data) => { + progress.update_progress(PostProcessingFacets::StringsIncremental); tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing"); FacetsUpdateIncremental::new( index, @@ -207,16 +241,22 @@ fn compute_facet_level_database( } } - for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() { + let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_number_delta().collect(); + // We move all bulks at the front and incrementals (others) at the end. + deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 }); + + for (fid, delta) in deltas { let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); let _entered = span.enter(); match delta { FacetFieldIdDelta::Bulk => { + progress.update_progress(PostProcessingFacets::NumbersBulk); tracing::debug!(%fid, "bulk number facet processing"); FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number) .execute(wtxn)? } FacetFieldIdDelta::Incremental(delta_data) => { + progress.update_progress(PostProcessingFacets::NumbersIncremental); tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing"); FacetsUpdateIncremental::new( index, diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index da71819c6..eabf9104e 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -20,3 +20,23 @@ make_enum_progress! { Finalizing, } } + +make_enum_progress! { + pub enum PostProcessingFacets { + StringsBulk, + StringsIncremental, + NumbersBulk, + NumbersIncremental, + FacetSearch, + } +} + +make_enum_progress! 
{ + pub enum PostProcessingWords { + WordFst, + WordPrefixDocids, + ExactWordPrefixDocids, + WordPrefixFieldIdDocids, + WordPrefixPositionDocids, + } +} From 7707fb18dd0c9138721e7e4cfaeb96c363fe8e6c Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:51:36 +0100 Subject: [PATCH 28/37] add embedding with dimension mismatch test case --- crates/meilisearch/tests/vector/mod.rs | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 67da51702..c6f32ccc5 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -164,6 +164,56 @@ async fn add_remove_user_provided() { "###); } +#[actix_rt::test] +async fn user_provide_mismatched_embedding_dimension() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + 
} + "#); +} + async fn generate_default_user_provided_documents(server: &Server) -> Index { let index = server.index("doggo"); From 62de70b73c3f7ba7fdd62c102a8ac0edbd4de68b Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Wed, 26 Mar 2025 12:57:25 +0100 Subject: [PATCH 29/37] Document problematic case in test and acknowledge PR comment --- crates/meilisearch/tests/vector/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index c6f32ccc5..fd9c314e2 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -212,6 +212,14 @@ async fn user_provide_mismatched_embedding_dimension() { "finishedAt": "[date]" } "#); + + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass + let new_document = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, + ]); + let (value, code) = index.add_documents(new_document, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 0e475cb5e649fb2b4b78a263f423b6d0ca74b31e Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Mar 2025 11:07:01 +0100 Subject: [PATCH 30/37] fix warn and show what meilisearch understood of the vectors in the cursed test --- crates/meilisearch/tests/vector/mod.rs | 35 +++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index fd9c314e2..14474c210 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -217,9 +217,42 @@ async fn user_provide_mismatched_embedding_dimension() { let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); - let (value, code) = 
index.add_documents(new_document, None).await; + let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); index.wait_task(response.uid()).await.succeeded(); + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ], + [ + 1.0, + 2.0, + 2.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 94ea263befc7f5e49ccbed6c27146dbb331dc95d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:28 +0200 Subject: [PATCH 31/37] Add new error for dimensions mismatch during indexing --- crates/meilisearch-types/src/error.rs | 5 ++++- crates/milli/src/error.rs | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 859563d8a..6c547d51e 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -454,7 +454,10 @@ impl ErrorCode for milli::Error { } UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, - UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, + UserError::InvalidVectorDimensions { .. } + | UserError::InvalidIndexingVectorDimensions { .. } => { + Code::InvalidVectorDimensions + } UserError::InvalidVectorsMapType { .. } | UserError::InvalidVectorsEmbedderConf { .. 
} => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index e1098cfa5..e61283e4c 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidGeoField(#[from] GeoError), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, + #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")] + InvalidIndexingVectorDimensions { + embedder_name: String, + document_id: String, + embedding_index: usize, + expected: usize, + found: usize, + }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] From f72986446668e9ea504b79d55e7e8505b00c0685 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:57 +0200 Subject: [PATCH 32/37] Check dimension mismatch at insertion time --- .../src/update/new/extract/vectors/mod.rs | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 6820ee67b..696864e7f 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { // do we have set embeddings? 
if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + update.external_document_id(), update.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { document_id: update.external_document_id().to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(), @@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + insertion.external_document_id(), insertion.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { .to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let rendered = prompt.render_document( insertion.external_document_id(), @@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> { embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, + dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, @@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); + let dimensions = embedder.dimensions(); Self { texts, ids, @@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_name, user_provided, has_manual_generation: None, + dimensions, } } @@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } } - fn set_vectors(&self, docid: DocumentId, embeddings: Vec) { + fn set_vectors( + &self, + 
external_docid: &'a str, + docid: DocumentId, + embeddings: Vec, + ) -> Result<()> { + for (embedding_index, embedding) in embeddings.iter().enumerate() { + if embedding.len() != self.dimensions { + return Err(UserError::InvalidIndexingVectorDimensions { + expected: self.dimensions, + found: embedding.len(), + embedder_name: self.embedder_name.to_string(), + document_id: external_docid.to_string(), + embedding_index, + } + .into()); + } + } self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + Ok(()) } } From 08ff135ad6c48d4936e4a45bc86219be208ff273 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 15:26:31 +0200 Subject: [PATCH 33/37] Fix test --- crates/meilisearch/tests/vector/mod.rs | 60 +++++++++++--------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 14474c210..5e34a4c23 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -188,7 +188,7 @@ async fn user_provide_mismatched_embedding_dimension() { let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; - snapshot!(task, @r#" + snapshot!(task, @r###" { "uid": "[uid]", "batchUid": "[batch_uid]", @@ -201,7 +201,7 @@ async fn user_provide_mismatched_embedding_dimension() { "indexedDocuments": 0 }, "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", "code": "invalid_vector_dimensions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" @@ -211,46 +211,36 @@ async fn user_provide_mismatched_embedding_dimension() { "startedAt": "[date]", "finishedAt": "[date]" } - "#); + 
"###); - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(response.uid()).await.succeeded(); - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" + let task = index.wait_task(response.uid()).await; + snapshot!(task, @r###" { - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": { - "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 1.0 - ], - [ - 1.0, - 2.0, - 2.0 - ] - ], - "regenerate": false - } - } - } - ], - "offset": 0, - "limit": 20, - "total": 1 + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" } "###); } From 0656a0d515044f72f9043e42e28331a0fe4fd8cf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Apr 2025 14:25:27 +0200 Subject: [PATCH 34/37] Optimize roaring operation Co-authored-by: Many the fish --- crates/milli/src/search/new/bucket_sort.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index a659dd226..ca7a4a986 100644 --- 
a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -252,8 +252,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( || is_below_threshold { if is_below_threshold { - all_candidates -= - next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + all_candidates -= &next_bucket.candidates; + all_candidates -= &ranking_rule_universes[cur_ranking_rule_index]; } else { maybe_add_to_results!(next_bucket.candidates); } From 1db550ec7f46984a3532eee859672a449389bc68 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 3 Apr 2025 15:47:56 +0200 Subject: [PATCH 35/37] make meilisearch accept cancelation tasks even when the disk is full --- crates/index-scheduler/src/lib.rs | 4 +-- crates/index-scheduler/src/queue/mod.rs | 2 -- crates/index-scheduler/src/queue/test.rs | 33 ++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 5c8517650..99f62983a 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -625,8 +625,8 @@ impl IndexScheduler { task_id: Option, dry_run: bool, ) -> Result { - // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task - if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty()) + // if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incomming task + if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty()) && (self.env.non_free_pages_size()? 
* 100) / self.env.info().map_size as u64 > 40 { return Err(Error::NoSpaceLeftInTaskQueue); diff --git a/crates/index-scheduler/src/queue/mod.rs b/crates/index-scheduler/src/queue/mod.rs index b13e3ffe2..92de10fe1 100644 --- a/crates/index-scheduler/src/queue/mod.rs +++ b/crates/index-scheduler/src/queue/mod.rs @@ -292,8 +292,6 @@ impl Queue { return Ok(task); } - // Get rid of the mutability. - let task = task; self.tasks.register(wtxn, &task)?; Ok(task) diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs index 3dbdd2db3..91f412025 100644 --- a/crates/index-scheduler/src/queue/test.rs +++ b/crates/index-scheduler/src/queue/test.rs @@ -364,7 +364,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // Even the task deletion that doesn't delete anything shouldn't be accepted + // Even the task deletion and cancelation that don't delete anything shouldn be refused let result = index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, @@ -373,10 +373,39 @@ fn test_task_queue_is_full() { ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + let result = index_scheduler + .register( + KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. 
Please delete tasks to continue performing write operations."); + // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // But a task deletion that delete something should works + // But a task cancelation that cancel something should works + index_scheduler + .register( + KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // But we should still be forbidden from enqueuing new tasks + let result = index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + + // And a task deletion that delete something should works index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, From 796a325972acffcce682f39be747653db83dd71d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 3 Apr 2025 15:53:42 +0200 Subject: [PATCH 36/37] Fix typos Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- crates/index-scheduler/src/lib.rs | 2 +- crates/index-scheduler/src/queue/test.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 99f62983a..9052b92f1 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -625,7 +625,7 @@ impl IndexScheduler { task_id: Option, dry_run: bool, ) -> Result { - // if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incomming task + // if the task doesn't delete or 
cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty()) && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40 { diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs index 91f412025..7582da0d6 100644 --- a/crates/index-scheduler/src/queue/test.rs +++ b/crates/index-scheduler/src/queue/test.rs @@ -364,7 +364,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // Even the task deletion and cancelation that don't delete anything shouldn be refused + // Even the task deletion and cancelation that don't delete anything should be refused let result = index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, @@ -385,7 +385,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // But a task cancelation that cancel something should works + // But a task cancelation that cancel something should work index_scheduler .register( KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() }, From 61db56f7856280946b1fb8da0c23d8f12a5308c6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 14 Apr 2025 14:55:57 +0200 Subject: [PATCH 37/37] remove duplicated test --- crates/meilisearch/tests/vector/mod.rs | 91 -------------------------- 1 file changed, 91 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 
e0fde8660..98555dfac 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -164,97 +164,6 @@ async fn add_remove_user_provided() { "#); } -#[actix_rt::test] -async fn user_provide_mismatched_embedding_dimension() { - let server = Server::new().await; - let index = server.index("doggo"); - - let (response, code) = index - .update_settings(json!({ - "embedders": { - "manual": { - "source": "userProvided", - "dimensions": 3, - } - }, - })) - .await; - snapshot!(code, @"202 Accepted"); - server.wait_task(response.uid()).await.succeeded(); - - let documents = json!([ - {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, - ]); - let (value, code) = index.add_documents(documents, None).await; - snapshot!(code, @"202 Accepted"); - let task = index.wait_task(value.uid()).await; - snapshot!(task, @r#" - { - "uid": "[uid]", - "batchUid": "[batch_uid]", - "indexUid": "doggo", - "status": "failed", - "type": "documentAdditionOrUpdate", - "canceledBy": null, - "details": { - "receivedDocuments": 1, - "indexedDocuments": 0 - }, - "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", - "code": "invalid_vector_dimensions", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" - }, - "duration": "[duration]", - "enqueuedAt": "[date]", - "startedAt": "[date]", - "finishedAt": "[date]" - } - "#); - - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass - let new_document = json!([ - {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, - ]); - let (response, code) = index.add_documents(new_document, None).await; - snapshot!(code, @"202 Accepted"); - index.wait_task(response.uid()).await.succeeded(); - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" - 
{ - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": { - "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 1.0 - ], - [ - 1.0, - 2.0, - 2.0 - ] - ], - "regenerate": false - } - } - } - ], - "offset": 0, - "limit": 20, - "total": 1 - } - "###); -} - #[actix_rt::test] async fn user_provide_mismatched_embedding_dimension() { let server = Server::new().await;