From 3ed43f9097c51eb6ceedced7ec0be54c0569179f Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 3 Mar 2025 11:11:53 +0100 Subject: [PATCH 01/37] add a failing test reproducing the bug --- .../tests/documents/add_documents.rs | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/crates/meilisearch/tests/documents/add_documents.rs b/crates/meilisearch/tests/documents/add_documents.rs index ad8bae19f..8c05cd177 100644 --- a/crates/meilisearch/tests/documents/add_documents.rs +++ b/crates/meilisearch/tests/documents/add_documents.rs @@ -1897,11 +1897,11 @@ async fn update_documents_with_geo_field() { }, { "id": "3", - "_geo": { "lat": 1, "lng": 1 }, + "_geo": { "lat": 3, "lng": 0 }, }, { "id": "4", - "_geo": { "lat": "1", "lng": "1" }, + "_geo": { "lat": "4", "lng": "0" }, }, ]); @@ -1928,9 +1928,7 @@ async fn update_documents_with_geo_field() { } "###); - let (response, code) = index - .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) - .await; + let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await; snapshot!(code, @"200 OK"); // we are expecting docs 4 and 3 first as they have geo snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @@ -1940,18 +1938,18 @@ async fn update_documents_with_geo_field() { { "id": "4", "_geo": { - "lat": "1", - "lng": "1" + "lat": "4", + "lng": "0" }, - "_geoDistance": 5522018 + "_geoDistance": 667170 }, { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 3, + "lng": 0 }, - "_geoDistance": 5522018 + "_geoDistance": 778364 }, { "id": "1" @@ -1969,10 +1967,13 @@ async fn update_documents_with_geo_field() { } "###); - let updated_documents = json!([{ - "id": "3", - "doggo": "kefir", - }]); + let updated_documents = json!([ + { + "id": "3", + "doggo": "kefir", + "_geo": { "lat": 5, "lng": 0 }, + } + ]); let (task, _status_code) = index.update_documents(updated_documents, None).await; let response = 
index.wait_task(task.uid()).await; snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }), @@ -2012,16 +2013,16 @@ async fn update_documents_with_geo_field() { { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 5, + "lng": 0 }, "doggo": "kefir" }, { "id": "4", "_geo": { - "lat": "1", - "lng": "1" + "lat": "4", + "lng": "0" } } ], @@ -2031,31 +2032,29 @@ async fn update_documents_with_geo_field() { } "###); - let (response, code) = index - .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]})) - .await; + let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await; snapshot!(code, @"200 OK"); // the search response should not have changed: we are expecting docs 4 and 3 first as they have geo snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###" { "hits": [ - { - "id": "4", - "_geo": { - "lat": "1", - "lng": "1" - }, - "_geoDistance": 5522018 - }, { "id": "3", "_geo": { - "lat": 1, - "lng": 1 + "lat": 5, + "lng": 0 }, "doggo": "kefir", - "_geoDistance": 5522018 + "_geoDistance": 555975 + }, + { + "id": "4", + "_geo": { + "lat": "4", + "lng": "0" + }, + "_geoDistance": 667170 }, { "id": "1" From d3cd5ea68924430c0bd0a10f0059dd6c4bd2cf4e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 3 Mar 2025 14:45:57 +0100 Subject: [PATCH 02/37] Check if the geo fields changed additionally to the other faceted fields when reindexing facets --- .../milli/src/update/new/document_change.rs | 26 ++++++++++++++++++- .../new/extract/faceted/extract_facets.rs | 7 +++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 38369a4d7..8a8ac4bb3 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,5 +1,6 @@ use bumpalo::Bump; use 
heed::RoTxn; +use serde_json::Value; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, @@ -10,7 +11,7 @@ use super::vector_document::{ use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; -use crate::{DocumentId, Index, Result}; +use crate::{DocumentId, Index, InternalError, Result}; pub enum DocumentChange<'doc> { Deletion(Deletion<'doc>), @@ -243,6 +244,29 @@ impl<'doc> Update<'doc> { Ok(has_deleted_fields) } + /// Returns `true` if the geo fields have changed. + pub fn has_changed_for_geo_fields<'t, Mapper: FieldIdMapper>( + &self, + rtxn: &'t RoTxn, + index: &'t Index, + mapper: &'t Mapper, + ) -> Result { + let current = self.current(rtxn, index, mapper)?; + let current_geo = current.geo_field()?; + let updated_geo = self.only_changed_fields().geo_field()?; + match (current_geo, updated_geo) { + (Some(current_geo), Some(updated_geo)) => { + let current: Value = + serde_json::from_str(current_geo.get()).map_err(InternalError::SerdeJson)?; + let updated: Value = + serde_json::from_str(updated_geo.get()).map_err(InternalError::SerdeJson)?; + Ok(current != updated) + } + (None, None) => Ok(false), + _ => Ok(true), + } + } + pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index b3aa8f984..1b08307a2 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -117,7 +117,7 @@ impl FacetedDocidsExtractor { }, ), DocumentChange::Update(inner) => { - if !inner.has_changed_for_fields( + let has_changed = inner.has_changed_for_fields( &mut |field_name| { match_faceted_field( field_name, @@ -130,7 +130,10 @@ impl FacetedDocidsExtractor { rtxn, index, context.db_fields_ids_map, - )? 
{ + )?; + let has_changed_for_geo_fields = + inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?; + if !has_changed && !has_changed_for_geo_fields { return Ok(()); } From d3e4b2dfe77df6ac1cbff6346867c76866dea074 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 14 Mar 2025 13:07:51 +0100 Subject: [PATCH 03/37] Accept total batch size in human size --- crates/meilisearch/src/analytics/segment_analytics.rs | 3 ++- crates/meilisearch/src/lib.rs | 2 +- crates/meilisearch/src/option.rs | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index a681e9e29..c428aa2b8 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -326,7 +326,8 @@ impl Infos { http_addr: http_addr != default_http_addr(), http_payload_size_limit, experimental_max_number_of_batched_tasks, - experimental_limit_batched_tasks_total_size, + experimental_limit_batched_tasks_total_size: + experimental_limit_batched_tasks_total_size.into(), task_queue_webhook: task_webhook_url.is_some(), task_webhook_authorization_header: task_webhook_authorization_header.is_some(), log_level: log_level.to_string(), diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 1841d5556..7dd1b58b4 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -228,7 +228,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc, Arc< cleanup_enabled: !opt.experimental_replication_parameters, max_number_of_tasks: 1_000_000, max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks, - batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size, + batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(), index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize, index_count: 
DEFAULT_INDEX_COUNT, instance_features: opt.to_instance_features(), diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index acf4393d3..6364f49d8 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -444,7 +444,7 @@ pub struct Opt { /// see: #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())] #[serde(default = "default_limit_batched_tasks_total_size")] - pub experimental_limit_batched_tasks_total_size: u64, + pub experimental_limit_batched_tasks_total_size: Byte, #[serde(flatten)] #[clap(flatten)] @@ -944,8 +944,8 @@ fn default_limit_batched_tasks() -> usize { usize::MAX } -fn default_limit_batched_tasks_total_size() -> u64 { - u64::MAX +fn default_limit_batched_tasks_total_size() -> Byte { + Byte::from_u64(u64::MAX) } fn default_snapshot_dir() -> PathBuf { From cb16baab18cbd8abce27207fe065f9f8b3b70687 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 16 Mar 2025 19:15:39 +0100 Subject: [PATCH 04/37] Add more progress levels to measure merging --- crates/milli/src/progress.rs | 14 ++++++++++++-- crates/milli/src/update/new/indexer/extract.rs | 16 +++++++++++++++- crates/milli/src/update/new/merger.rs | 11 ++--------- crates/milli/src/update/new/steps.rs | 6 ++++++ 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 7eb0cbd6b..75dafa8ec 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -190,8 +190,18 @@ macro_rules! make_atomic_progress { }; } -make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); -make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); +make_atomic_progress!(Document alias AtomicDocumentStep => "document"); +make_atomic_progress!(Payload alias AtomicPayloadStep => "payload"); + +make_enum_progress! 
{ + pub enum MergingWordCache { + WordDocids, + WordFieldIdDocids, + ExactWordDocids, + WordPositionDocids, + FieldIdWordCountDocids, + } +} #[derive(Debug, Serialize, Clone, ToSchema)] #[serde(rename_all = "camelCase")] diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index 907a4d1df..bb36ddc37 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -13,6 +13,7 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use crate::index::IndexEmbeddingConfig; +use crate::progress::MergingWordCache; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::merger::merge_and_send_rtree; @@ -96,6 +97,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); let _entered = span.enter(); + indexing_context.progress.update_progress(IndexingStep::MergingFacetCaches); facet_field_ids_delta = merge_and_send_facet_docids( caches, @@ -117,7 +119,6 @@ where } = { let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); let _entered = span.enter(); - WordDocidsExtractors::run_extraction( document_changes, indexing_context, @@ -126,9 +127,13 @@ where )? 
}; + indexing_context.progress.update_progress(IndexingStep::MergingWordCaches); + { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordDocids); + merge_and_send_docids( word_docids, index.word_docids.remap_types(), @@ -142,6 +147,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids); + merge_and_send_docids( word_fid_docids, index.word_fid_docids.remap_types(), @@ -155,6 +162,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids); + merge_and_send_docids( exact_word_docids, index.exact_word_docids.remap_types(), @@ -168,6 +177,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids); + merge_and_send_docids( word_position_docids, index.word_position_docids.remap_types(), @@ -181,6 +192,8 @@ where let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids); + merge_and_send_docids( fid_word_count_docids, index.field_id_word_count_docids.remap_types(), @@ -210,6 +223,7 @@ where { let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); let _entered = span.enter(); + indexing_context.progress.update_progress(IndexingStep::MergingWordProximity); merge_and_send_docids( caches, diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 
090add6bd..15f06c67d 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -82,14 +82,8 @@ where merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { - Operation::Write(bitmap) => { - docids_sender.write(key, &bitmap)?; - Ok(()) - } - Operation::Delete => { - docids_sender.delete(key)?; - Ok(()) - } + Operation::Write(bitmap) => docids_sender.write(key, &bitmap), + Operation::Delete => docids_sender.delete(key), Operation::Ignore => Ok(()), } }) @@ -130,7 +124,6 @@ pub fn merge_and_send_facet_docids<'extractor>( Operation::Ignore => Ok(()), } })?; - Ok(facet_field_ids_delta) }) .reduce( diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index ad8fe9cb1..e026b4d0d 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -13,6 +13,9 @@ pub enum IndexingStep { ExtractingWords, ExtractingWordProximity, ExtractingEmbeddings, + MergingFacetCaches, + MergingWordCaches, + MergingWordProximity, WritingGeoPoints, WaitingForDatabaseWrites, WaitingForExtractors, @@ -31,6 +34,9 @@ impl Step for IndexingStep { IndexingStep::ExtractingWords => "extracting words", IndexingStep::ExtractingWordProximity => "extracting word proximity", IndexingStep::ExtractingEmbeddings => "extracting embeddings", + IndexingStep::MergingFacetCaches => "merging facet caches", + IndexingStep::MergingWordCaches => "merging word caches", + IndexingStep::MergingWordProximity => "merging word proximity", IndexingStep::WritingGeoPoints => "writing geo points", IndexingStep::WaitingForDatabaseWrites => "waiting for database writes", IndexingStep::WaitingForExtractors => "waiting for extractors", From 49dd50dab2ec70155f781da47a111a543faf5e3c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 17 Mar 2025 11:29:17 +0100 Subject: [PATCH 05/37] Bump ring to v0.17.14 to compile on old 
aarch64 --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 293d17045..59718aca4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3018,7 +3018,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.48.1", ] [[package]] @@ -4886,9 +4886,9 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.13" +version = "0.17.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" dependencies = [ "cc", "cfg-if", From e2156ddfc78f9dc11bcb06357323959a1431b06b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 17 Mar 2025 11:40:50 +0100 Subject: [PATCH 06/37] Simplify the IndexingStep progress enum --- crates/milli/src/update/new/steps.rs | 74 +++++++--------------------- 1 file changed, 19 insertions(+), 55 deletions(-) diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index e026b4d0d..da71819c6 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -1,58 +1,22 @@ -use std::borrow::Cow; +use crate::make_enum_progress; -use enum_iterator::Sequence; - -use crate::progress::Step; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] -#[repr(u8)] -pub enum IndexingStep { - PreparingPayloads, - ExtractingDocuments, - ExtractingFacets, - ExtractingWords, - ExtractingWordProximity, - ExtractingEmbeddings, - MergingFacetCaches, - MergingWordCaches, - MergingWordProximity, - WritingGeoPoints, - WaitingForDatabaseWrites, - WaitingForExtractors, - WritingEmbeddingsToDatabase, - PostProcessingFacets, - PostProcessingWords, - Finalizing, -} - -impl Step for IndexingStep { - fn name(&self) -> Cow<'static, 
str> { - match self { - IndexingStep::PreparingPayloads => "preparing update file", - IndexingStep::ExtractingDocuments => "extracting documents", - IndexingStep::ExtractingFacets => "extracting facets", - IndexingStep::ExtractingWords => "extracting words", - IndexingStep::ExtractingWordProximity => "extracting word proximity", - IndexingStep::ExtractingEmbeddings => "extracting embeddings", - IndexingStep::MergingFacetCaches => "merging facet caches", - IndexingStep::MergingWordCaches => "merging word caches", - IndexingStep::MergingWordProximity => "merging word proximity", - IndexingStep::WritingGeoPoints => "writing geo points", - IndexingStep::WaitingForDatabaseWrites => "waiting for database writes", - IndexingStep::WaitingForExtractors => "waiting for extractors", - IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", - IndexingStep::PostProcessingFacets => "post-processing facets", - IndexingStep::PostProcessingWords => "post-processing words", - IndexingStep::Finalizing => "finalizing", - } - .into() - } - - fn current(&self) -> u32 { - *self as u32 - } - - fn total(&self) -> u32 { - Self::CARDINALITY as u32 +make_enum_progress! 
{ + pub enum IndexingStep { + PreparingPayloads, + ExtractingDocuments, + ExtractingFacets, + ExtractingWords, + ExtractingWordProximity, + ExtractingEmbeddings, + MergingFacetCaches, + MergingWordCaches, + MergingWordProximity, + WritingGeoPoints, + WaitingForDatabaseWrites, + WaitingForExtractors, + WritingEmbeddingsToDatabase, + PostProcessingFacets, + PostProcessingWords, + Finalizing, } } From b0b1888ef9052fe5dd049945b7ea5e8427510fcc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Mar 2025 15:57:35 +0100 Subject: [PATCH 07/37] Add test --- crates/meilisearch/tests/search/mod.rs | 140 +++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/crates/meilisearch/tests/search/mod.rs b/crates/meilisearch/tests/search/mod.rs index d7a09b58e..f6e79dbb9 100644 --- a/crates/meilisearch/tests/search/mod.rs +++ b/crates/meilisearch/tests/search/mod.rs @@ -1783,6 +1783,146 @@ async fn test_nested_fields() { .await; } +#[actix_rt::test] +async fn test_typo_settings() { + let documents = json!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ]); + + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "typoTolerance": { + "enabled": true, + "disableOnAttributes": ["title"] + } + }), + &json!({"q": "document"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 
2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + ] + "###); + }, + ) + .await; + + // Test prefix search + test_settings_documents_indexing_swapping_and_search( + &documents, + &json!({ + "searchableAttributes": ["title", "nested.object", "nested.machin"], + "typoTolerance": { + "enabled": true, + "disableOnAttributes": ["title"] + } + }), + &json!({"q": "docume"}), + |response, code| { + assert_eq!(code, 200, "{}", response); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "id": 0, + "title": "The zeroth document" + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule" + } + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field" + }, + { + "prout": "truc", + "machin": "lol" + } + ] + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied" + } + ] + "###); + }, + ) + .await; +} + /// Modifying facets with different casing should work correctly #[actix_rt::test] async fn change_facet_casing() { From bf144a94d8416eed26bfe15d1627834550f72139 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Mar 2025 15:44:41 +0100 Subject: [PATCH 08/37] No more use FST to find a word without any typo --- crates/milli/src/index.rs | 13 ++ .../new/query_term/compute_derivations.rs | 116 ++++++++---------- 2 files changed, 61 insertions(+), 68 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 771d32175..fcb8962d2 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1755,6 +1755,19 @@ impl Index { } Ok(stats) } + + /// Check if the word is indexed in the index. + /// + /// This function checks if the word is indexed in the index by looking at the word_docids and exact_word_docids. 
+ /// + /// # Arguments + /// + /// * `rtxn`: The read transaction. + /// * `word`: The word to check. + pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result { + Ok(self.word_docids.remap_data_type::().get(rtxn, word)?.is_some() + || self.exact_word_docids.remap_data_type::().get(rtxn, word)?.is_some()) + } } #[derive(Debug, Deserialize, Serialize)] diff --git a/crates/milli/src/search/new/query_term/compute_derivations.rs b/crates/milli/src/search/new/query_term/compute_derivations.rs index 79cd830ca..3caecb69e 100644 --- a/crates/milli/src/search/new/query_term/compute_derivations.rs +++ b/crates/milli/src/search/new/query_term/compute_derivations.rs @@ -1,10 +1,12 @@ use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::BTreeSet; use std::ops::ControlFlow; use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; +use fst::{IntoStreamer, Streamer}; use heed::types::DecodeIgnore; +use itertools::{merge_join_by, EitherOrBoth}; use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; @@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum NumberOfTypos { - Zero, One, Two, } -pub enum ZeroOrOneTypo { - Zero, - One, -} - impl Interned { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> { let s = ctx.term_interner.get_mut(self); @@ -47,34 +43,45 @@ impl Interned { } fn find_zero_typo_prefix_derivations( + ctx: &mut SearchContext<'_>, word_interned: Interned, - fst: fst::Set>, - word_interner: &mut DedupInterner, mut visit: impl FnMut(Interned) -> Result>, ) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); + let word = ctx.word_interner.get(word_interned).to_owned(); let word = word.as_str(); - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - while let Some(derived_word) = 
stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = word_interner.insert(derived_word); - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned)?; - if cf.is_break() { - break; + let words = + ctx.index.word_docids.remap_data_type::().prefix_iter(ctx.txn, word)?; + let exact_words = + ctx.index.exact_word_docids.remap_data_type::().prefix_iter(ctx.txn, word)?; + + for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) { + (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word), + (Err(_), _) | (_, Err(_)) => Ordering::Equal, + }) { + match eob { + EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => { + let (derived_word, _) = kv?; + let derived_word = derived_word.to_string(); + let derived_word_interned = ctx.word_interner.insert(derived_word); + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } + } } } } + Ok(()) } -fn find_zero_one_typo_derivations( +fn find_one_typo_derivations( ctx: &mut SearchContext<'_>, word_interned: Interned, is_prefix: bool, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, + mut visit: impl FnMut(Interned) -> Result>, ) -> Result<()> { let fst = ctx.get_words_fst()?; let word = ctx.word_interner.get(word_interned).to_owned(); @@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations( let derived_word = ctx.word_interner.insert(derived_word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { - 0 => { - if derived_word != word_interned { - let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; - if cf.is_break() { - break; - } - } - } + 0 => (), 1 => { - let cf = visit(derived_word, ZeroOrOneTypo::One)?; + let cf = visit(derived_word)?; if cf.is_break() { break; } @@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations( Ok(()) } -fn find_zero_one_two_typo_derivations( +fn find_one_two_typo_derivations( word_interned: 
Interned, is_prefix: bool, fst: fst::Set>, @@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations( // correct distance let d = second_dfa.distance((state.1).0); match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; - if cf.is_break() { - break; - } - } - } + 0 => (), 1 => { let cf = visit(derived_word_interned, NumberOfTypos::One)?; if cf.is_break() { @@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word( }); } - let fst = ctx.index.words_fst(ctx.txn)?; - let use_prefix_db = is_prefix && (ctx .index @@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word( let mut zero_typo = None; let mut prefix_of = BTreeSet::new(); - if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() { + if ctx.index.contains_word(ctx.txn, word)? { zero_typo = Some(word_interned); } if is_prefix && use_prefix_db.is_none() { - find_zero_typo_prefix_derivations( - word_interned, - fst, - &mut ctx.word_interner, - |derived_word| { - if prefix_of.len() < limits::MAX_PREFIX_COUNT { - prefix_of.insert(derived_word); - Ok(ControlFlow::Continue(())) - } else { - Ok(ControlFlow::Break(())) - } - }, - )?; + find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| { + if prefix_of.len() < limits::MAX_PREFIX_COUNT { + prefix_of.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) + } + })?; } let synonyms = ctx.index.synonyms(ctx.txn)?; let mut synonym_word_count = 0; @@ -295,18 +281,13 @@ impl Interned { let mut one_typo_words = BTreeSet::new(); if *max_nbr_typos > 0 { - find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } - } + find_one_typo_derivations(ctx, original, 
is_prefix, |derived_word| { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) } - Ok(ControlFlow::Continue(())) })?; } @@ -357,7 +338,7 @@ impl Interned { let mut two_typo_words = BTreeSet::new(); if *max_nbr_typos > 0 { - find_zero_one_two_typo_derivations( + find_one_two_typo_derivations( *original, *is_prefix, ctx.index.words_fst(ctx.txn)?, @@ -370,7 +351,6 @@ impl Interned { return Ok(ControlFlow::Break(())); } match nbr_typos { - NumberOfTypos::Zero => {} NumberOfTypos::One => { if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { one_typo_words.insert(derived_word); From 69678ed8e17d62f365b65d0923e2ebaf122c33b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Mar 2025 00:19:49 +0000 Subject: [PATCH 09/37] Bump zip from 2.2.2 to 2.3.0 Bumps [zip](https://github.com/zip-rs/zip2) from 2.2.2 to 2.3.0. - [Release notes](https://github.com/zip-rs/zip2/releases) - [Changelog](https://github.com/zip-rs/zip2/blob/master/CHANGELOG.md) - [Commits](https://github.com/zip-rs/zip2/compare/v2.2.2...v2.3.0) --- updated-dependencies: - dependency-name: zip dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 128 ++++++++++++++++++++++++---------- crates/meilisearch/Cargo.toml | 2 +- 2 files changed, 93 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 293d17045..431a3c534 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,7 +258,7 @@ version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", ] @@ -271,7 +271,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom", + "getrandom 0.2.15", "once_cell", "version_check", "zerocopy", @@ -790,22 +790,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.4.4" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2-sys" -version = "0.1.11+1.0.8" +version = "0.1.13+1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" dependencies = [ "cc", - "libc", "pkg-config", ] @@ -1143,7 +1141,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom", + "getrandom 0.2.15", "once_cell", "tiny-keccak", ] @@ -2216,10 +2214,24 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "wasm-bindgen", + "windows-targets 0.52.6", +] + [[package]] name = "gimli" version = "0.27.3" @@ -2923,10 +2935,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -3518,6 +3531,17 @@ dependencies = [ "crc", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "macro_rules_attribute" version = "0.2.0" @@ -3656,7 +3680,7 @@ dependencies = [ "uuid", "wiremock", "yaup", - "zip 2.2.2", + "zip 2.3.0", ] [[package]] @@ -3882,7 +3906,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.48.0", ] @@ -3893,7 +3917,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -4670,7 +4694,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -4762,7 +4786,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "redox_syscall 0.2.16", "thiserror 1.0.69", ] @@ -4892,7 +4916,7 @@ checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "untrusted", "windows-sys 0.52.0", @@ -5576,7 +5600,7 @@ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", - "getrandom", + "getrandom 0.2.15", "once_cell", "rustix", "windows-sys 0.52.0", @@ -5751,7 +5775,7 @@ dependencies = [ "aho-corasick", "derive_builder 0.12.0", "esaxx-rs", - "getrandom", + "getrandom 0.2.15", "itertools 0.12.1", "lazy_static", "log", @@ -6238,7 +6262,7 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ - "getrandom", + "getrandom 0.2.15", "serde", ] @@ -6335,24 +6359,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] -name = "wasm-bindgen" -version = "0.2.92" +name = "wasi" +version = "0.13.3+wasi-0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", + "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn 2.0.87", @@ -6373,9 +6407,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6383,9 +6417,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", @@ -6396,9 +6430,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "wasm-streams" @@ -6803,6 +6840,15 @@ dependencies = [ "url", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.9.0", +] + [[package]] name = "write16" version = "1.0.0" @@ -6858,6 +6904,15 @@ dependencies = [ "uuid", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yada" version = "0.5.1" @@ -6999,9 +7054,9 @@ dependencies = [ [[package]] name = "zip" -version = "2.2.2" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae9c1ea7b3a5e1f4b922ff856a129881167511563dc219869afe3787fc0c1a45" +checksum = "84e9a772a54b54236b9b744aaaf8d7be01b4d6e99725523cb82cb32d1c81b1d7" dependencies = [ "aes", "arbitrary", @@ -7012,15 +7067,16 @@ dependencies = [ "deflate64", "displaydoc", "flate2", + "getrandom 0.3.1", "hmac", "indexmap", "lzma-rs", "memchr", "pbkdf2", - "rand", "sha1", "thiserror 2.0.9", "time", + "xz2", "zeroize", "zopfli", "zstd", diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index e25fd9400..4cfc5c2ac 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -140,7 +140,7 @@ reqwest = { version = "0.12.12", features = [ sha-1 = { version = "0.10.1", optional = true } static-files = { version = "0.2.4", optional = true } tempfile = { version = "3.15.0", optional = true } -zip = { version = "2.2.2", optional = true } +zip = { version = "2.3.0", optional = true } [features] default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] From 71f7456748201cd2d48005794f63d769b78ff5b4 Mon Sep 17 00:00:00 2001 From: curquiza Date: Tue, 18 Mar 2025 12:48:38 +0100 Subject: [PATCH 10/37] Update mini-dashboard to v0.2.19 version --- crates/meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 4cfc5c2ac..428f13c10 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -170,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = 
"https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.18/build.zip" -sha1 = "b408a30dcb6e20cddb0c153c23385bcac4c8e912" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.19/build.zip" +sha1 = "7974430d5277c97f67cf6e95eec6faaac2788834" From f540a69ac3d3b954b53734492e7b8ef3158ddc4d Mon Sep 17 00:00:00 2001 From: Tee Jun hui Date: Wed, 5 Feb 2025 16:19:05 +0800 Subject: [PATCH 11/37] add 1 to index so it points to correct position --- crates/milli/src/search/new/bucket_sort.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index 8f1deb265..d0b7d258c 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -178,6 +178,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; back!(); + cur_ranking_rule_index += 1; continue; } } @@ -213,6 +214,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( continue; } + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket( ctx, logger, @@ -242,7 +244,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + back!(); + cur_ranking_rule_index += 1; continue; } } From 8c8cc59a6c1a53a80eb63ee0637a402be20449e9 Mon Sep 17 00:00:00 2001 From: Tee Jun hui Date: Wed, 5 Feb 2025 16:41:24 +0800 Subject: [PATCH 12/37] remove new line added by accident --- crates/milli/src/search/new/bucket_sort.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index d0b7d258c..172bdb3f9 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -214,7 
+214,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket( ctx, logger, @@ -244,7 +243,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( if current_score < ranking_score_threshold { all_candidates -= next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); cur_ranking_rule_index += 1; continue; From f9807ba32ef36fb3980299e07dec91df49b58bff Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 19 Mar 2025 11:33:44 +0100 Subject: [PATCH 13/37] Fix logic when results are below the threshold --- crates/milli/src/search/new/bucket_sort.rs | 41 +++++++++++----------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index 172bdb3f9..a659dd226 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -173,17 +173,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_scores.push(ScoreDetails::Skipped); // remove candidates from the universe without adding them to result if their score is below the threshold - if let Some(ranking_score_threshold) = ranking_score_threshold { - let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); - if current_score < ranking_score_threshold { - all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); - cur_ranking_rule_index += 1; - continue; - } - } + let is_below_threshold = + ranking_score_threshold.is_some_and(|ranking_score_threshold| { + let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); + current_score < ranking_score_threshold + }); - maybe_add_to_results!(bucket); + if is_below_threshold { + all_candidates -= &bucket; + all_candidates -= &ranking_rule_universes[cur_ranking_rule_index]; + } else { + maybe_add_to_results!(bucket); + } ranking_rule_scores.pop(); @@ 
-238,24 +239,24 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ); // remove candidates from the universe without adding them to result if their score is below the threshold - if let Some(ranking_score_threshold) = ranking_score_threshold { + let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| { let current_score = ScoreDetails::global_score(ranking_rule_scores.iter()); - if current_score < ranking_score_threshold { - all_candidates -= - next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; - back!(); - cur_ranking_rule_index += 1; - continue; - } - } + current_score < ranking_score_threshold + }); ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1) || cur_offset + (next_bucket.candidates.len() as usize) < from + || is_below_threshold { - maybe_add_to_results!(next_bucket.candidates); + if is_below_threshold { + all_candidates -= + next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + } else { + maybe_add_to_results!(next_bucket.candidates); + } ranking_rule_scores.pop(); continue; } From 2e6aa63efc251124ed3565278de7beef36d2e182 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 24 Mar 2025 14:32:21 +0100 Subject: [PATCH 14/37] Update Charabia v0.9.3 --- Cargo.lock | 90 ++++++++++++++++++++--------------------- crates/milli/Cargo.toml | 2 +- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 53ab34da6..65b85cbcc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -976,9 +976,9 @@ dependencies = [ [[package]] name = "charabia" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf8921fe4d53ab8f9e8f9b72ce6f91726cfc40fffab1243d27db406b5e2e9cc2" +checksum = "650d52f87a36472ea1c803dee49d6bfd23d426efa9363e2f4c4a0e6a236d3407" 
dependencies = [ "aho-corasick", "csv", @@ -3031,7 +3031,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", - "windows-targets 0.48.1", + "windows-targets 0.52.6", ] [[package]] @@ -3075,9 +3075,9 @@ dependencies = [ [[package]] name = "lindera" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cbc1aad631a7da0a7e9bc4b8669fa92ac9ca8eeb7b35a807376dd3034443ff" +checksum = "832c220475557e3b44a46cad1862b57f010f0c6e93d771d0e628e08689c068b1" dependencies = [ "lindera-analyzer", "lindera-core", @@ -3088,9 +3088,9 @@ dependencies = [ [[package]] name = "lindera-analyzer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74508ffbb24e36905d1718b261460e378a748029b07bcd7e06f0d18500b8194c" +checksum = "a8e26651714abf5167e6b6a80f5cdaa0cad41c5fcb84d8ba96bebafcb9029339" dependencies = [ "anyhow", "bincode", @@ -3118,9 +3118,9 @@ dependencies = [ [[package]] name = "lindera-assets" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a677c371ecb3bd02b751be306ea09876cd47cf426303ad5f10a3fd6f9a4ded6" +checksum = "ebb01f1ca53c1e642234c6c7fdb9ac664ad0c1ab9502f33e4200201bac7e6ce7" dependencies = [ "encoding", "flate2", @@ -3131,9 +3131,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35944000d05a177e981f037b5f0805f283b32f05a0c35713003bef136ca8cb4" +checksum = "5f7618d9aa947fdd7c38eae2b79f0fd237ecb5067608f1363610ba20d20ab5a8" dependencies = [ "bincode", "byteorder", @@ -3145,9 +3145,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict-builder" -version = "0.32.2" +version = "0.32.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b8f642bc9c9130682569975772a17336c6aab26d11fc0f823f3e663167ace6" +checksum = "efdbcb809d81428935d601a78c94bfb39500749213f7320705f427a7a1d31aec" dependencies = [ "anyhow", "lindera-core", @@ -3157,9 +3157,9 @@ dependencies = [ [[package]] name = "lindera-compress" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7825d8d63592aa5727d67bd209170ac82df56c369533efbf0ddbac277bb68ec" +checksum = "eac178afa2456dac469d3b1a2d7fbaf3e1ea796a1f52321e8ac29545a53c239c" dependencies = [ "anyhow", "flate2", @@ -3168,9 +3168,9 @@ dependencies = [ [[package]] name = "lindera-core" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c28191456debc98af6aa5f7db77872471983e9fa2a737b1c232b6ef543aed62" +checksum = "649777465f48147ce593ab6db347e235e3af8f693a23f4437be94a1cdbdf5fdf" dependencies = [ "anyhow", "bincode", @@ -3185,9 +3185,9 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4788a1ead2f63f3fc2888109272921dedd86a87b7d0bf05e9daab46600daac51" +checksum = "9e3faaceb85e43ac250021866c6db3cdc9997b44b3d3ea498594d04edc91fc45" dependencies = [ "anyhow", "flate2", @@ -3196,9 +3196,9 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdf5f91725e32b9a21b1656baa7030766c9bafc4de4b4ddeb8ffdde7224dd2f6" +checksum = "31e15b2d2d8a4ad45f2e373a084931cf3dfbde15f124044e2436bb920af3366c" dependencies = [ "anyhow", "bincode", @@ -3221,9 +3221,9 @@ dependencies = [ [[package]] name = "lindera-dictionary-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e41f00ba7ac541b0ffd8c30e7a73f2dd197546cc5780462ec4f2e4782945a780" +checksum = "59802949110545b59b663917ed3fd55dc3b3a8cde6bd20137d7fe24372cfb9aa" dependencies = [ "anyhow", "bincode", @@ -3243,9 +3243,9 @@ dependencies = [ [[package]] name = "lindera-filter" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "273d27e01e1377e2647314a4a5b9bdca4b52a867b319069ebae8c10191146eca" +checksum = "1320f118c3fc9e897f4ebfc16864e5ef8c0b06ba769c0a50e53f193f9d682bf8" dependencies = [ "anyhow", "csv", @@ -3268,9 +3268,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97a52ff0af5acb700093badaf7078051ab9ffd9071859724445a60193995f1f" +checksum = "5b4731bf3730f1f38266d7ee9bca7d460cd336645c9dfd4e6a1082e58ab1e993" dependencies = [ "bincode", "byteorder", @@ -3282,9 +3282,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf5031c52686128db13f774b2c5a8abfd52b4cc1f904041d8411aa19d630ce4d" +checksum = "309966c12e682f67205c3cd3c8dc55bbdcd1eb3b5c7c5cb41fb8acd18906d340" dependencies = [ "anyhow", "lindera-core", @@ -3294,9 +3294,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b36764b27b169aa11d24888141f206a6c246a5b195c1e67127485bac512fb6" +checksum = "e90e919b4cfb9962d24ee1e1d50a7c163bbf356376495ad66d1996e20b9f9e44" dependencies = [ "bincode", "byteorder", @@ -3308,9 +3308,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf36e40ace904741efdd883ed5c4dba6425f65156a0fb5d3f73a386335950dc" +checksum = 
"7e517df0d501f9f8bf3126da20fc8cb9a5e37921e0eec1824d7a62f096463e02" dependencies = [ "anyhow", "lindera-core", @@ -3320,9 +3320,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c92a1a3564b531953f0238cbcea392f2905f7b27b449978cf9e702a80e1086d" +checksum = "e9c6da4e68bc8b452a54b96d65361ebdceb4b6f36ecf262425c0e1f77960ae82" dependencies = [ "bincode", "byteorder", @@ -3335,9 +3335,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f2c60425abc1548570c2568858f74a1f042105ecd89faa39c651b4315350fd9" +checksum = "afc95884cc8f6dfb176caf5991043a4acf94c359215bbd039ea765e00454f271" dependencies = [ "anyhow", "lindera-core", @@ -3347,9 +3347,9 @@ dependencies = [ [[package]] name = "lindera-tokenizer" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "903e558981bcb6f59870aa7d6b4bcb09e8f7db778886a6a70f67fd74c9fa2ca3" +checksum = "d122042e1232a55c3604692445952a134e523822e9b4b9ab32a53ff890037ad4" dependencies = [ "bincode", "lindera-core", @@ -3361,9 +3361,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d227c3ce9cbd905f865c46c65a0470fd04e89b71104d7f92baa71a212ffe1d4b" +checksum = "cbffae1fb2f2614abdcb50f99b138476dbac19862ffa57bfdc9c7b5d5b22a90c" dependencies = [ "bincode", "byteorder", @@ -3376,9 +3376,9 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.32.2" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e2c50015c242e02c451acb6748667ac6fd1d3d667cd7db48cd89e2f2d2377e" +checksum = "fe50055327712ebd1bcc74b657cf78c728a78b9586e3f99d5dd0b6a0be221c5d" 
dependencies = [ "anyhow", "lindera-core", @@ -6118,9 +6118,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index e3b9b077a..a2a020587 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -18,7 +18,7 @@ bincode = "1.3.3" bstr = "1.11.3" bytemuck = { version = "1.21.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" -charabia = { version = "0.9.2", default-features = false } +charabia = { version = "0.9.3", default-features = false } concat-arrays = "0.1.2" convert_case = "0.6.0" crossbeam-channel = "0.5.14" From a09d08c7b6ed4fc3dd114d8306a1035c2cc0c0e5 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 10 Mar 2025 14:51:23 +0100 Subject: [PATCH 15/37] Avoid reindexing searchable order changes Update settings.rs Update settings.rs --- crates/milli/src/update/settings.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 325a9f15c..9cab74444 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1331,8 +1331,21 @@ impl InnerIndexSettingsDiff { let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes - != new_settings.user_defined_searchable_attributes; + // Check if any searchable field has been added or removed from the list, + // Changing the order should not be considered as a change for reindexing. 
+ let cache_user_defined_searchables = match ( + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, + ) { + (Some(old), Some(new)) => { + let old: BTreeSet<_> = old.iter().collect(); + let new: BTreeSet<_> = new.iter().collect(); + + old != new + } + (None, None) => false, + _otherwise => true, + }; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { From c0fe70c5f081605baf7f904d6469030aa2312f2a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 12:29:08 +0100 Subject: [PATCH 16/37] Make the CI work with merge queue grouping --- .github/workflows/test-suite.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index feb95d8ad..8904b6c75 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -6,6 +6,7 @@ on: # Everyday at 5:00am - cron: "0 5 * * *" pull_request: + merge_group: push: # trying and staging branches are for Bors config branches: From de6c7e551e04fe0cec72685dffc9a7da464b3122 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 15:57:05 +0100 Subject: [PATCH 17/37] Remove bors references from the repository --- .github/workflows/test-suite.yml | 5 ----- CONTRIBUTING.md | 5 ++--- README.md | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 8904b6c75..a13d51086 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -7,11 +7,6 @@ on: - cron: "0 5 * * *" pull_request: merge_group: - push: - # trying and staging branches are for Bors config - branches: - - trying - - staging env: CARGO_TERM_COLOR: always diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 26d5b74b4..e129e5600 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -150,7 +150,7 @@ Some notes on GitHub PRs: - The PR title should be 
accurate and descriptive of the changes. - [Convert your PR as a draft](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request) if your changes are a work in progress: no one will review it until you pass your PR as ready for review.
The draft PRs are recommended when you want to show that you are working on something and make your work visible. -- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [Bors](https://github.com/bors-ng/bors-ng) to automatically enforce this requirement without the PR author having to rebase manually. +- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [GitHub Merge Queues](https://github.blog/news-insights/product-news/github-merge-queue-is-generally-available/) to automatically enforce this requirement without the PR author having to rebase manually. ## Release Process (for internal team only) @@ -158,8 +158,7 @@ Meilisearch tools follow the [Semantic Versioning Convention](https://semver.org ### Automation to rebase and Merge the PRs -This project integrates a bot that helps us manage pull requests merging.
-_[Read more about this](https://github.com/meilisearch/integration-guides/blob/main/resources/bors.md)._ +This project uses GitHub Merge Queues that helps us manage pull requests merging. ### How to Publish a new Release diff --git a/README.md b/README.md index 42062781a..508efb14b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@

Dependency status License - Bors enabled + Merge Queues enabled

⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍

From 1ad4235beb12d5093ba2440968694686a26270b9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 20 Mar 2025 16:28:08 +0100 Subject: [PATCH 18/37] Remove the bors file --- bors.toml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 bors.toml diff --git a/bors.toml b/bors.toml deleted file mode 100644 index 3d04b834c..000000000 --- a/bors.toml +++ /dev/null @@ -1,10 +0,0 @@ -status = [ - 'Tests on ubuntu-22.04', - 'Tests on macos-13', - 'Tests on windows-2022', - 'Run Clippy', - 'Run Rustfmt', - 'Run tests in debug', -] -# 3 hours timeout -timeout-sec = 10800 From 182e5d56321509fc672a664feb5690c907f98459 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 11:12:25 +0100 Subject: [PATCH 19/37] Add database sizes stats to the batches --- crates/dump/src/lib.rs | 1 + crates/meilisearch-types/src/batches.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 4e2d6ac2f..ee63f7048 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -326,6 +326,7 @@ pub(crate) mod test { index_uids: maplit::btreemap! 
{ "doggo".to_string() => 1 }, progress_trace: Default::default(), write_channel_congestion: None, + internal_database_sizes: Default::default(), }, enqueued_at: Some(BatchEnqueuedAt { earliest: datetime!(2022-11-11 0:00 UTC), diff --git a/crates/meilisearch-types/src/batches.rs b/crates/meilisearch-types/src/batches.rs index 904682585..c7b9d6cfa 100644 --- a/crates/meilisearch-types/src/batches.rs +++ b/crates/meilisearch-types/src/batches.rs @@ -64,4 +64,6 @@ pub struct BatchStats { pub progress_trace: serde_json::Map, #[serde(default, skip_serializing_if = "Option::is_none")] pub write_channel_congestion: Option>, + #[serde(default, skip_serializing_if = "serde_json::Map::is_empty")] + pub internal_database_sizes: serde_json::Map, } From fd079c6757c619d67dbcfea8fe956bf2190241b7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 11:40:20 +0100 Subject: [PATCH 20/37] Add an index method to get the database sizes --- crates/milli/src/index.rs | 105 +++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index fcb8962d2..e0c124859 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fs::File; use std::path::Path; -use heed::{types::*, WithoutTls}; +use heed::{types::*, DatabaseStat, WithoutTls}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; @@ -1768,6 +1768,109 @@ impl Index { Ok(self.word_docids.remap_data_type::().get(rtxn, word)?.is_some() || self.exact_word_docids.remap_data_type::().get(rtxn, word)?.is_some()) } + + /// Returns the sizes in bytes of each of the index database at the given rtxn. 
+ pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> Result> { + let Self { + env: _, + main, + external_documents_ids, + word_docids, + exact_word_docids, + word_prefix_docids, + exact_word_prefix_docids, + word_pair_proximity_docids, + word_position_docids, + word_fid_docids, + word_prefix_position_docids, + word_prefix_fid_docids, + field_id_word_count_docids, + facet_id_f64_docids, + facet_id_string_docids, + facet_id_normalized_string_strings, + facet_id_string_fst, + facet_id_exists_docids, + facet_id_is_null_docids, + facet_id_is_empty_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, + vector_arroy, + embedder_category_id, + documents, + } = self; + + fn compute_size(stats: DatabaseStat) -> usize { + let DatabaseStat { + page_size, + depth: _, + branch_pages, + leaf_pages, + overflow_pages, + entries: _, + } = stats; + + (branch_pages + leaf_pages + overflow_pages) * page_size as usize + } + + let mut sizes = HashMap::new(); + sizes.insert("main", main.stat(rtxn).map(compute_size)?); + sizes + .insert("external_documents_ids", external_documents_ids.stat(rtxn).map(compute_size)?); + sizes.insert("word_docids", word_docids.stat(rtxn).map(compute_size)?); + sizes.insert("exact_word_docids", exact_word_docids.stat(rtxn).map(compute_size)?); + sizes.insert("word_prefix_docids", word_prefix_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "exact_word_prefix_docids", + exact_word_prefix_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "word_pair_proximity_docids", + word_pair_proximity_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert("word_position_docids", word_position_docids.stat(rtxn).map(compute_size)?); + sizes.insert("word_fid_docids", word_fid_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "word_prefix_position_docids", + word_prefix_position_docids.stat(rtxn).map(compute_size)?, + ); + sizes + .insert("word_prefix_fid_docids", word_prefix_fid_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + 
"field_id_word_count_docids", + field_id_word_count_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert("facet_id_f64_docids", facet_id_f64_docids.stat(rtxn).map(compute_size)?); + sizes + .insert("facet_id_string_docids", facet_id_string_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "facet_id_normalized_string_strings", + facet_id_normalized_string_strings.stat(rtxn).map(compute_size)?, + ); + sizes.insert("facet_id_string_fst", facet_id_string_fst.stat(rtxn).map(compute_size)?); + sizes + .insert("facet_id_exists_docids", facet_id_exists_docids.stat(rtxn).map(compute_size)?); + sizes.insert( + "facet_id_is_null_docids", + facet_id_is_null_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "facet_id_is_empty_docids", + facet_id_is_empty_docids.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "field_id_docid_facet_f64s", + field_id_docid_facet_f64s.stat(rtxn).map(compute_size)?, + ); + sizes.insert( + "field_id_docid_facet_strings", + field_id_docid_facet_strings.stat(rtxn).map(compute_size)?, + ); + sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?); + sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?); + sizes.insert("documents", documents.stat(rtxn).map(compute_size)?); + + Ok(sizes) + } } #[derive(Debug, Deserialize, Serialize)] From 637bea0370af5ab727c750eb9ab3445797322615 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 16:52:00 +0100 Subject: [PATCH 21/37] Compute and store the database sizes --- Cargo.lock | 2 + crates/index-scheduler/Cargo.toml | 2 + crates/index-scheduler/src/scheduler/mod.rs | 34 ++++++++++-- .../src/scheduler/process_batch.rs | 52 ++++++++++++++----- crates/meilisearch/Cargo.toml | 6 +-- crates/milli/src/index.rs | 5 +- 6 files changed, 77 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 65b85cbcc..96cfcf76c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2745,6 +2745,7 @@ dependencies = [ "bincode", 
"bumpalo", "bumparaw-collections", + "byte-unit", "convert_case 0.6.0", "crossbeam-channel", "csv", @@ -2753,6 +2754,7 @@ dependencies = [ "enum-iterator", "file-store", "flate2", + "indexmap", "insta", "maplit", "meili-snap", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index 37b3ea835..31ff5f7d0 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -13,6 +13,7 @@ license.workspace = true [dependencies] anyhow = "1.0.95" bincode = "1.3.3" +byte-unit = "5.1.6" bumpalo = "3.16.0" bumparaw-collections = "0.1.4" convert_case = "0.6.0" @@ -22,6 +23,7 @@ dump = { path = "../dump" } enum-iterator = "2.1.0" file-store = { path = "../file-store" } flate2 = "1.0.35" +indexmap = "2.7.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } memmap2 = "0.9.5" diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index 1cbfece34..fe3084034 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -24,6 +24,7 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::milli; use meilisearch_types::tasks::Status; +use process_batch::ProcessBatchInfo; use rayon::current_num_threads; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use roaring::RoaringBitmap; @@ -223,16 +224,16 @@ impl IndexScheduler { let mut stop_scheduler_forever = false; let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?; let mut canceled = RoaringBitmap::new(); - let mut congestion = None; + let mut process_batch_info = ProcessBatchInfo::default(); match res { - Ok((tasks, cong)) => { + Ok((tasks, info)) => { #[cfg(test)] self.breakpoint(crate::test_utils::Breakpoint::ProcessBatchSucceeded); let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32); 
progress.update_progress(task_progress_obj); - congestion = cong; + process_batch_info = info; let mut success = 0; let mut failure = 0; let mut canceled_by = None; @@ -350,6 +351,9 @@ impl IndexScheduler { // We must re-add the canceled task so they're part of the same batch. ids |= canceled; + let ProcessBatchInfo { congestion, pre_commit_dabases_sizes, post_commit_dabases_sizes } = + process_batch_info; + processing_batch.stats.progress_trace = progress.accumulated_durations().into_iter().map(|(k, v)| (k, v.into())).collect(); processing_batch.stats.write_channel_congestion = congestion.map(|congestion| { @@ -359,6 +363,30 @@ impl IndexScheduler { congestion_info.insert("blocking_ratio".into(), congestion.congestion_ratio().into()); congestion_info }); + processing_batch.stats.internal_database_sizes = pre_commit_dabases_sizes + .iter() + .flat_map(|(dbname, pre_size)| { + post_commit_dabases_sizes + .get(dbname) + .map(|post_size| { + use byte_unit::{Byte, UnitType::Binary}; + use std::cmp::Ordering::{Equal, Greater, Less}; + + let post = Byte::from_u64(*post_size as u64).get_appropriate_unit(Binary); + let diff_size = post_size.abs_diff(*pre_size) as u64; + let diff = Byte::from_u64(diff_size).get_appropriate_unit(Binary); + let sign = match post_size.cmp(pre_size) { + Equal => return None, + Greater => "+", + Less => "-", + }; + + Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into())) + }) + .into_iter() + .flatten() + }) + .collect(); if let Some(congestion) = congestion { tracing::debug!( diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 8f3987bf6..996b548c2 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -22,6 +22,16 @@ use crate::utils::{ }; use crate::{Error, IndexScheduler, Result, TaskId}; +#[derive(Debug, Default)] +pub struct ProcessBatchInfo { + /// The write channel 
congestion. None when unavailable: settings update. + pub congestion: Option, + /// The sizes of the different databases before starting the indexation. + pub pre_commit_dabases_sizes: indexmap::IndexMap<&'static str, usize>, + /// The sizes of the different databases after committing the indexation. + pub post_commit_dabases_sizes: indexmap::IndexMap<&'static str, usize>, + } + impl IndexScheduler { /// Apply the operation associated with the given batch. /// @@ -35,7 +45,7 @@ impl IndexScheduler { batch: Batch, current_batch: &mut ProcessingBatch, progress: Progress, - ) -> Result<(Vec, Option)> { + ) -> Result<(Vec, ProcessBatchInfo)> { #[cfg(test)] { self.maybe_fail(crate::test_utils::FailureLocation::InsideProcessBatch)?; @@ -76,7 +86,7 @@ impl IndexScheduler { canceled_tasks.push(task); - Ok((canceled_tasks, None)) + Ok((canceled_tasks, ProcessBatchInfo::default())) } Batch::TaskDeletions(mut tasks) => { // 1. Retrieve the tasks that matched the query at enqueue-time. @@ -115,14 +125,14 @@ impl IndexScheduler { _ => unreachable!(), } } - Ok((tasks, None)) - } - Batch::SnapshotCreation(tasks) => { - self.process_snapshot(progress, tasks).map(|tasks| (tasks, None)) - } - Batch::Dump(task) => { - self.process_dump_creation(progress, task).map(|tasks| (tasks, None)) + Ok((tasks, ProcessBatchInfo::default())) } + Batch::SnapshotCreation(tasks) => self + .process_snapshot(progress, tasks) + .map(|tasks| (tasks, ProcessBatchInfo::default())), + Batch::Dump(task) => self + .process_dump_creation(progress, task) + .map(|tasks| (tasks, ProcessBatchInfo::default())), Batch::IndexOperation { op, must_create_index } => { let index_uid = op.index_uid().to_string(); let index = if must_create_index { @@ -139,6 +149,7 @@ impl IndexScheduler { .set_currently_updating_index(Some((index_uid.clone(), index.clone()))); let mut index_wtxn = index.write_txn()?; + let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = 
self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; @@ -153,12 +164,14 @@ impl IndexScheduler { // stats of the index. Since the tasks have already been processed and // this is a non-critical operation. If it fails, we should not fail // the entire batch. + let mut post_commit_dabases_sizes = None; let res = || -> Result<()> { let index_rtxn = index.read_txn()?; let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; let mut wtxn = self.env.write_txn()?; self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?; + post_commit_dabases_sizes = Some(index.database_sizes(&index_rtxn)?); wtxn.commit()?; Ok(()) }(); @@ -171,7 +184,16 @@ impl IndexScheduler { ), } - Ok((tasks, congestion)) + let info = ProcessBatchInfo { + congestion, + // In case we fail to get the post-commit sizes we decide + // that nothing changed and use the pre-commit sizes. + post_commit_dabases_sizes: post_commit_dabases_sizes + .unwrap_or_else(|| pre_commit_dabases_sizes.clone()), + pre_commit_dabases_sizes, + }; + + Ok((tasks, info)) } Batch::IndexCreation { index_uid, primary_key, task } => { progress.update_progress(CreateIndexProgress::CreatingTheIndex); @@ -239,7 +261,7 @@ impl IndexScheduler { ), } - Ok((vec![task], None)) + Ok((vec![task], ProcessBatchInfo::default())) } Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => { progress.update_progress(DeleteIndexProgress::DeletingTheIndex); @@ -273,7 +295,9 @@ impl IndexScheduler { }; } - Ok((tasks, None)) + // Here we could also show that all the internal database sizes go to 0 + // but it would mean opening the index and that's costly. 
+ Ok((tasks, ProcessBatchInfo::default())) } Batch::IndexSwap { mut task } => { progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap); @@ -321,7 +345,7 @@ impl IndexScheduler { } wtxn.commit()?; task.status = Status::Succeeded; - Ok((vec![task], None)) + Ok((vec![task], ProcessBatchInfo::default())) } Batch::UpgradeDatabase { mut tasks } => { let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { @@ -351,7 +375,7 @@ impl IndexScheduler { task.error = None; } - Ok((tasks, None)) + Ok((tasks, ProcessBatchInfo::default())) } } } diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 428f13c10..6360cdbde 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -30,11 +30,7 @@ actix-web = { version = "4.9.0", default-features = false, features = [ anyhow = { version = "1.0.95", features = ["backtrace"] } async-trait = "0.1.85" bstr = "1.11.3" -byte-unit = { version = "5.1.6", default-features = false, features = [ - "std", - "byte", - "serde", -] } +byte-unit = { version = "5.1.6", features = ["serde"] } bytes = "1.9.0" clap = { version = "4.5.24", features = ["derive", "env"] } crossbeam-channel = "0.5.14" diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index e0c124859..a2d839d03 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -5,6 +5,7 @@ use std::path::Path; use heed::{types::*, DatabaseStat, WithoutTls}; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; +use indexmap::IndexMap; use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; @@ -1770,7 +1771,7 @@ impl Index { } /// Returns the sizes in bytes of each of the index database at the given rtxn. 
- pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> Result> { + pub fn database_sizes(&self, rtxn: &RoTxn<'_>) -> heed::Result> { let Self { env: _, main, @@ -1812,7 +1813,7 @@ impl Index { (branch_pages + leaf_pages + overflow_pages) * page_size as usize } - let mut sizes = HashMap::new(); + let mut sizes = IndexMap::new(); sizes.insert("main", main.stat(rtxn).map(compute_size)?); sizes .insert("external_documents_ids", external_documents_ids.stat(rtxn).map(compute_size)?); From 5820d822c8f18846c45f45bc4787a33400a623a3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 16:51:18 +0100 Subject: [PATCH 22/37] Add more details about the finalizing progress step --- crates/index-scheduler/src/insta_snapshot.rs | 1 + crates/index-scheduler/src/processing.rs | 7 + .../src/scheduler/process_batch.rs | 6 +- .../src/scheduler/process_index_operation.rs | 10 +- crates/meilisearch/tests/batches/mod.rs | 126 ++++++++++++------ crates/meilisearch/tests/dumps/mod.rs | 1 + .../batches.snap | 1 + ...rEnqueuedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...rFinishedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...erStartedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...ue_once_everything_has_been_processed.snap | 3 +- .../tests/upgrade/v1_12/v1_12_0.rs | 22 +-- 12 files changed, 122 insertions(+), 64 deletions(-) diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bcc295afd..949edf369 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -344,6 +344,7 @@ pub fn snapshot_batch(batch: &Batch) -> String { let Batch { uid, details, stats, started_at, finished_at, progress: _, enqueued_at } = batch; let stats = BatchStats { progress_trace: Default::default(), + internal_database_sizes: Default::default(), write_channel_congestion: None, ..stats.clone() }; diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 
fed26aeb7..09ce46884 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -64,6 +64,13 @@ make_enum_progress! { } } +make_enum_progress! { + pub enum FinalizingIndexStep { + Committing, + ComputingStats, + } +} + make_enum_progress! { pub enum TaskCancelationProgress { RetrievingTasks, diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 996b548c2..42de1d137 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -12,7 +12,7 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, UpdateIndexProgress, }; @@ -151,9 +151,10 @@ impl IndexScheduler { let mut index_wtxn = index.write_txn()?; let pre_commit_dabases_sizes = index.database_sizes(&index_wtxn)?; let (tasks, congestion) = - self.apply_index_operation(&mut index_wtxn, &index, op, progress)?; + self.apply_index_operation(&mut index_wtxn, &index, op, &progress)?; { + progress.update_progress(FinalizingIndexStep::Committing); let span = tracing::trace_span!(target: "indexing::scheduler", "commit"); let _entered = span.enter(); @@ -166,6 +167,7 @@ impl IndexScheduler { // the entire batch. 
let mut post_commit_dabases_sizes = None; let res = || -> Result<()> { + progress.update_progress(FinalizingIndexStep::ComputingStats); let index_rtxn = index.read_txn()?; let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn) .map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?; diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 690fe2efd..9b12d61cf 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -32,7 +32,7 @@ impl IndexScheduler { index_wtxn: &mut RwTxn<'i>, index: &'i Index, operation: IndexOperation, - progress: Progress, + progress: &Progress, ) -> Result<(Vec, Option)> { let indexer_alloc = Bump::new(); let started_processing_at = std::time::Instant::now(); @@ -186,7 +186,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?, ); @@ -307,7 +307,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, ); @@ -465,7 +465,7 @@ impl IndexScheduler { &document_changes, embedders, &|| must_stop_processing.get(), - &progress, + progress, ) .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, ); @@ -520,7 +520,7 @@ impl IndexScheduler { index_uid: index_uid.clone(), tasks: cleared_tasks, }, - progress.clone(), + progress, )?; let (settings_tasks, _congestion) = self.apply_index_operation( diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index 468963631..e955c6883 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -281,7 +281,8 @@ async fn test_summarized_document_addition_or_update() { 
".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -303,7 +304,8 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]" + "writeChannelCongestion": "[writeChannelCongestion]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -322,7 +324,8 @@ async fn test_summarized_document_addition_or_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -344,7 +347,8 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]" + "writeChannelCongestion": "[writeChannelCongestion]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -367,7 +371,8 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -388,7 +393,8 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", 
"startedAt": "[date]", @@ -407,7 +413,8 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -428,7 +435,8 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -453,7 +461,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -475,7 +484,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -495,7 +505,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -517,7 +528,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": 
"[date]", @@ -537,7 +549,8 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -559,7 +572,8 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -583,7 +597,8 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -604,7 +619,8 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -623,7 +639,8 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -644,7 +661,8 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -679,7 +697,8 @@ async fn 
test_summarized_settings_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -709,7 +728,8 @@ async fn test_summarized_settings_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -732,7 +752,8 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -750,7 +771,8 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -768,7 +790,8 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -788,7 +811,8 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -921,7 +945,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", 
".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -939,7 +964,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -957,7 +983,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -977,7 +1004,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -998,7 +1026,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r#" { @@ -1016,7 +1045,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1034,7 +1064,8 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1054,7 +1085,8 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1080,7 +1112,8 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1105,7 +1138,8 @@ async fn test_summarized_index_swap() { "indexSwap": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1129,7 +1163,8 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1147,7 +1182,8 @@ async fn test_summarized_index_swap() { "indexUids": { "doggos": 1 }, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1173,7 +1209,8 @@ async fn test_summarized_batch_cancelation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1193,7 +1230,8 @@ async fn test_summarized_batch_cancelation() { "taskCancelation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1219,7 +1257,8 @@ async fn test_summarized_batch_deletion() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1239,7 +1278,8 @@ async fn test_summarized_batch_deletion() { "taskDeletion": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1262,7 +1302,8 @@ async fn test_summarized_dump_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" }, @r###" { @@ -1280,7 +1321,8 @@ async fn test_summarized_dump_creation() { "dumpCreation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index ff0b027cb..fa05d9ec9 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2236,6 +2236,7 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { 
".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", + ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]", }), name: "batches"); diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap index b38340ef6..b2dea1f06 100644 --- a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -22,6 +22,7 @@ source: crates/meilisearch/tests/dumps/mod.rs "kefir": 1 }, "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]", "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", 
"startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index 99caeaf96..b79f55351 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index 623c1f778..3cfed9f74 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -19,7 +19,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]" + "progressTrace": "[progressTrace]", + "internalDatabaseSizes": "[internalDatabaseSizes]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 11ba2882a..8157f0923 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -166,7 +166,7 @@ async fn check_the_index_scheduler(server: &Server) { let (tasks, _) = server.tasks_filter("limit=1000").await; snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "the_whole_task_queue_once_everything_has_been_processed"); let (batches, _) = server.batches_filter("limit=1000").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: 
"the_whole_batch_queue_once_everything_has_been_processed"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); // Tests all the tasks query parameters let (tasks, _) = server.tasks_filter("uids=10").await; @@ -193,26 +193,26 @@ async fn check_the_index_scheduler(server: &Server) { // Tests all the batches query parameters let (batches, _) = server.batches_filter("uids=10").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_uids_equal_10"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_uids_equal_10"); let (batches, _) = server.batches_filter("batchUids=10").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: 
"batches_filter_batchUids_equal_10"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_batchUids_equal_10"); let (batches, _) = server.batches_filter("statuses=canceled").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_statuses_equal_canceled"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_statuses_equal_canceled"); // types has already been tested above to retrieve the upgrade database let (batches, _) = server.batches_filter("canceledBy=19").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_canceledBy_equal_19"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", 
".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_canceledBy_equal_19"); let (batches, _) = server.batches_filter("beforeEnqueuedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeEnqueuedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterEnqueuedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", 
".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("beforeStartedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeStartedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterStartedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterStartedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => 
"[writeChannelCongestion]" }), name: "batches_filter_afterStartedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("beforeFinishedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_beforeFinishedAt_equal_2025-01-16T16_47_41"); let (batches, _) = server.batches_filter("afterFinishedAt=2025-01-16T16:47:41Z").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41"); let (stats, _) = 
server.stats().await; assert_json_snapshot!(stats, { From 3deb1ef78ff77d59fbb57bd6ba7eaf4c5e4fe0cb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 25 Mar 2025 18:53:32 +0100 Subject: [PATCH 23/37] Fix the snapshots again --- crates/meilisearch/tests/batches/mod.rs | 102 ++++++------------ crates/meilisearch/tests/dumps/mod.rs | 2 +- .../batches.snap | 1 - ...rEnqueuedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...rFinishedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...erStartedAt_equal_2025-01-16T16_47_41.snap | 3 +- ...ue_once_everything_has_been_processed.snap | 3 +- .../tests/upgrade/v1_12/v1_12_0.rs | 2 +- 8 files changed, 40 insertions(+), 79 deletions(-) diff --git a/crates/meilisearch/tests/batches/mod.rs b/crates/meilisearch/tests/batches/mod.rs index e955c6883..6c2aa4aaf 100644 --- a/crates/meilisearch/tests/batches/mod.rs +++ b/crates/meilisearch/tests/batches/mod.rs @@ -347,8 +347,7 @@ async fn test_summarized_document_addition_or_update() { "test": 1 }, "progressTrace": "[progressTrace]", - "writeChannelCongestion": "[writeChannelCongestion]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[duration]", "startedAt": "[date]", @@ -371,8 +370,7 @@ async fn test_summarized_delete_documents_by_batch() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -393,8 +391,7 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -435,8 +432,7 @@ async fn test_summarized_delete_documents_by_batch() { "indexUids": { "test": 1 }, - 
"progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -461,8 +457,7 @@ async fn test_summarized_delete_documents_by_filter() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -484,8 +479,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -528,8 +522,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -572,8 +565,7 @@ async fn test_summarized_delete_documents_by_filter() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -597,8 +589,7 @@ async fn test_summarized_delete_document_by_id() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r#" { @@ -619,8 +610,7 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": 
"[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -661,8 +651,7 @@ async fn test_summarized_delete_document_by_id() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -728,8 +717,7 @@ async fn test_summarized_settings_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -752,8 +740,7 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -771,8 +758,7 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -790,8 +776,7 @@ async fn test_summarized_index_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -811,8 +796,7 @@ async fn test_summarized_index_creation() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -945,8 +929,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" 
=> "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -964,8 +947,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -983,8 +965,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1004,8 +985,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1026,8 +1006,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r#" { @@ -1045,8 +1024,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1064,8 +1042,7 @@ async fn test_summarized_index_update() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => 
"[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1085,8 +1062,7 @@ async fn test_summarized_index_update() { "indexUids": { "test": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1112,8 +1088,7 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1138,8 +1113,7 @@ async fn test_summarized_index_swap() { "indexSwap": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1163,8 +1137,7 @@ async fn test_summarized_index_swap() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1182,8 +1155,7 @@ async fn test_summarized_index_swap() { "indexUids": { "doggos": 1 }, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1209,8 +1181,7 @@ async fn test_summarized_batch_cancelation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => 
"[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1230,8 +1201,7 @@ async fn test_summarized_batch_cancelation() { "taskCancelation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1257,8 +1227,7 @@ async fn test_summarized_batch_deletion() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1278,8 +1247,7 @@ async fn test_summarized_batch_deletion() { "taskDeletion": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", @@ -1302,8 +1270,7 @@ async fn test_summarized_dump_creation() { ".startedAt" => "[date]", ".finishedAt" => "[date]", ".stats.progressTrace" => "[progressTrace]", - ".stats.writeChannelCongestion" => "[writeChannelCongestion]", - ".stats.internalDatabaseSizes" => "[internalDatabaseSizes]" + ".stats.writeChannelCongestion" => "[writeChannelCongestion]" }, @r###" { @@ -1321,8 +1288,7 @@ async fn test_summarized_dump_creation() { "dumpCreation": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index fa05d9ec9..addcbeeb5 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2236,8 +2236,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { 
".results[0].finishedAt" => "[date]", ".results[0].duration" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", - ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]", + ".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", }), name: "batches"); let (indexes, code) = server.list_indexes(None, None).await; diff --git a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap index b2dea1f06..b38340ef6 100644 --- a/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap +++ b/crates/meilisearch/tests/dumps/snapshots/mod.rs/import_dump_v6_containing_batches_and_enqueued_tasks/batches.snap @@ -22,7 +22,6 @@ source: crates/meilisearch/tests/dumps/mod.rs "kefir": 1 }, "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]", "writeChannelCongestion": "[writeChannelCongestion]" }, "duration": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": 
"[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index b79f55351..99caeaf96 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": 
"[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index 3cfed9f74..623c1f778 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -19,8 +19,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "upgradeDatabase": 1 }, "indexUids": {}, - "progressTrace": "[progressTrace]", - "internalDatabaseSizes": "[internalDatabaseSizes]" + "progressTrace": "[progressTrace]" }, "duration": "[duration]", "startedAt": "[date]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 8157f0923..9fc4d0e5b 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -166,7 +166,7 @@ async fn check_the_index_scheduler(server: &Server) { let (tasks, _) = server.tasks_filter("limit=1000").await; snapshot!(json_string!(tasks, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]" }), name: "the_whole_task_queue_once_everything_has_been_processed"); let (batches, _) = server.batches_filter("limit=1000").await; - snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", 
".results[0].stats.internalDatabaseSizes" => "[internalDatabaseSizes]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); + snapshot!(json_string!(batches, { ".results[0].duration" => "[duration]", ".results[0].enqueuedAt" => "[date]", ".results[0].startedAt" => "[date]", ".results[0].finishedAt" => "[date]", ".results[0].stats.progressTrace" => "[progressTrace]", ".results[0].stats.writeChannelCongestion" => "[writeChannelCongestion]" }), name: "the_whole_batch_queue_once_everything_has_been_processed"); // Tests all the tasks query parameters let (tasks, _) = server.tasks_filter("uids=10").await; From 7ed9adde295840170b6cff351c335c30e1e9e1ab Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 26 Mar 2025 16:45:52 +0100 Subject: [PATCH 24/37] Prefer camelCase for internal database sizes db name --- crates/index-scheduler/src/scheduler/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index fe3084034..f0e324a8d 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -20,6 +20,7 @@ use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; use std::sync::Arc; +use convert_case::{Case, Casing as _}; use meilisearch_types::error::ResponseError; use meilisearch_types::heed::{Env, WithoutTls}; use meilisearch_types::milli; @@ -381,7 +382,10 @@ impl IndexScheduler { Less => "-", }; - Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into())) + Some(( + dbname.to_case(Case::Camel), + format!("{post:#.2} ({sign}{diff:#.2})").into(), + )) }) .into_iter() .flatten() From db7ce037634989ebce6040c1d291d7022924b395 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 26 Mar 2025 17:13:09 +0100 Subject: [PATCH 25/37] Improve the performances of computing the size of the 
documents database --- crates/meilisearch/src/routes/indexes/mod.rs | 2 +- crates/milli/src/database_stats.rs | 77 +++++++------------ crates/milli/src/index.rs | 32 -------- .../milli/src/update/index_documents/mod.rs | 4 +- crates/milli/src/update/new/indexer/mod.rs | 1 - crates/milli/src/update/new/indexer/write.rs | 5 +- 6 files changed, 33 insertions(+), 88 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 5aebf5cac..48ed1cfb1 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -518,7 +518,7 @@ impl From for IndexStats { .inner_stats .number_of_documents .unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()), - raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_size(), avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index d97dc13ba..7da1fbd2b 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -1,8 +1,13 @@ -use heed::types::Bytes; +use std::mem; + use heed::Database; +use heed::DatabaseStat; use heed::RoTxn; +use heed::Unspecified; use serde::{Deserialize, Serialize}; +use crate::BEU32; + #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] #[serde(rename_all = "camelCase")] /// The stats of a database. @@ -20,58 +25,24 @@ impl DatabaseStats { /// /// This function iterates over the whole database and computes the stats. /// It is not efficient and should be cached somewhere. 
- pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { - let mut database_stats = - Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; + pub(crate) fn new( + database: Database, + rtxn: &RoTxn<'_>, + ) -> heed::Result { + let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } = + database.stat(rtxn)?; - let mut iter = database.iter(rtxn)?; - while let Some((key, value)) = iter.next().transpose()? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - database_stats.total_key_size += key_size; - database_stats.total_value_size += value_size; - } + // We first take the total size without overflow pages as the overflow pages contains the values and only that. + let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize; + // We compute an estimated size for the keys. + let total_key_size = entries * (mem::size_of::() + 4); + let total_value_size = total_size - total_key_size; - database_stats.number_of_entries = database.len(rtxn)?; - - Ok(database_stats) - } - - /// Recomputes the stats of the database and returns the new stats. - /// - /// This function is used to update the stats of the database when some keys are modified. - /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. - pub(crate) fn recompute( - mut stats: Self, - database: Database, - before_rtxn: &RoTxn<'_>, - after_rtxn: &RoTxn<'_>, - modified_keys: I, - ) -> heed::Result - where - I: IntoIterator, - K: AsRef<[u8]>, - { - for key in modified_keys { - let key = key.as_ref(); - if let Some(value) = database.get(after_rtxn, key)? 
{ - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_add(key_size); - stats.total_value_size = stats.total_value_size.saturating_add(value_size); - } - - if let Some(value) = database.get(before_rtxn, key)? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_sub(key_size); - stats.total_value_size = stats.total_value_size.saturating_sub(value_size); - } - } - - stats.number_of_entries = database.len(after_rtxn)?; - - Ok(stats) + Ok(Self { + number_of_entries: entries as u64, + total_key_size: total_key_size as u64, + total_value_size: total_value_size as u64, + }) } pub fn average_key_size(&self) -> u64 { @@ -86,6 +57,10 @@ impl DatabaseStats { self.number_of_entries } + pub fn total_size(&self) -> u64 { + self.total_key_size + self.total_value_size + } + pub fn total_key_size(&self) -> u64 { self.total_key_size } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index a2d839d03..5f74863e8 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -411,38 +411,6 @@ impl Index { Ok(count.unwrap_or_default()) } - /// Updates the stats of the documents database based on the previous stats and the modified docids. - pub fn update_documents_stats( - &self, - wtxn: &mut RwTxn<'_>, - modified_docids: roaring::RoaringBitmap, - ) -> Result<()> { - let before_rtxn = self.read_txn()?; - let document_stats = match self.documents_stats(&before_rtxn)? { - Some(before_stats) => DatabaseStats::recompute( - before_stats, - self.documents.remap_types(), - &before_rtxn, - wtxn, - modified_docids.iter().map(|docid| docid.to_be_bytes()), - )?, - None => { - // This should never happen when there are already documents in the index, the documents stats should be present. - // If it happens, it means that the index was not properly initialized/upgraded. 
- debug_assert_eq!( - self.documents.len(&before_rtxn)?, - 0, - "The documents stats should be present when there are documents in the index" - ); - tracing::warn!("No documents stats found, creating new ones"); - DatabaseStats::new(self.documents.remap_types(), &*wtxn)? - } - }; - - self.put_documents_stats(wtxn, document_stats)?; - Ok(()) - } - /// Writes the stats of the documents database. pub fn put_documents_stats( &self, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 95342054d..5d445d283 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -28,6 +28,7 @@ pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; +use crate::database_stats::DatabaseStats; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; use crate::index::{PrefixSearch, PrefixSettings}; @@ -476,7 +477,8 @@ where if !settings_diff.settings_update_only { // Update the stats of the documents database when there is a document update. 
- self.index.update_documents_stats(self.wtxn, modified_docids)?; + let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?; + self.index.put_documents_stats(self.wtxn, stats)?; } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 4f2dd19c9..d2a88f4ff 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -234,7 +234,6 @@ where embedders, field_distribution, document_ids, - modified_docids, )?; Ok(congestion) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 8618b4b21..7ab7991b2 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -7,6 +7,7 @@ use rand::SeedableRng as _; use time::OffsetDateTime; use super::super::channel::*; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; @@ -142,7 +143,6 @@ pub(super) fn update_index( embedders: EmbeddingConfigs, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, - modified_docids: roaring::RoaringBitmap, ) -> Result<()> { index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { @@ -153,7 +153,8 @@ pub(super) fn update_index( index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - index.update_documents_stats(wtxn, modified_docids)?; + let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; Ok(()) } From c670e9a39bcde2b0e415be7ddb639823410805b6 Mon Sep 17 00:00:00 
2001 From: Kerollmops Date: Wed, 26 Mar 2025 18:08:26 +0100 Subject: [PATCH 26/37] Make sure the snaps are happy --- .../tests/documents/delete_documents.rs | 27 ++-- crates/meilisearch/tests/dumps/mod.rs | 126 ++++++++++++------ crates/meilisearch/tests/stats/mod.rs | 72 ++++++---- .../tests/upgrade/v1_12/v1_12_0.rs | 25 ++-- 4 files changed, 166 insertions(+), 84 deletions(-) diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 4dfe2cc79..060f17958 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -157,11 +157,14 @@ async fn delete_document_by_filter() { index.wait_task(task.uid()).await.succeeded(); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 4, - "rawDocumentDbSize": 42, - "avgDocumentSize": 10, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -208,11 +211,14 @@ async fn delete_document_by_filter() { "###); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 16, - "avgDocumentSize": 8, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -278,11 +284,14 @@ async fn delete_document_by_filter() { "###); let (stats, _) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 12, - 
"avgDocumentSize": 12, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index addcbeeb5..e5aa52dc6 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -28,12 +28,15 @@ async fn import_dump_v1_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -185,12 +188,15 @@ async fn import_dump_v1_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -355,12 +361,15 @@ async fn import_dump_v1_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -522,12 +531,15 @@ async fn import_dump_v2_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, 
@"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -679,12 +691,15 @@ async fn import_dump_v2_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -846,12 +861,15 @@ async fn import_dump_v2_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1010,12 +1028,15 @@ async fn import_dump_v3_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1167,12 +1188,15 @@ async fn import_dump_v3_movie_with_settings() { let (stats, code) = index.stats().await; 
snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1334,12 +1358,15 @@ async fn import_dump_v3_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1498,12 +1525,15 @@ async fn import_dump_v4_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1655,12 +1685,15 @@ async fn import_dump_v4_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 21965, - "avgDocumentSize": 414, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1822,12 +1855,15 @@ async fn import_dump_v4_rubygems_with_settings() { let (stats, code) 
= index.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 53, - "rawDocumentDbSize": 8606, - "avgDocumentSize": 162, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -1994,11 +2030,14 @@ async fn import_dump_v5() { let (stats, code) = index1.stats().await; snapshot!(code, @"200 OK"); - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 10, - "rawDocumentDbSize": 6782, - "avgDocumentSize": 678, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -2031,12 +2070,15 @@ async fn import_dump_v5() { let (stats, code) = index2.stats().await; snapshot!(code, @"200 OK"); snapshot!( - json_string!(stats), + json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 10, - "rawDocumentDbSize": 6782, - "avgDocumentSize": 678, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index 20a8eaef6..aee626460 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -110,11 +110,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 
13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -135,11 +138,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, @@ -160,11 +166,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 2, @@ -186,11 +195,14 @@ async fn add_remove_embeddings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 2, "numberOfEmbeddedDocuments": 1, @@ -236,11 +248,14 @@ async fn add_remove_embedded_documents() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + 
".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 27, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, @@ -257,11 +272,14 @@ async fn add_remove_embedded_documents() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 13, - "avgDocumentSize": 13, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 1, @@ -290,11 +308,14 @@ async fn update_embedder_settings() { index.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 108, - "avgDocumentSize": 54, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -326,11 +347,14 @@ async fn update_embedder_settings() { server.wait_task(response.uid()).await.succeeded(); let (stats, _code) = index.stats().await; - snapshot!(json_string!(stats), @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[size]", + ".avgDocumentSize" => "[size]", + }), @r###" { "numberOfDocuments": 2, - "rawDocumentDbSize": 108, - "avgDocumentSize": 54, + "rawDocumentDbSize": "[size]", + "avgDocumentSize": "[size]", "isIndexing": false, "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 2, diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 
9fc4d0e5b..1b2ae054c 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -133,7 +133,9 @@ async fn check_the_index_scheduler(server: &Server) { let (stats, _) = server.stats().await; assert_json_snapshot!(stats, { ".databaseSize" => "[bytes]", - ".usedDatabaseSize" => "[bytes]" + ".usedDatabaseSize" => "[bytes]", + ".indexes.kefir.rawDocumentDbSize" => "[bytes]", + ".indexes.kefir.avgDocumentSize" => "[bytes]", }, @r###" { @@ -143,8 +145,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -217,7 +219,9 @@ async fn check_the_index_scheduler(server: &Server) { let (stats, _) = server.stats().await; assert_json_snapshot!(stats, { ".databaseSize" => "[bytes]", - ".usedDatabaseSize" => "[bytes]" + ".usedDatabaseSize" => "[bytes]", + ".indexes.kefir.rawDocumentDbSize" => "[bytes]", + ".indexes.kefir.avgDocumentSize" => "[bytes]", }, @r###" { @@ -227,8 +231,8 @@ async fn check_the_index_scheduler(server: &Server) { "indexes": { "kefir": { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -245,11 +249,14 @@ async fn check_the_index_scheduler(server: &Server) { "###); let index = server.index("kefir"); let (stats, _) = index.stats().await; - snapshot!(stats, @r###" + snapshot!(json_string!(stats, { + ".rawDocumentDbSize" => "[bytes]", + ".avgDocumentSize" => "[bytes]", + }), @r###" { "numberOfDocuments": 1, - "rawDocumentDbSize": 109, - "avgDocumentSize": 109, + "rawDocumentDbSize": "[bytes]", + "avgDocumentSize": "[bytes]", "isIndexing": false, "numberOfEmbeddings": 0, 
"numberOfEmbeddedDocuments": 0, From 811143cbe940db843be07954a91c4db49150ac57 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 27 Mar 2025 10:17:28 +0100 Subject: [PATCH 27/37] Add more progress precision when doing post processing --- .../src/update/new/indexer/post_processing.rs | 64 +++++++++++++++---- crates/milli/src/update/new/steps.rs | 20 ++++++ 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/update/new/indexer/post_processing.rs b/crates/milli/src/update/new/indexer/post_processing.rs index 2a01fccf3..aace70cff 100644 --- a/crates/milli/src/update/new/indexer/post_processing.rs +++ b/crates/milli/src/update/new/indexer/post_processing.rs @@ -7,12 +7,13 @@ use itertools::{merge_join_by, EitherOrBoth}; use super::document_changes::IndexingContext; use crate::facet::FacetType; use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; +use crate::progress::Progress; use crate::update::del_add::DelAdd; use crate::update::facet::new_incremental::FacetsUpdateIncremental; use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::update::new::facet_search_builder::FacetSearchBuilder; use crate::update::new::merger::FacetFieldIdDelta; -use crate::update::new::steps::IndexingStep; +use crate::update::new::steps::{IndexingStep, PostProcessingFacets, PostProcessingWords}; use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; use crate::update::new::words_prefix_docids::{ compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids, @@ -33,11 +34,23 @@ where { let index = indexing_context.index; indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); - compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?; - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; + compute_facet_level_database( + index, + wtxn, + facet_field_ids_delta, + &mut 
global_fields_ids_map, + indexing_context.progress, + )?; + compute_facet_search_database(index, wtxn, global_fields_ids_map, indexing_context.progress)?; indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); - if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { - compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?; + if let Some(prefix_delta) = compute_word_fst(index, wtxn, indexing_context.progress)? { + compute_prefix_database( + index, + wtxn, + prefix_delta, + indexing_context.grenad_parameters, + indexing_context.progress, + )?; }; Ok(()) } @@ -48,21 +61,32 @@ fn compute_prefix_database( wtxn: &mut RwTxn, prefix_delta: PrefixDelta, grenad_parameters: &GrenadParameters, + progress: &Progress, ) -> Result<()> { let PrefixDelta { modified, deleted } = prefix_delta; - // Compute word prefix docids + + progress.update_progress(PostProcessingWords::WordPrefixDocids); compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute exact word prefix docids + + progress.update_progress(PostProcessingWords::ExactWordPrefixDocids); compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix fid docids + + progress.update_progress(PostProcessingWords::WordPrefixFieldIdDocids); compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?; - // Compute word prefix position docids + + progress.update_progress(PostProcessingWords::WordPrefixPositionDocids); compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters) } #[tracing::instrument(level = "trace", skip_all, target = "indexing")] -fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result> { +fn compute_word_fst( + index: &Index, + wtxn: &mut RwTxn, + progress: &Progress, +) -> Result> { let rtxn = index.read_txn()?; + progress.update_progress(PostProcessingWords::WordFst); + let words_fst = 
index.words_fst(&rtxn)?; let mut word_fst_builder = WordFstBuilder::new(&words_fst)?; let prefix_settings = index.prefix_settings(&rtxn)?; @@ -112,8 +136,10 @@ fn compute_facet_search_database( index: &Index, wtxn: &mut RwTxn, global_fields_ids_map: GlobalFieldsIdsMap, + progress: &Progress, ) -> Result<()> { let rtxn = index.read_txn()?; + progress.update_progress(PostProcessingFacets::FacetSearch); // if the facet search is not enabled, we can skip the rest of the function if !index.facet_search(wtxn)? { @@ -171,10 +197,16 @@ fn compute_facet_level_database( wtxn: &mut RwTxn, mut facet_field_ids_delta: FacetFieldIdsDelta, global_fields_ids_map: &mut GlobalFieldsIdsMap, + progress: &Progress, ) -> Result<()> { let rtxn = index.read_txn()?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; - for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() { + let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_string_delta().collect(); + // We move all bulks at the front and incrementals (others) at the end. + deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 }); + + for (fid, delta) in deltas { // skip field ids that should not be facet leveled let Some(metadata) = global_fields_ids_map.metadata(fid) else { continue; @@ -187,11 +219,13 @@ fn compute_facet_level_database( let _entered = span.enter(); match delta { FacetFieldIdDelta::Bulk => { + progress.update_progress(PostProcessingFacets::StringsBulk); tracing::debug!(%fid, "bulk string facet processing"); FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String) .execute(wtxn)? 
} FacetFieldIdDelta::Incremental(delta_data) => { + progress.update_progress(PostProcessingFacets::StringsIncremental); tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing"); FacetsUpdateIncremental::new( index, @@ -207,16 +241,22 @@ fn compute_facet_level_database( } } - for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() { + let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_number_delta().collect(); + // We move all bulks at the front and incrementals (others) at the end. + deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 }); + + for (fid, delta) in deltas { let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number"); let _entered = span.enter(); match delta { FacetFieldIdDelta::Bulk => { + progress.update_progress(PostProcessingFacets::NumbersBulk); tracing::debug!(%fid, "bulk number facet processing"); FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number) .execute(wtxn)? } FacetFieldIdDelta::Incremental(delta_data) => { + progress.update_progress(PostProcessingFacets::NumbersIncremental); tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing"); FacetsUpdateIncremental::new( index, diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index da71819c6..eabf9104e 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -20,3 +20,23 @@ make_enum_progress! { Finalizing, } } + +make_enum_progress! { + pub enum PostProcessingFacets { + StringsBulk, + StringsIncremental, + NumbersBulk, + NumbersIncremental, + FacetSearch, + } +} + +make_enum_progress! 
{ + pub enum PostProcessingWords { + WordFst, + WordPrefixDocids, + ExactWordPrefixDocids, + WordPrefixFieldIdDocids, + WordPrefixPositionDocids, + } +} From 7707fb18dd0c9138721e7e4cfaeb96c363fe8e6c Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:51:36 +0100 Subject: [PATCH 28/37] add embedding with dimension mismatch test case --- crates/meilisearch/tests/vector/mod.rs | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 67da51702..c6f32ccc5 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -164,6 +164,56 @@ async fn add_remove_user_provided() { "###); } +#[actix_rt::test] +async fn user_provide_mismatched_embedding_dimension() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + 
} + "#); +} + async fn generate_default_user_provided_documents(server: &Server) -> Index { let index = server.index("doggo"); From 62de70b73c3f7ba7fdd62c102a8ac0edbd4de68b Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Wed, 26 Mar 2025 12:57:25 +0100 Subject: [PATCH 29/37] Document problematic case in test and acknowledge PR comment --- crates/meilisearch/tests/vector/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index c6f32ccc5..fd9c314e2 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -212,6 +212,14 @@ async fn user_provide_mismatched_embedding_dimension() { "finishedAt": "[date]" } "#); + + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass + let new_document = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, + ]); + let (value, code) = index.add_documents(new_document, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 0e475cb5e649fb2b4b78a263f423b6d0ca74b31e Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Mar 2025 11:07:01 +0100 Subject: [PATCH 30/37] fix warn and show what meilisearch understood of the vectors in the cursed test --- crates/meilisearch/tests/vector/mod.rs | 35 +++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index fd9c314e2..14474c210 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -217,9 +217,42 @@ async fn user_provide_mismatched_embedding_dimension() { let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); - let (value, code) = 
index.add_documents(new_document, None).await; + let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); index.wait_task(response.uid()).await.succeeded(); + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ], + [ + 1.0, + 2.0, + 2.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 94ea263befc7f5e49ccbed6c27146dbb331dc95d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:28 +0200 Subject: [PATCH 31/37] Add new error for dimensions mismatch during indexing --- crates/meilisearch-types/src/error.rs | 5 ++++- crates/milli/src/error.rs | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 859563d8a..6c547d51e 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -454,7 +454,10 @@ impl ErrorCode for milli::Error { } UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, - UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, + UserError::InvalidVectorDimensions { .. } + | UserError::InvalidIndexingVectorDimensions { .. } => { + Code::InvalidVectorDimensions + } UserError::InvalidVectorsMapType { .. } | UserError::InvalidVectorsEmbedderConf { .. 
} => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index e1098cfa5..e61283e4c 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidGeoField(#[from] GeoError), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, + #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")] + InvalidIndexingVectorDimensions { + embedder_name: String, + document_id: String, + embedding_index: usize, + expected: usize, + found: usize, + }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] From f72986446668e9ea504b79d55e7e8505b00c0685 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:57 +0200 Subject: [PATCH 32/37] Check dimension mismatch at insertion time --- .../src/update/new/extract/vectors/mod.rs | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 6820ee67b..696864e7f 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { // do we have set embeddings? 
if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + update.external_document_id(), update.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { document_id: update.external_document_id().to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(), @@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + insertion.external_document_id(), insertion.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { .to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let rendered = prompt.render_document( insertion.external_document_id(), @@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> { embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, + dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, @@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); + let dimensions = embedder.dimensions(); Self { texts, ids, @@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_name, user_provided, has_manual_generation: None, + dimensions, } } @@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } } - fn set_vectors(&self, docid: DocumentId, embeddings: Vec) { + fn set_vectors( + &self, + 
external_docid: &'a str, + docid: DocumentId, + embeddings: Vec, + ) -> Result<()> { + for (embedding_index, embedding) in embeddings.iter().enumerate() { + if embedding.len() != self.dimensions { + return Err(UserError::InvalidIndexingVectorDimensions { + expected: self.dimensions, + found: embedding.len(), + embedder_name: self.embedder_name.to_string(), + document_id: external_docid.to_string(), + embedding_index, + } + .into()); + } + } self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + Ok(()) } } From 08ff135ad6c48d4936e4a45bc86219be208ff273 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 15:26:31 +0200 Subject: [PATCH 33/37] Fix test --- crates/meilisearch/tests/vector/mod.rs | 60 +++++++++++--------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 14474c210..5e34a4c23 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -188,7 +188,7 @@ async fn user_provide_mismatched_embedding_dimension() { let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; - snapshot!(task, @r#" + snapshot!(task, @r###" { "uid": "[uid]", "batchUid": "[batch_uid]", @@ -201,7 +201,7 @@ async fn user_provide_mismatched_embedding_dimension() { "indexedDocuments": 0 }, "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", "code": "invalid_vector_dimensions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" @@ -211,46 +211,36 @@ async fn user_provide_mismatched_embedding_dimension() { "startedAt": "[date]", "finishedAt": "[date]" } - "#); + 
"###); - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(response.uid()).await.succeeded(); - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" + let task = index.wait_task(response.uid()).await; + snapshot!(task, @r###" { - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": { - "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 1.0 - ], - [ - 1.0, - 2.0, - 2.0 - ] - ], - "regenerate": false - } - } - } - ], - "offset": 0, - "limit": 20, - "total": 1 + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" } "###); } From 0656a0d515044f72f9043e42e28331a0fe4fd8cf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Apr 2025 14:25:27 +0200 Subject: [PATCH 34/37] Optimize roaring operation Co-authored-by: Many the fish --- crates/milli/src/search/new/bucket_sort.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index a659dd226..ca7a4a986 100644 --- 
a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -252,8 +252,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( || is_below_threshold { if is_below_threshold { - all_candidates -= - next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index]; + all_candidates -= &next_bucket.candidates; + all_candidates -= &ranking_rule_universes[cur_ranking_rule_index]; } else { maybe_add_to_results!(next_bucket.candidates); } From 1db550ec7f46984a3532eee859672a449389bc68 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 3 Apr 2025 15:47:56 +0200 Subject: [PATCH 35/37] make meilisearch accept cancelation tasks even when the disk is full --- crates/index-scheduler/src/lib.rs | 4 +-- crates/index-scheduler/src/queue/mod.rs | 2 -- crates/index-scheduler/src/queue/test.rs | 33 ++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 5c8517650..99f62983a 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -625,8 +625,8 @@ impl IndexScheduler { task_id: Option, dry_run: bool, ) -> Result { - // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task - if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty()) + // if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incomming task + if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty()) && (self.env.non_free_pages_size()? 
* 100) / self.env.info().map_size as u64 > 40 { return Err(Error::NoSpaceLeftInTaskQueue); diff --git a/crates/index-scheduler/src/queue/mod.rs b/crates/index-scheduler/src/queue/mod.rs index b13e3ffe2..92de10fe1 100644 --- a/crates/index-scheduler/src/queue/mod.rs +++ b/crates/index-scheduler/src/queue/mod.rs @@ -292,8 +292,6 @@ impl Queue { return Ok(task); } - // Get rid of the mutability. - let task = task; self.tasks.register(wtxn, &task)?; Ok(task) diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs index 3dbdd2db3..91f412025 100644 --- a/crates/index-scheduler/src/queue/test.rs +++ b/crates/index-scheduler/src/queue/test.rs @@ -364,7 +364,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // Even the task deletion that doesn't delete anything shouldn't be accepted + // Even the task deletion and cancelation that don't delete anything shouldn be refused let result = index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, @@ -373,10 +373,39 @@ fn test_task_queue_is_full() { ) .unwrap_err(); snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + let result = index_scheduler + .register( + KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. 
Please delete tasks to continue performing write operations."); + // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // But a task deletion that delete something should works + // But a task cancelation that cancel something should works + index_scheduler + .register( + KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // But we should still be forbidden from enqueuing new tasks + let result = index_scheduler + .register( + KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None }, + None, + false, + ) + .unwrap_err(); + snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations."); + + // And a task deletion that delete something should works index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() }, From 796a325972acffcce682f39be747653db83dd71d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 3 Apr 2025 15:53:42 +0200 Subject: [PATCH 36/37] Fix typos Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- crates/index-scheduler/src/lib.rs | 2 +- crates/index-scheduler/src/queue/test.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 99f62983a..9052b92f1 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -625,7 +625,7 @@ impl IndexScheduler { task_id: Option, dry_run: bool, ) -> Result { - // if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incomming task + // if the task doesn't delete or 
cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty()) && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40 { diff --git a/crates/index-scheduler/src/queue/test.rs b/crates/index-scheduler/src/queue/test.rs index 91f412025..7582da0d6 100644 --- a/crates/index-scheduler/src/queue/test.rs +++ b/crates/index-scheduler/src/queue/test.rs @@ -364,7 +364,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // Even the task deletion and cancelation that don't delete anything shouldn be refused + // Even the task deletion and cancelation that don't delete anything should be refused let result = index_scheduler .register( KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() }, @@ -385,7 +385,7 @@ fn test_task_queue_is_full() { // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice"); - // But a task cancelation that cancel something should works + // But a task cancelation that cancel something should work index_scheduler .register( KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() }, From 61db56f7856280946b1fb8da0c23d8f12a5308c6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 14 Apr 2025 14:55:57 +0200 Subject: [PATCH 37/37] remove duplicated test --- crates/meilisearch/tests/vector/mod.rs | 91 -------------------------- 1 file changed, 91 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 
e0fde8660..98555dfac 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -164,97 +164,6 @@ async fn add_remove_user_provided() { "#); } -#[actix_rt::test] -async fn user_provide_mismatched_embedding_dimension() { - let server = Server::new().await; - let index = server.index("doggo"); - - let (response, code) = index - .update_settings(json!({ - "embedders": { - "manual": { - "source": "userProvided", - "dimensions": 3, - } - }, - })) - .await; - snapshot!(code, @"202 Accepted"); - server.wait_task(response.uid()).await.succeeded(); - - let documents = json!([ - {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, - ]); - let (value, code) = index.add_documents(documents, None).await; - snapshot!(code, @"202 Accepted"); - let task = index.wait_task(value.uid()).await; - snapshot!(task, @r#" - { - "uid": "[uid]", - "batchUid": "[batch_uid]", - "indexUid": "doggo", - "status": "failed", - "type": "documentAdditionOrUpdate", - "canceledBy": null, - "details": { - "receivedDocuments": 1, - "indexedDocuments": 0 - }, - "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", - "code": "invalid_vector_dimensions", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" - }, - "duration": "[duration]", - "enqueuedAt": "[date]", - "startedAt": "[date]", - "finishedAt": "[date]" - } - "#); - - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass - let new_document = json!([ - {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, - ]); - let (response, code) = index.add_documents(new_document, None).await; - snapshot!(code, @"202 Accepted"); - index.wait_task(response.uid()).await.succeeded(); - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" - 
{ - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": { - "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 1.0 - ], - [ - 1.0, - 2.0, - 2.0 - ] - ], - "regenerate": false - } - } - } - ], - "offset": 0, - "limit": 20, - "total": 1 - } - "###); -} - #[actix_rt::test] async fn user_provide_mismatched_embedding_dimension() { let server = Server::new().await;