Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-12-16 01:16:56 +00:00)
Compare commits: v1.14.0-rc ... prototype- (23 commits)
| SHA1 |
|---|
| 80d7aa7bdc |
| 94b43001db |
| 796a325972 |
| 1db550ec7f |
| 418fa47963 |
| 0656a0d515 |
| e36a8c50b9 |
| 08ff135ad6 |
| f729864466 |
| 94ea263bef |
| 0e475cb5e6 |
| 62de70b73c |
| 7707fb18dd |
| bb2e9419d3 |
| cf68713145 |
| 811143cbe9 |
| c670e9a39b |
| 65f1b13475 |
| db7ce03763 |
| 7ed9adde29 |
| f9807ba32e |
| 8c8cc59a6c |
| f540a69ac3 |
Cargo.lock (generated)
@@ -1,6 +1,6 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3
+version = 4

 [[package]]
 name = "actix-codec"
@@ -394,8 +394,7 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
 [[package]]
 name = "arroy"
 version = "0.6.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
+source = "git+https://github.com/meilisearch/arroy?branch=no-simd-x86-arroy#2ebbed058a6e3292707486e5a57d754d94f3fa2a"
 dependencies = [
  "bytemuck",
  "byteorder",
@@ -625,8 +625,7 @@ impl IndexScheduler {
         task_id: Option<TaskId>,
         dry_run: bool,
     ) -> Result<Task> {
-        // if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
-        if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
+        // if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task
+        if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty())
            && (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40
        {
            return Err(Error::NoSpaceLeftInTaskQueue);
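Read on its own, the guard above says: once more than 40% of the task database's map size is in use, refuse any task that cannot free space. A self-contained sketch under that reading, with the heed environment calls replaced by plain integers (`non_free_bytes` and `map_size_bytes` are stand-ins, not the real API):

```rust
// Hypothetical, self-contained restatement of the capacity guard above.
// `non_free_bytes` and `map_size_bytes` stand in for
// env.non_free_pages_size()? and env.info().map_size in the real code.
fn must_refuse_enqueue(non_free_bytes: u64, map_size_bytes: u64, frees_space: bool) -> bool {
    // Deletions and cancelations that target at least one task are always
    // accepted, since they are the only way to reclaim queue space.
    !frees_space && (non_free_bytes * 100) / map_size_bytes > 40
}

fn main() {
    assert!(!must_refuse_enqueue(30, 100, false)); // 30% full: everything accepted
    assert!(must_refuse_enqueue(41, 100, false));  // over 40%: plain tasks refused
    assert!(!must_refuse_enqueue(90, 100, true));  // cleanup tasks still accepted
}
```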
@@ -292,8 +292,6 @@ impl Queue {
             return Ok(task);
         }

-        // Get rid of the mutability.
-        let task = task;
         self.tasks.register(wtxn, &task)?;

         Ok(task)
@@ -364,7 +364,7 @@ fn test_task_queue_is_full() {
     // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
     snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

-    // Even the task deletion that doesn't delete anything shouldn't be accepted
+    // Even the task deletion and cancelation that don't delete anything should be refused
     let result = index_scheduler
         .register(
             KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() },
@@ -373,10 +373,39 @@ fn test_task_queue_is_full() {
         )
         .unwrap_err();
     snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
+    let result = index_scheduler
+        .register(
+            KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() },
+            None,
+            false,
+        )
+        .unwrap_err();
+    snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");

     // we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
     snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

-    // But a task deletion that delete something should works
+    // But a task cancelation that cancel something should work
+    index_scheduler
+        .register(
+            KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() },
+            None,
+            false,
+        )
+        .unwrap();
+    handle.advance_one_successful_batch();
+
+    // But we should still be forbidden from enqueuing new tasks
+    let result = index_scheduler
+        .register(
+            KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None },
+            None,
+            false,
+        )
+        .unwrap_err();
+    snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
+
+    // And a task deletion that delete something should works
     index_scheduler
         .register(
             KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() },
@@ -20,6 +20,7 @@ use std::path::PathBuf;
 use std::sync::atomic::{AtomicBool, AtomicU32, Ordering};
 use std::sync::Arc;

+use convert_case::{Case, Casing as _};
 use meilisearch_types::error::ResponseError;
 use meilisearch_types::heed::{Env, WithoutTls};
 use meilisearch_types::milli;
@@ -381,7 +382,10 @@ impl IndexScheduler {
                     Less => "-",
                 };

-                Some((dbname.to_string(), format!("{post:#.2} ({sign}{diff:#.2})").into()))
+                Some((
+                    dbname.to_case(Case::Camel),
+                    format!("{post:#.2} ({sign}{diff:#.2})").into(),
+                ))
             })
             .into_iter()
             .flatten()
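The only behavioral change here is the casing of the database name in the returned stats. A quick demonstration of the `convert_case` API imported in the previous hunk:

```rust
// What the new `to_case(Case::Camel)` call does to a snake_case database name.
use convert_case::{Case, Casing as _};

fn main() {
    assert_eq!("word_prefix_docids".to_case(Case::Camel), "wordPrefixDocids");
    // Names without underscores pass through unchanged.
    assert_eq!("documents".to_case(Case::Camel), "documents");
}
```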
@@ -454,7 +454,10 @@ impl ErrorCode for milli::Error {
                 }
                 UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
                 UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
-                UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
+                UserError::InvalidVectorDimensions { .. }
+                | UserError::InvalidIndexingVectorDimensions { .. } => {
+                    Code::InvalidVectorDimensions
+                }
                 UserError::InvalidVectorsMapType { .. }
                 | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
                 UserError::TooManyVectors(_, _) => Code::TooManyVectors,
@@ -518,7 +518,7 @@ impl From<index_scheduler::IndexStats> for IndexStats {
                 .inner_stats
                 .number_of_documents
                 .unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()),
-            raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
+            raw_document_db_size: stats.inner_stats.documents_database_stats.total_size(),
             avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
             is_indexing: stats.is_indexing,
             number_of_embeddings: stats.inner_stats.number_of_embeddings,
@@ -157,11 +157,14 @@ async fn delete_document_by_filter() {
     index.wait_task(task.uid()).await.succeeded();

     let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 4,
-      "rawDocumentDbSize": 42,
-      "avgDocumentSize": 10,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
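The test hunks here and below all apply the same fix: `rawDocumentDbSize` and `avgDocumentSize` are now estimates derived from LMDB page counts, so the tests stop asserting exact byte values and redact them instead. A minimal, hypothetical sketch of the redaction mechanism using the insta crate directly (the project's `snapshot!` and `json_string!` macros wrap the same idea, so this is illustrative rather than the exact project code):

```rust
// Minimal illustration of snapshot redaction with the insta crate.
use insta::assert_json_snapshot;

#[test]
fn redacted_stats_snapshot() {
    let stats = serde_json::json!({
        "numberOfDocuments": 4,
        "rawDocumentDbSize": 42, // varies with page accounting, so it gets redacted
    });
    assert_json_snapshot!(stats, {
        ".rawDocumentDbSize" => "[size]",
    }, @r###"
    {
      "numberOfDocuments": 4,
      "rawDocumentDbSize": "[size]"
    }
    "###);
}
```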
@@ -208,11 +211,14 @@ async fn delete_document_by_filter() {
     "###);

     let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 16,
-      "avgDocumentSize": 8,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -278,11 +284,14 @@ async fn delete_document_by_filter() {
     "###);

     let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 1,
-      "rawDocumentDbSize": 12,
-      "avgDocumentSize": 12,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -28,12 +28,15 @@ async fn import_dump_v1_movie_raw() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -185,12 +188,15 @@ async fn import_dump_v1_movie_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -355,12 +361,15 @@ async fn import_dump_v1_rubygems_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 8606,
-      "avgDocumentSize": 162,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -522,12 +531,15 @@ async fn import_dump_v2_movie_raw() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -679,12 +691,15 @@ async fn import_dump_v2_movie_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -846,12 +861,15 @@ async fn import_dump_v2_rubygems_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 8606,
-      "avgDocumentSize": 162,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1010,12 +1028,15 @@ async fn import_dump_v3_movie_raw() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1167,12 +1188,15 @@ async fn import_dump_v3_movie_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1334,12 +1358,15 @@ async fn import_dump_v3_rubygems_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 8606,
-      "avgDocumentSize": 162,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1498,12 +1525,15 @@ async fn import_dump_v4_movie_raw() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1655,12 +1685,15 @@ async fn import_dump_v4_movie_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 21965,
-      "avgDocumentSize": 414,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1822,12 +1855,15 @@ async fn import_dump_v4_rubygems_with_settings() {
     let (stats, code) = index.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 53,
-      "rawDocumentDbSize": 8606,
-      "avgDocumentSize": 162,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -1994,11 +2030,14 @@ async fn import_dump_v5() {

     let (stats, code) = index1.stats().await;
     snapshot!(code, @"200 OK");
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 10,
-      "rawDocumentDbSize": 6782,
-      "avgDocumentSize": 678,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -2031,12 +2070,15 @@ async fn import_dump_v5() {
     let (stats, code) = index2.stats().await;
     snapshot!(code, @"200 OK");
     snapshot!(
-        json_string!(stats),
+        json_string!(stats, {
+            ".rawDocumentDbSize" => "[size]",
+            ".avgDocumentSize" => "[size]",
+        }),
         @r###"
     {
       "numberOfDocuments": 10,
-      "rawDocumentDbSize": 6782,
-      "avgDocumentSize": 678,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -110,11 +110,14 @@ async fn add_remove_embeddings() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 27,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 5,
       "numberOfEmbeddedDocuments": 2,
@@ -135,11 +138,14 @@ async fn add_remove_embeddings() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 27,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 3,
       "numberOfEmbeddedDocuments": 2,
@@ -160,11 +166,14 @@ async fn add_remove_embeddings() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 27,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 2,
       "numberOfEmbeddedDocuments": 2,
@@ -186,11 +195,14 @@ async fn add_remove_embeddings() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 27,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 2,
       "numberOfEmbeddedDocuments": 1,
@@ -236,11 +248,14 @@ async fn add_remove_embedded_documents() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 27,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 5,
       "numberOfEmbeddedDocuments": 2,
@@ -257,11 +272,14 @@ async fn add_remove_embedded_documents() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 1,
-      "rawDocumentDbSize": 13,
-      "avgDocumentSize": 13,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 3,
       "numberOfEmbeddedDocuments": 1,
@@ -290,11 +308,14 @@ async fn update_embedder_settings() {
     index.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 108,
-      "avgDocumentSize": 54,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -326,11 +347,14 @@ async fn update_embedder_settings() {
     server.wait_task(response.uid()).await.succeeded();

     let (stats, _code) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[size]",
+        ".avgDocumentSize" => "[size]",
+    }), @r###"
     {
       "numberOfDocuments": 2,
-      "rawDocumentDbSize": 108,
-      "avgDocumentSize": 54,
+      "rawDocumentDbSize": "[size]",
+      "avgDocumentSize": "[size]",
       "isIndexing": false,
       "numberOfEmbeddings": 3,
       "numberOfEmbeddedDocuments": 2,
@@ -133,7 +133,9 @@ async fn check_the_index_scheduler(server: &Server) {
     let (stats, _) = server.stats().await;
     assert_json_snapshot!(stats, {
         ".databaseSize" => "[bytes]",
-        ".usedDatabaseSize" => "[bytes]"
+        ".usedDatabaseSize" => "[bytes]",
+        ".indexes.kefir.rawDocumentDbSize" => "[bytes]",
+        ".indexes.kefir.avgDocumentSize" => "[bytes]",
     },
     @r###"
     {
@@ -143,8 +145,8 @@ async fn check_the_index_scheduler(server: &Server) {
       "indexes": {
         "kefir": {
           "numberOfDocuments": 1,
-          "rawDocumentDbSize": 109,
-          "avgDocumentSize": 109,
+          "rawDocumentDbSize": "[bytes]",
+          "avgDocumentSize": "[bytes]",
           "isIndexing": false,
           "numberOfEmbeddings": 0,
           "numberOfEmbeddedDocuments": 0,
@@ -217,7 +219,9 @@ async fn check_the_index_scheduler(server: &Server) {
     let (stats, _) = server.stats().await;
     assert_json_snapshot!(stats, {
         ".databaseSize" => "[bytes]",
-        ".usedDatabaseSize" => "[bytes]"
+        ".usedDatabaseSize" => "[bytes]",
+        ".indexes.kefir.rawDocumentDbSize" => "[bytes]",
+        ".indexes.kefir.avgDocumentSize" => "[bytes]",
     },
     @r###"
     {
@@ -227,8 +231,8 @@ async fn check_the_index_scheduler(server: &Server) {
       "indexes": {
         "kefir": {
           "numberOfDocuments": 1,
-          "rawDocumentDbSize": 109,
-          "avgDocumentSize": 109,
+          "rawDocumentDbSize": "[bytes]",
+          "avgDocumentSize": "[bytes]",
           "isIndexing": false,
           "numberOfEmbeddings": 0,
           "numberOfEmbeddedDocuments": 0,
@@ -245,11 +249,14 @@ async fn check_the_index_scheduler(server: &Server) {
     "###);
     let index = server.index("kefir");
     let (stats, _) = index.stats().await;
-    snapshot!(stats, @r###"
+    snapshot!(json_string!(stats, {
+        ".rawDocumentDbSize" => "[bytes]",
+        ".avgDocumentSize" => "[bytes]",
+    }), @r###"
     {
       "numberOfDocuments": 1,
-      "rawDocumentDbSize": 109,
-      "avgDocumentSize": 109,
+      "rawDocumentDbSize": "[bytes]",
+      "avgDocumentSize": "[bytes]",
       "isIndexing": false,
       "numberOfEmbeddings": 0,
       "numberOfEmbeddedDocuments": 0,
@@ -164,6 +164,87 @@ async fn add_remove_user_provided() {
     "###);
 }

+#[actix_rt::test]
+async fn user_provide_mismatched_embedding_dimension() {
+    let server = Server::new().await;
+    let index = server.index("doggo");
+
+    let (response, code) = index
+        .update_settings(json!({
+            "embedders": {
+                "manual": {
+                    "source": "userProvided",
+                    "dimensions": 3,
+                }
+            },
+        }))
+        .await;
+    snapshot!(code, @"202 Accepted");
+    server.wait_task(response.uid()).await.succeeded();
+
+    let documents = json!([
+      {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
+    ]);
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": "[uid]",
+      "batchUid": "[batch_uid]",
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
+        "code": "invalid_vector_dimensions",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let new_document = json!([
+      {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
+    ]);
+    let (response, code) = index.add_documents(new_document, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(response.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": "[uid]",
+      "batchUid": "[batch_uid]",
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
+        "code": "invalid_vector_dimensions",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+}
+
 async fn generate_default_user_provided_documents(server: &Server) -> Index {
     let index = server.index("doggo");
@@ -87,7 +87,7 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838
     "no_time",
     "sync",
 ] }
-arroy = "0.6.1"
+arroy = { git = "https://github.com/meilisearch/arroy", branch = "no-simd-x86-arroy" }
 rand = "0.8.5"
 tracing = "0.1.41"
 ureq = { version = "2.12.1", features = ["json"] }
@@ -1,8 +1,13 @@
-use heed::types::Bytes;
+use std::mem;

 use heed::Database;
+use heed::DatabaseStat;
 use heed::RoTxn;
+use heed::Unspecified;
 use serde::{Deserialize, Serialize};

+use crate::BEU32;
+
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
 #[serde(rename_all = "camelCase")]
 /// The stats of a database.
@@ -20,58 +25,24 @@ impl DatabaseStats {
     ///
     /// This function iterates over the whole database and computes the stats.
     /// It is not efficient and should be cached somewhere.
-    pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> heed::Result<Self> {
-        let mut database_stats =
-            Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 };
-
-        let mut iter = database.iter(rtxn)?;
-        while let Some((key, value)) = iter.next().transpose()? {
-            let key_size = key.len() as u64;
-            let value_size = value.len() as u64;
-            database_stats.total_key_size += key_size;
-            database_stats.total_value_size += value_size;
-        }
-
-        database_stats.number_of_entries = database.len(rtxn)?;
-
-        Ok(database_stats)
-    }
-
-    /// Recomputes the stats of the database and returns the new stats.
-    ///
-    /// This function is used to update the stats of the database when some keys are modified.
-    /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states.
-    pub(crate) fn recompute<I, K>(
-        mut stats: Self,
-        database: Database<Bytes, Bytes>,
-        before_rtxn: &RoTxn<'_>,
-        after_rtxn: &RoTxn<'_>,
-        modified_keys: I,
-    ) -> heed::Result<Self>
-    where
-        I: IntoIterator<Item = K>,
-        K: AsRef<[u8]>,
-    {
-        for key in modified_keys {
-            let key = key.as_ref();
-            if let Some(value) = database.get(after_rtxn, key)? {
-                let key_size = key.len() as u64;
-                let value_size = value.len() as u64;
-                stats.total_key_size = stats.total_key_size.saturating_add(key_size);
-                stats.total_value_size = stats.total_value_size.saturating_add(value_size);
-            }
-
-            if let Some(value) = database.get(before_rtxn, key)? {
-                let key_size = key.len() as u64;
-                let value_size = value.len() as u64;
-                stats.total_key_size = stats.total_key_size.saturating_sub(key_size);
-                stats.total_value_size = stats.total_value_size.saturating_sub(value_size);
-            }
-        }
-
-        stats.number_of_entries = database.len(after_rtxn)?;
-
-        Ok(stats)
+    pub(crate) fn new(
+        database: Database<BEU32, Unspecified>,
+        rtxn: &RoTxn<'_>,
+    ) -> heed::Result<Self> {
+        let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } =
+            database.stat(rtxn)?;
+
+        // We first take the total size without overflow pages as the overflow pages contains the values and only that.
+        let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize;
+        // We compute an estimated size for the keys.
+        let total_key_size = entries * (mem::size_of::<u32>() + 4);
+        let total_value_size = total_size - total_key_size;
+
+        Ok(Self {
+            number_of_entries: entries as u64,
+            total_key_size: total_key_size as u64,
+            total_value_size: total_value_size as u64,
+        })
     }

     pub fn average_key_size(&self) -> u64 {
@@ -86,6 +57,10 @@ impl DatabaseStats {
         self.number_of_entries
     }

+    pub fn total_size(&self) -> u64 {
+        self.total_key_size + self.total_value_size
+    }
+
     pub fn total_key_size(&self) -> u64 {
         self.total_key_size
     }
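To make the estimate above concrete, here is a small worked example with invented LMDB numbers (the page size, page counts, and entry count are assumptions, not values from the codebase):

```rust
// A back-of-the-envelope check of the page-based estimate, under
// hypothetical LMDB statistics.
fn main() {
    let page_size: u32 = 4096;
    let (branch_pages, leaf_pages, overflow_pages, entries) = (2usize, 50, 10, 5_000);

    // Every page kind contributes to the on-disk footprint of the database.
    let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize;

    // Keys are BEU32 document ids, so ~4 bytes each plus ~4 bytes of node
    // overhead, matching the `entries * (size_of::<u32>() + 4)` estimate above.
    let total_key_size = entries * (std::mem::size_of::<u32>() + 4);
    let total_value_size = total_size - total_key_size;

    println!("estimated total: {total_size} B, keys: {total_key_size} B, values: {total_value_size} B");
    // => estimated total: 253952 B, keys: 40000 B, values: 213952 B
}
```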
@@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
     InvalidGeoField(#[from] GeoError),
     #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
     InvalidVectorDimensions { expected: usize, found: usize },
+    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
+    InvalidIndexingVectorDimensions {
+        embedder_name: String,
+        document_id: String,
+        embedding_index: usize,
+        expected: usize,
+        found: usize,
+    },
     #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
     InvalidVectorsMapType { document_id: String, value: Value },
     #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
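For illustration, the new variant's display format can be exercised on its own with the thiserror crate; the field values below are taken from the test snapshot earlier in this diff, and the `Index \`doggo\`: ` prefix seen there is added by an outer error wrapper, not by this variant:

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum UserError {
    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
    InvalidIndexingVectorDimensions {
        embedder_name: String,
        document_id: String,
        embedding_index: usize,
        expected: usize,
        found: usize,
    },
}

fn main() {
    let err = UserError::InvalidIndexingVectorDimensions {
        embedder_name: "manual".into(),
        document_id: "0".into(),
        embedding_index: 0,
        expected: 3,
        found: 2,
    };
    // Prints the message body that appears in the failed-task snapshot.
    println!("{err}");
}
```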
@@ -411,38 +411,6 @@ impl Index {
         Ok(count.unwrap_or_default())
     }

-    /// Updates the stats of the documents database based on the previous stats and the modified docids.
-    pub fn update_documents_stats(
-        &self,
-        wtxn: &mut RwTxn<'_>,
-        modified_docids: roaring::RoaringBitmap,
-    ) -> Result<()> {
-        let before_rtxn = self.read_txn()?;
-        let document_stats = match self.documents_stats(&before_rtxn)? {
-            Some(before_stats) => DatabaseStats::recompute(
-                before_stats,
-                self.documents.remap_types(),
-                &before_rtxn,
-                wtxn,
-                modified_docids.iter().map(|docid| docid.to_be_bytes()),
-            )?,
-            None => {
-                // This should never happen when there are already documents in the index, the documents stats should be present.
-                // If it happens, it means that the index was not properly initialized/upgraded.
-                debug_assert_eq!(
-                    self.documents.len(&before_rtxn)?,
-                    0,
-                    "The documents stats should be present when there are documents in the index"
-                );
-                tracing::warn!("No documents stats found, creating new ones");
-                DatabaseStats::new(self.documents.remap_types(), &*wtxn)?
-            }
-        };
-
-        self.put_documents_stats(wtxn, document_stats)?;
-        Ok(())
-    }
-
     /// Writes the stats of the documents database.
     pub fn put_documents_stats(
         &self,
@@ -173,16 +173,18 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             ranking_rule_scores.push(ScoreDetails::Skipped);

             // remove candidates from the universe without adding them to result if their score is below the threshold
-            if let Some(ranking_score_threshold) = ranking_score_threshold {
-                let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
-                if current_score < ranking_score_threshold {
-                    all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
-                    back!();
-                    continue;
-                }
-            }
+            let is_below_threshold =
+                ranking_score_threshold.is_some_and(|ranking_score_threshold| {
+                    let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
+                    current_score < ranking_score_threshold
+                });

-            maybe_add_to_results!(bucket);
+            if is_below_threshold {
+                all_candidates -= &bucket;
+                all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
+            } else {
+                maybe_add_to_results!(bucket);
+            }

             ranking_rule_scores.pop();
@@ -237,23 +239,24 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         );

         // remove candidates from the universe without adding them to result if their score is below the threshold
-        if let Some(ranking_score_threshold) = ranking_score_threshold {
+        let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
             let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
-            if current_score < ranking_score_threshold {
-                all_candidates -=
-                    next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
-                back!();
-                continue;
-            }
-        }
+            current_score < ranking_score_threshold
+        });

         ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;

         if cur_ranking_rule_index == ranking_rules_len - 1
             || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
             || cur_offset + (next_bucket.candidates.len() as usize) < from
+            || is_below_threshold
         {
-            maybe_add_to_results!(next_bucket.candidates);
+            if is_below_threshold {
+                all_candidates -= &next_bucket.candidates;
+                all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
+            } else {
+                maybe_add_to_results!(next_bucket.candidates);
+            }
             ranking_rule_scores.pop();
             continue;
         }
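Both bucket_sort hunks replace an early `continue` inside `if let Some(..)` with a boolean computed via `Option::is_some_and`, so the below-threshold case can share the normal bookkeeping path. A standalone sketch of just that idiom (the names are stand-ins):

```rust
// Instead of branching inside `if let Some(..)`, the threshold test collapses
// to a single boolean that callers can combine with other conditions.
fn is_below_threshold(score: f64, threshold: Option<f64>) -> bool {
    threshold.is_some_and(|t| score < t)
}

fn main() {
    assert!(!is_below_threshold(0.9, None));     // no threshold: never below
    assert!(is_below_threshold(0.4, Some(0.5))); // below the cutoff
    assert!(!is_below_threshold(0.6, Some(0.5))); // at or above the cutoff
}
```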
@@ -28,6 +28,7 @@ pub use self::helpers::*;
 pub use self::transform::{Transform, TransformOutput};
 use super::facet::clear_facet_levels_based_on_settings_diff;
 use super::new::StdResult;
+use crate::database_stats::DatabaseStats;
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError};
 use crate::index::{PrefixSearch, PrefixSettings};
@@ -476,7 +477,8 @@ where

         if !settings_diff.settings_update_only {
             // Update the stats of the documents database when there is a document update.
-            self.index.update_documents_stats(self.wtxn, modified_docids)?;
+            let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?;
+            self.index.put_documents_stats(self.wtxn, stats)?;
         }
         // We write the field distribution into the main database
         self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                 // do we have set embeddings?
                 if let Some(embeddings) = new_vectors.embeddings {
                     chunks.set_vectors(
+                        update.external_document_id(),
                         update.docid(),
                         embeddings
                             .into_vec(&context.doc_alloc, embedder_name)
@@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                                 document_id: update.external_document_id().to_string(),
                                 error: error.to_string(),
                             })?,
-                    );
+                    )?;
                 } else if new_vectors.regenerate {
                     let new_rendered = prompt.render_document(
                         update.external_document_id(),
@@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                 chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
                 if let Some(embeddings) = new_vectors.embeddings {
                     chunks.set_vectors(
+                        insertion.external_document_id(),
                         insertion.docid(),
                         embeddings
                             .into_vec(&context.doc_alloc, embedder_name)
@@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                                     .to_string(),
                                 error: error.to_string(),
                             })?,
-                    );
+                    )?;
                 } else if new_vectors.regenerate {
                     let rendered = prompt.render_document(
                         insertion.external_document_id(),
@@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
     embedder: &'a Embedder,
     embedder_id: u8,
     embedder_name: &'a str,
+    dimensions: usize,
     prompt: &'a Prompt,
     possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
     user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
@@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
         let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
         let texts = BVec::with_capacity_in(capacity, doc_alloc);
         let ids = BVec::with_capacity_in(capacity, doc_alloc);
+        let dimensions = embedder.dimensions();
         Self {
             texts,
             ids,
@@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
             embedder_name,
             user_provided,
             has_manual_generation: None,
+            dimensions,
         }
     }
@@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
         }
     }

-    fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
+    fn set_vectors(
+        &self,
+        external_docid: &'a str,
+        docid: DocumentId,
+        embeddings: Vec<Embedding>,
+    ) -> Result<()> {
+        for (embedding_index, embedding) in embeddings.iter().enumerate() {
+            if embedding.len() != self.dimensions {
+                return Err(UserError::InvalidIndexingVectorDimensions {
+                    expected: self.dimensions,
+                    found: embedding.len(),
+                    embedder_name: self.embedder_name.to_string(),
+                    document_id: external_docid.to_string(),
+                    embedding_index,
+                }
+                .into());
+            }
+        }
         self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
+        Ok(())
     }
 }
@@ -234,7 +234,6 @@ where
         embedders,
         field_distribution,
         document_ids,
-        modified_docids,
     )?;

     Ok(congestion)
@@ -7,12 +7,13 @@ use itertools::{merge_join_by, EitherOrBoth};
 use super::document_changes::IndexingContext;
 use crate::facet::FacetType;
 use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
+use crate::progress::Progress;
 use crate::update::del_add::DelAdd;
 use crate::update::facet::new_incremental::FacetsUpdateIncremental;
 use crate::update::facet::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 use crate::update::new::facet_search_builder::FacetSearchBuilder;
 use crate::update::new::merger::FacetFieldIdDelta;
-use crate::update::new::steps::IndexingStep;
+use crate::update::new::steps::{IndexingStep, PostProcessingFacets, PostProcessingWords};
 use crate::update::new::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
 use crate::update::new::words_prefix_docids::{
     compute_exact_word_prefix_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids,
@@ -33,11 +34,23 @@
 {
     let index = indexing_context.index;
     indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
-    compute_facet_level_database(index, wtxn, facet_field_ids_delta, &mut global_fields_ids_map)?;
-    compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
+    compute_facet_level_database(
+        index,
+        wtxn,
+        facet_field_ids_delta,
+        &mut global_fields_ids_map,
+        indexing_context.progress,
+    )?;
+    compute_facet_search_database(index, wtxn, global_fields_ids_map, indexing_context.progress)?;
     indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
-    if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
-        compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?;
+    if let Some(prefix_delta) = compute_word_fst(index, wtxn, indexing_context.progress)? {
+        compute_prefix_database(
+            index,
+            wtxn,
+            prefix_delta,
+            indexing_context.grenad_parameters,
+            indexing_context.progress,
+        )?;
     };
     Ok(())
 }
@@ -48,21 +61,32 @@ fn compute_prefix_database(
     wtxn: &mut RwTxn,
     prefix_delta: PrefixDelta,
     grenad_parameters: &GrenadParameters,
+    progress: &Progress,
 ) -> Result<()> {
     let PrefixDelta { modified, deleted } = prefix_delta;
-    // Compute word prefix docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixDocids);
     compute_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute exact word prefix docids
+
+    progress.update_progress(PostProcessingWords::ExactWordPrefixDocids);
     compute_exact_word_prefix_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute word prefix fid docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixFieldIdDocids);
     compute_word_prefix_fid_docids(wtxn, index, &modified, &deleted, grenad_parameters)?;
-    // Compute word prefix position docids
+
+    progress.update_progress(PostProcessingWords::WordPrefixPositionDocids);
     compute_word_prefix_position_docids(wtxn, index, &modified, &deleted, grenad_parameters)
 }

 #[tracing::instrument(level = "trace", skip_all, target = "indexing")]
-fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result<Option<PrefixDelta>> {
+fn compute_word_fst(
+    index: &Index,
+    wtxn: &mut RwTxn,
+    progress: &Progress,
+) -> Result<Option<PrefixDelta>> {
     let rtxn = index.read_txn()?;
+    progress.update_progress(PostProcessingWords::WordFst);

     let words_fst = index.words_fst(&rtxn)?;
     let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
     let prefix_settings = index.prefix_settings(&rtxn)?;
@@ -112,8 +136,10 @@ fn compute_facet_search_database(
     index: &Index,
     wtxn: &mut RwTxn,
     global_fields_ids_map: GlobalFieldsIdsMap,
+    progress: &Progress,
 ) -> Result<()> {
     let rtxn = index.read_txn()?;
+    progress.update_progress(PostProcessingFacets::FacetSearch);

     // if the facet search is not enabled, we can skip the rest of the function
     if !index.facet_search(wtxn)? {
@@ -171,10 +197,16 @@ fn compute_facet_level_database(
     wtxn: &mut RwTxn,
     mut facet_field_ids_delta: FacetFieldIdsDelta,
     global_fields_ids_map: &mut GlobalFieldsIdsMap,
+    progress: &Progress,
 ) -> Result<()> {
     let rtxn = index.read_txn()?;

     let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?;
-    for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() {
+    let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_string_delta().collect();
+    // We move all bulks at the front and incrementals (others) at the end.
+    deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
+
+    for (fid, delta) in deltas {
         // skip field ids that should not be facet leveled
         let Some(metadata) = global_fields_ids_map.metadata(fid) else {
             continue;
@@ -187,11 +219,13 @@ fn compute_facet_level_database(
         let _entered = span.enter();
         match delta {
             FacetFieldIdDelta::Bulk => {
+                progress.update_progress(PostProcessingFacets::StringsBulk);
                 tracing::debug!(%fid, "bulk string facet processing");
                 FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::String)
                     .execute(wtxn)?
             }
             FacetFieldIdDelta::Incremental(delta_data) => {
+                progress.update_progress(PostProcessingFacets::StringsIncremental);
                 tracing::debug!(%fid, len=%delta_data.len(), "incremental string facet processing");
                 FacetsUpdateIncremental::new(
                     index,
@@ -207,16 +241,22 @@ fn compute_facet_level_database(
         }
     }

-    for (fid, delta) in facet_field_ids_delta.consume_facet_number_delta() {
+    let mut deltas: Vec<_> = facet_field_ids_delta.consume_facet_number_delta().collect();
+    // We move all bulks at the front and incrementals (others) at the end.
+    deltas.sort_by_key(|(_, delta)| if let FacetFieldIdDelta::Bulk = delta { 0 } else { 1 });
+
+    for (fid, delta) in deltas {
         let span = tracing::trace_span!(target: "indexing::facet_field_ids", "number");
         let _entered = span.enter();
         match delta {
             FacetFieldIdDelta::Bulk => {
+                progress.update_progress(PostProcessingFacets::NumbersBulk);
                 tracing::debug!(%fid, "bulk number facet processing");
                 FacetsUpdateBulk::new_not_updating_level_0(index, vec![fid], FacetType::Number)
                     .execute(wtxn)?
             }
             FacetFieldIdDelta::Incremental(delta_data) => {
+                progress.update_progress(PostProcessingFacets::NumbersIncremental);
                 tracing::debug!(%fid, len=%delta_data.len(), "incremental number facet processing");
                 FacetsUpdateIncremental::new(
                     index,
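Both facet loops above now collect their deltas and sort bulk entries to the front. Because `sort_by_key` is stable, incremental deltas keep their relative order; a small self-contained sketch, with a made-up `Delta` type standing in for `FacetFieldIdDelta`:

```rust
// Hypothetical stand-in for FacetFieldIdDelta, for illustration only.
#[derive(Debug)]
enum Delta {
    Bulk,
    Incremental(usize),
}

fn main() {
    let mut deltas = vec![
        (1u16, Delta::Incremental(3)),
        (2, Delta::Bulk),
        (3, Delta::Incremental(1)),
        (4, Delta::Bulk),
    ];
    // Stable sort: bulk entries move to the front, everything else keeps its order.
    deltas.sort_by_key(|(_, delta)| if matches!(delta, Delta::Bulk) { 0 } else { 1 });
    println!("{deltas:?}"); // fids 2 and 4 first, then 1 and 3
}
```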
@@ -7,6 +7,7 @@ use rand::SeedableRng as _;
 use time::OffsetDateTime;

 use super::super::channel::*;
+use crate::database_stats::DatabaseStats;
 use crate::documents::PrimaryKey;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
 use crate::index::IndexEmbeddingConfig;
@@ -142,7 +143,6 @@ pub(super) fn update_index(
     embedders: EmbeddingConfigs,
     field_distribution: std::collections::BTreeMap<String, u64>,
     document_ids: roaring::RoaringBitmap,
-    modified_docids: roaring::RoaringBitmap,
 ) -> Result<()> {
     index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
     if let Some(new_primary_key) = new_primary_key {
@@ -153,7 +153,8 @@ pub(super) fn update_index(
     index.put_field_distribution(wtxn, &field_distribution)?;
     index.put_documents_ids(wtxn, &document_ids)?;
     index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
-    index.update_documents_stats(wtxn, modified_docids)?;
+    let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?;
+    index.put_documents_stats(wtxn, stats)?;
     Ok(())
 }
@@ -20,3 +20,23 @@ make_enum_progress! {
         Finalizing,
     }
 }
+
+make_enum_progress! {
+    pub enum PostProcessingFacets {
+        StringsBulk,
+        StringsIncremental,
+        NumbersBulk,
+        NumbersIncremental,
+        FacetSearch,
+    }
+}
+
+make_enum_progress! {
+    pub enum PostProcessingWords {
+        WordFst,
+        WordPrefixDocids,
+        ExactWordPrefixDocids,
+        WordPrefixFieldIdDocids,
+        WordPrefixPositionDocids,
+    }
+}
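A hand-rolled sketch of how such step enums typically drive progress reporting. The real `make_enum_progress!` macro's expansion is not shown in this diff, so the impl below is an assumption for illustration only:

```rust
// Hypothetical, hand-written equivalent of an enum-driven progress step;
// the actual macro-generated code in meilisearch may differ.
#[derive(Debug, Clone, Copy)]
enum PostProcessingFacets {
    StringsBulk,
    StringsIncremental,
    NumbersBulk,
    NumbersIncremental,
    FacetSearch,
}

impl PostProcessingFacets {
    const COUNT: u32 = 5;

    // Reporting "step k of n" only needs the discriminant and the total.
    fn report(&self) {
        println!("post-processing facets: step {}/{}", *self as u32 + 1, Self::COUNT);
    }
}

fn main() {
    PostProcessingFacets::StringsBulk.report(); // step 1/5
    PostProcessingFacets::FacetSearch.report(); // step 5/5
}
```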