Change implementation of MergedDocuments::iter_top_level_fields

Merge #5131
5131: Ignore documents whose selected fields didn't change r=dureuill a=dureuill Attempts to improve the new indexer performance by ignoring documents whose selected fields didn't change: - Add `Update::has_changed_for_fields` function - Ignore documents whose searchable attributes didn't change for word docids and word pair proximity extraction - Ignore documents whose faceted attributes didn't change for facet extraction Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2025-07-22 06:11:01 +00:00 · 2024-12-09 09:38:21 +01:00 · 2024-12-05 16:04:16 +00:00 · 2024-12-05 15:21:55 +00:00 · 2024-12-05 16:13:07 +01:00 · 2024-12-05 16:12:52 +01:00
29 changed files with 229 additions and 52 deletions
--- a/crates/index-scheduler/src/batch.rs
+++ b/crates/index-scheduler/src/batch.rs
@ -497,7 +497,6 @@ impl IndexScheduler {
        // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task.
        let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) };
        let mut task = self.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
-        current_batch.processing(Some(&mut task));

        // If the task is not associated with any index, verify that it is an index swap and
        // create the batch directly. Otherwise, get the index name associated with the task
@ -507,6 +506,7 @@ impl IndexScheduler {
            index_name
        } else {
            assert!(matches!(&task.kind, KindWithContent::IndexSwap { swaps } if swaps.is_empty()));
+            current_batch.processing(Some(&mut task));
            return Ok(Some((Batch::IndexSwap { task }, current_batch)));
        };

--- a/crates/index-scheduler/src/lib.rs
+++ b/crates/index-scheduler/src/lib.rs
@ -4319,10 +4319,35 @@ mod tests {
        let proc = index_scheduler.processing_tasks.read().unwrap().clone();

        let query = Query { statuses: Some(vec![Status::Processing]), ..Default::default() };
-        let (batches, _) = index_scheduler
-            .get_batch_ids_from_authorized_indexes(&rtxn, &proc, &query, &AuthFilter::default())
+        let (mut batches, _) = index_scheduler
+            .get_batches_from_authorized_indexes(query.clone(), &AuthFilter::default())
            .unwrap();
-        snapshot!(snapshot_bitmap(&batches), @"[0,]"); // only the processing batch in the first tick
+        assert_eq!(batches.len(), 1);
+        batches[0].started_at = OffsetDateTime::UNIX_EPOCH;
+        // Insta cannot snapshot our batches because the batch stats contains an enum as key: https://github.com/mitsuhiko/insta/issues/689
+        let batch = serde_json::to_string_pretty(&batches[0]).unwrap();
+        snapshot!(batch, @r#"
+        {
+          "uid": 0,
+          "details": {
+            "primaryKey": "mouse"
+          },
+          "stats": {
+            "totalNbTasks": 1,
+            "status": {
+              "processing": 1
+            },
+            "types": {
+              "indexCreation": 1
+            },
+            "indexUids": {
+              "catto": 1
+            }
+          },
+          "startedAt": "1970-01-01T00:00:00Z",
+          "finishedAt": null
+        }
+        "#);

        let query = Query { statuses: Some(vec![Status::Enqueued]), ..Default::default() };
        let (batches, _) = index_scheduler
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/aborted_indexation.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(1):
 [1,]
-{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, }
+{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_mix_of_tasks/processing_second_task_cancel_enqueued.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(1):
 [1,]
-{uid: 1, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"beavero":2}}, }
+{uid: 1, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"beavero":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, batch_uid: 0, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"dumpCreation":1},"indexUids":{}}, }
+{uid: 0, details: {"dumpUid":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"dumpCreation":1},"indexUids":{}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/aborted_indexation.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/cancel_task_registered.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/cancel_processing_task/initial_task_processing.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"catto":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"catto":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "catto", primary_key: None, method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/document_addition/after_the_batch_creation.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_process_batch_for_document_addition/document_addition_batch_created.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_batch_succeeded.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/fail_in_update_task_after_process_batch_success_for_document_addition/after_failing_to_commit.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"receivedDocuments":2,"indexedDocuments":null}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"documentAdditionOrUpdate":2},"indexUids":{"doggos":2}}, }
+{uid: 0, details: {"receivedDocuments":1,"indexedDocuments":null}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"documentAdditionOrUpdate":1},"indexUids":{"doggos":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/after_batch_creation.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, }
+{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_second_task.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, }
+{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/insert_task_while_another_task_is_processing/registered_the_third_task.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [0,]
-{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"index_a":2}}, }
+{uid: 0, details: {"primaryKey":"id"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"index_a":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { primary_key: Some("id") }, kind: IndexCreation { index_uid: "index_a", primary_key: Some("id") }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/query_batches_simple/after-advancing-a-bit.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(1):
 [1,]
-{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":2,"status":{"enqueued":2},"types":{"indexCreation":2},"indexUids":{"doggo":2}}, }
+{uid: 1, details: {"primaryKey":"sheep"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, batch_uid: 0, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
--- a/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap
+++ b/crates/index-scheduler/src/snapshots/lib.rs/task_deletion_undeleteable/task_deletion_processing.snap
@ -5,7 +5,7 @@ snapshot_kind: text
 ### Autobatching Enabled = true
 ### Processing batch Some(0):
 [3,]
-{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"enqueued":1},"types":{"taskDeletion":1},"indexUids":{}}, }
+{uid: 0, details: {"matchedTasks":2,"deletedTasks":null,"originalFilter":"test_query"}, stats: {"totalNbTasks":1,"status":{"processing":1},"types":{"taskDeletion":1},"indexUids":{}}, }
 ----------------------------------------------------------------------
 ### All Tasks:
 0 {uid: 0, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }}
--- a/crates/index-scheduler/src/utils.rs
+++ b/crates/index-scheduler/src/utils.rs
@ -67,7 +67,7 @@ impl ProcessingBatch {
            task.batch_uid = Some(self.uid);
            // We don't store the statuses in the map since they're all enqueued but we must
            // still store them in the stats since that can be displayed.
-            *self.stats.status.entry(task.status).or_default() += 1;
+            *self.stats.status.entry(Status::Processing).or_default() += 1;

            self.kinds.insert(task.kind.as_kind());
            *self.stats.types.entry(task.kind.as_kind()).or_default() += 1;
--- a/crates/meilisearch-types/src/error.rs
+++ b/crates/meilisearch-types/src/error.rs
@ -279,6 +279,7 @@ InvalidSearchPage                     , InvalidRequest       , BAD_REQUEST ;
 InvalidSearchQ                        , InvalidRequest       , BAD_REQUEST ;
 InvalidFacetSearchQuery               , InvalidRequest       , BAD_REQUEST ;
 InvalidFacetSearchName                , InvalidRequest       , BAD_REQUEST ;
+FacetSearchDisabled                   , InvalidRequest       , BAD_REQUEST ;
 InvalidSearchVector                   , InvalidRequest       , BAD_REQUEST ;
 InvalidSearchShowMatchesPosition      , InvalidRequest       , BAD_REQUEST ;
 InvalidSearchShowRankingScore         , InvalidRequest       , BAD_REQUEST ;
--- a/crates/meilisearch/src/option.rs
+++ b/crates/meilisearch/src/option.rs
@ -654,8 +654,9 @@ impl Opt {

 #[derive(Debug, Default, Clone, Parser, Deserialize)]
 pub struct IndexerOpts {
-    /// Sets the maximum amount of RAM Meilisearch can use when indexing. By default, Meilisearch
-    /// uses no more than two thirds of available memory.
+    /// Specifies the maximum resident memory that Meilisearch can use for indexing.
+    /// By default, Meilisearch limits the RAM usage to 5% of the total available memory.
+    /// Note that the underlying store utilizes memory-mapping and makes use of the rest.
    #[clap(long, env = MEILI_MAX_INDEXING_MEMORY, default_value_t)]
    #[serde(default)]
    pub max_indexing_memory: MaxMemory,
@ -714,7 +715,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
    }
 }

-/// A type used to detect the max memory available and use 2/3 of it.
+/// A type used to detect the max resident memory available and use 5% of it.
 #[derive(Debug, Clone, Copy, Deserialize, Serialize)]
 pub struct MaxMemory(Option<Byte>);

@ -728,7 +729,7 @@ impl FromStr for MaxMemory {

 impl Default for MaxMemory {
    fn default() -> MaxMemory {
-        MaxMemory(total_memory_bytes().map(|bytes| bytes * 2 / 3).map(Byte::from_u64))
+        MaxMemory(total_memory_bytes().map(|bytes| bytes * 5 / 100).map(Byte::from_u64))
    }
 }

--- a/crates/meilisearch/src/search/mod.rs
+++ b/crates/meilisearch/src/search/mod.rs
@ -1407,6 +1407,13 @@ pub fn perform_facet_search(
        None => TimeBudget::default(),
    };

+    if !index.facet_search(&rtxn)? {
+        return Err(ResponseError::from_msg(
+            "The facet search is disabled for this index".to_string(),
+            Code::FacetSearchDisabled,
+        ));
+    }
+
    // In the faceted search context, we want to use the intersection between the locales provided by the user
    // and the locales of the facet string.
    // If the facet string is not localized, we **ignore** the locales provided by the user because the facet data has no locale.
--- a/crates/meilisearch/tests/common/mod.rs
+++ b/crates/meilisearch/tests/common/mod.rs
@ -52,6 +52,25 @@ impl Value {
        }
        self
    }
+
+    /// Return `true` if the `status` field is set to `failed`.
+    /// Panic if the `status` field doesn't exists.
+    #[track_caller]
+    pub fn is_fail(&self) -> bool {
+        if !self["status"].is_string() {
+            panic!("Called `is_fail` on {}", serde_json::to_string_pretty(&self.0).unwrap());
+        }
+        self["status"] == serde_json::Value::String(String::from("failed"))
+    }
+
+    // Panic if the json doesn't contain the `status` field set to "succeeded"
+    #[track_caller]
+    pub fn failed(&self) -> &Self {
+        if !self.is_fail() {
+            panic!("Called failed on {}", serde_json::to_string_pretty(&self.0).unwrap());
+        }
+        self
+    }
 }

 impl From<serde_json::Value> for Value {
--- a/crates/meilisearch/tests/search/facet_search.rs
+++ b/crates/meilisearch/tests/search/facet_search.rs
@ -221,8 +221,15 @@ async fn add_documents_and_deactivate_facet_search() {
    let (response, code) =
        index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;

-    assert_eq!(code, 200, "{}", response);
-    assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0);
+    assert_eq!(code, 400, "{}", response);
+    snapshot!(response, @r###"
+    {
+      "message": "The facet search is disabled for this index",
+      "code": "facet_search_disabled",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#facet_search_disabled"
+    }
+    "###);
 }

 #[actix_rt::test]
@ -245,8 +252,15 @@ async fn deactivate_facet_search_and_add_documents() {
    let (response, code) =
        index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;

-    assert_eq!(code, 200, "{}", response);
-    assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 0);
+    assert_eq!(code, 400, "{}", response);
+    snapshot!(response, @r###"
+    {
+      "message": "The facet search is disabled for this index",
+      "code": "facet_search_disabled",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#facet_search_disabled"
+    }
+    "###);
 }

 #[actix_rt::test]
--- a/crates/meilisearch/tests/snapshot/mod.rs
+++ b/crates/meilisearch/tests/snapshot/mod.rs
@ -129,11 +129,11 @@ async fn perform_on_demand_snapshot() {

    index.load_test_set().await;

-    server.index("doggo").create(Some("bone")).await;
-    index.wait_task(2).await;
+    let (task, _) = server.index("doggo").create(Some("bone")).await;
+    index.wait_task(task.uid()).await.succeeded();

-    server.index("doggo").create(Some("bone")).await;
-    index.wait_task(2).await;
+    let (task, _) = server.index("doggo").create(Some("bone")).await;
+    index.wait_task(task.uid()).await.failed();

    let (task, code) = server.create_snapshot().await;
    snapshot!(code, @"202 Accepted");
--- a/crates/milli/src/update/new/document.rs
+++ b/crates/milli/src/update/new/document.rs
@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};

+use either::Either;
 use heed::RoTxn;
 use raw_collections::RawMap;
 use serde_json::value::RawValue;
@ -209,11 +210,13 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
    for MergedDocument<'d, 'doc, 't, Mapper>
 {
    fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
+        match &self.db {
+            Some(db) => {
                let mut new_doc_it = self.new_doc.iter_top_level_fields();
-        let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields());
+                let mut db_it = db.iter_top_level_fields();
                let mut seen_fields = BTreeSet::new();

-        std::iter::from_fn(move || {
+                Either::Left(std::iter::from_fn(move || {
                    if let Some(next) = new_doc_it.next() {
                        if let Ok((name, _)) = next {
                            seen_fields.insert(name);
@ -231,7 +234,10 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
                            Err(err) => return Some(Err(err)),
                        }
                    }
-        })
+                }))
+            }
+            None => Either::Right(self.new_doc.iter_top_level_fields()),
+        }
    }

    fn vectors_field(&self) -> Result<Option<&'d RawValue>> {
--- a/crates/milli/src/update/new/document_change.rs
+++ b/crates/milli/src/update/new/document_change.rs
@ -1,7 +1,10 @@
 use bumpalo::Bump;
 use heed::RoTxn;

-use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions};
+use super::document::{
+    Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
+};
+use super::extract::perm_json_p;
 use super::vector_document::{
    MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
 };
@ -164,6 +167,80 @@ impl<'doc> Update<'doc> {
        }
    }

+    /// Returns whether the updated version of the document is different from the current version for the passed subset of fields.
+    ///
+    /// `true` if at least one top-level-field that is a exactly a member of field or a parent of a member of field changed.
+    /// Otherwise `false`.
+    pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>(
+        &self,
+        fields: Option<&[&str]>,
+        rtxn: &'t RoTxn,
+        index: &'t Index,
+        mapper: &'t Mapper,
+    ) -> Result<bool> {
+        let mut changed = false;
+        let mut cached_current = None;
+        let mut updated_selected_field_count = 0;
+
+        for entry in self.updated().iter_top_level_fields() {
+            let (key, updated_value) = entry?;
+
+            if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
+                continue;
+            }
+
+            updated_selected_field_count += 1;
+            let current = match cached_current {
+                Some(current) => current,
+                None => self.current(rtxn, index, mapper)?,
+            };
+            let current_value = current.top_level_field(key)?;
+            let Some(current_value) = current_value else {
+                changed = true;
+                break;
+            };
+
+            if current_value.get() != updated_value.get() {
+                changed = true;
+                break;
+            }
+            cached_current = Some(current);
+        }
+
+        if !self.has_deletion {
+            // no field deletion, so fields that don't appear in `updated` cannot have changed
+            return Ok(changed);
+        }
+
+        if changed {
+            return Ok(true);
+        }
+
+        // we saw all updated fields, and set `changed` if any field wasn't in `current`.
+        // so if there are as many fields in `current` as in `updated`, then nothing changed.
+        // If there is any more fields in `current`, then they are missing in `updated`.
+        let has_deleted_fields = {
+            let current = match cached_current {
+                Some(current) => current,
+                None => self.current(rtxn, index, mapper)?,
+            };
+
+            let mut current_selected_field_count = 0;
+            for entry in current.iter_top_level_fields() {
+                let (key, _) = entry?;
+
+                if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip {
+                    continue;
+                }
+                current_selected_field_count += 1;
+            }
+
+            current_selected_field_count != updated_selected_field_count
+        };
+
+        Ok(has_deleted_fields)
+    }
+
    pub fn updated_vectors(
        &self,
        doc_alloc: &'doc Bump,
--- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs
+++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs
@ -97,6 +97,15 @@ impl FacetedDocidsExtractor {
                },
            ),
            DocumentChange::Update(inner) => {
+                if !inner.has_changed_for_fields(
+                    Some(attributes_to_extract),
+                    rtxn,
+                    index,
+                    context.db_fields_ids_map,
+                )? {
+                    return Ok(());
+                }
+
                extract_document_facets(
                    attributes_to_extract,
                    inner.current(rtxn, index, context.db_fields_ids_map)?,
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -351,6 +351,15 @@ impl WordDocidsExtractors {
                )?;
            }
            DocumentChange::Update(inner) => {
+                if !inner.has_changed_for_fields(
+                    document_tokenizer.attribute_to_extract,
+                    &context.rtxn,
+                    context.index,
+                    context.db_fields_ids_map,
+                )? {
+                    return Ok(());
+                }
+
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
                    cached_sorter.insert_del_u32(
                        fid,
--- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
                )?;
            }
            DocumentChange::Update(inner) => {
+                if !inner.has_changed_for_fields(
+                    document_tokenizer.attribute_to_extract,
+                    rtxn,
+                    index,
+                    context.db_fields_ids_map,
+                )? {
+                    return Ok(());
+                }
+
                let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
                process_document_tokens(
                    document,
Author	SHA1	Message	Date
Louis Dureuil	1cf14e765f	Change implementation of MergedDocuments::iter_top_level_fields	2024-12-09 09:38:21 +01:00
meili-bors[bot]	4a082683df	Merge #5131 Some checks failed Test suite / Tests on ${{ matrix.os }} (windows-2022) (push) Failing after 21s Test suite / Tests on ubuntu-20.04 (push) Failing after 10s Test suite / Tests almost all features (push) Has been skipped Test suite / Test disabled tokenization (push) Has been skipped Test suite / Run tests in debug (push) Failing after 10s Test suite / Run Rustfmt (push) Successful in 1m25s Test suite / Run Clippy (push) Successful in 5m54s Test suite / Tests on ${{ matrix.os }} (macos-13) (push) Has been cancelled 5131: Ignore documents whose selected fields didn't change r=dureuill a=dureuill Attempts to improve the new indexer performance by ignoring documents whose selected fields didn't change: - Add `Update::has_changed_for_fields` function - Ignore documents whose searchable attributes didn't change for word docids and word pair proximity extraction - Ignore documents whose faceted attributes didn't change for facet extraction Co-authored-by: Louis Dureuil <louis@meilisearch.com>	2024-12-05 16:04:16 +00:00
meili-bors[bot]	26be5e0733	Merge #5123 5123: Fix batch details r=dureuill a=irevoire # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/5079 Fixes https://github.com/meilisearch/meilisearch/issues/5112 ## What does this PR do? - Make the processing tasks actually processing in the stats of the batch instead of enqueued - Stop counting one extra task for all non-prioritized batches in the stats - Add a test Co-authored-by: Tamo <tamo@meilisearch.com>	2024-12-05 15:21:55 +00:00
Louis Dureuil	bd5110a2fe	Fix clippy warnings	2024-12-05 16:13:07 +01:00
Louis Dureuil	fa8b9acdf6	Ignore documents that didn't change in facets	2024-12-05 16:12:52 +01:00
Louis Dureuil	2b74d1824b	Ignore documents that didn't change any field in word pair proximity	2024-12-05 15:56:22 +01:00
Louis Dureuil	c77b00d3ac	Don't extract word docids when no searchable changed	2024-12-05 15:51:58 +01:00
Louis Dureuil	c77073efcc	Update::has_changed_for_fields	2024-12-05 15:50:12 +01:00
meili-bors[bot]	1537323eb9	Merge #5119 5119: Settings opt out error msg r=Kerollmops a=ManyTheFish # Pull Request ## Related issue PRD: https://meilisearch.notion.site/API-usage-Settings-to-opt-out-indexing-features-fff4b06b651f8108ade3f858aeb16b14?pvs=4 ## What does this PR do? Add a new error code and message when the user tries a facet search on an index where the facet search is disabled: ```json { "message": "The facet search is disabled for this index", "code": "facet_search_disabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_facet_search_disabled" } ``` Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-12-05 13:51:11 +00:00
ManyTheFish	a0a3b55700	Change error code	2024-12-05 14:48:29 +01:00
Tamo	214b51de87	try to fix the snapshot on demand flaky test	2024-12-05 14:45:54 +01:00
Tamo	95975944d7	fix the dumps missing the empty swap index tasks	2024-12-05 14:23:38 +01:00
meili-bors[bot]	9a9383643f	Merge #5125 Some checks failed Test suite / Tests on ${{ matrix.os }} (macos-13) (push) Waiting to run Test suite / Tests on ${{ matrix.os }} (windows-2022) (push) Failing after 37s Test suite / Tests on ubuntu-20.04 (push) Failing after 15s Test suite / Tests almost all features (push) Has been skipped Test suite / Test disabled tokenization (push) Has been skipped Test suite / Run tests in debug (push) Failing after 12s Test suite / Run Rustfmt (push) Successful in 2m14s Test suite / Run Clippy (push) Successful in 12m4s 5125: Change the default max memory usage to 5% of the total memory r=ManyTheFish a=Kerollmops After thorough testing, we found that giving 5% of the total available memory to allocate resident memory (caches and channels) is the best approach. The main reason is that the new indexer is highly memory-map oriented, with LMDB, and reads the database while performing the indexation. So, by allowing the maximum amount of memory available to LMDB and the OS, it will perform the key-value store reads and all other indexation operations faster by keeping more pages hot in the cache. In #5124, we also sorted the entries to merge to improve the read speed of LMDB. This is common in database management systems: Reading stuff on the disk is much faster when done in lexicographic order (the default sorted order of key values). The entries have a great chance of already being in the OS memory cache, as they were loaded in a previous read, and reading stuff on the disk is very slow compared to reading memory. Co-authored-by: Kerollmops <clement@meilisearch.com>	2024-12-05 10:11:25 +00:00
Kerollmops	9020a50df8	Change the default max memory usage to 5% of the total memory	2024-12-05 10:14:46 +01:00
Tamo	7a2af06b1e	update the impacted snapshots	2024-12-04 15:52:24 +01:00
Tamo	cb0c3a5aad	stop adding one enqueued tasks to all unprioritized batches	2024-12-04 15:48:28 +01:00
Tamo	cbcf6c9ba3	make the processing tasks as processing in a batch	2024-12-04 14:48:48 +01:00
Tamo	bf742d81cf	add a test	2024-12-04 14:47:02 +01:00
ManyTheFish	fc1df5793c	fix tests	2024-12-04 14:35:20 +01:00
ManyTheFish	953a82ca04	Add new error message	2024-12-04 11:15:29 +01:00