Compare commits


6 Commits

Author / SHA1 / Message / Date

Clément Renault
cf62af13e8  Merge pull request #6005 from meilisearch/clamp-max-batch-size
            Clamp max batch size to 10 GiB
2025-11-20 10:45:23 +00:00

Many the fish
91cf94c196  Merge pull request #5999 from meilisearch/fix-document-fetch-sort
            Fix the Document Fetch pagination bug when Sort is applied
2025-11-20 10:15:04 +00:00

Clément Renault
753ba39199  Update the documentation of the batch size
2025-11-20 10:33:02 +01:00

Clément Renault
3944c25853  Clamp the maximum batch size to maximum 10GiB
2025-11-20 10:29:50 +01:00

ManyTheFish
925bce5fbd  Modify the test to test all the sort branches and fix the untested branch
2025-11-20 10:27:24 +01:00

ManyTheFish
62065ed30d  Fix the pagination bug where the last document of the previous page was
            duplicated as the first document of the current page. This was due to
            a bug in the custom `nth` function of the sort ranking rule skipping
            `n-1` documents instead of `n`.
2025-11-20 10:27:24 +01:00
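
For reference, Rust's `Iterator::nth(n)` skips exactly `n` items and yields the one at index `n`, so `nth(0)` behaves like `next()`. Below is a minimal sketch of the off-by-one that last commit message describes; the `docs` array is made up for illustration and is not part of the changeset.

fn main() {
    let docs = ["doc_0", "doc_1", "doc_2", "doc_3"];

    // The std contract: nth(2) skips two items and yields the one at index 2.
    let mut it = docs.iter();
    assert_eq!(it.nth(2), Some(&"doc_2"));

    // An nth that skips only `n - 1` items yields the item at index `n - 1`
    // instead: the last document of one page shows up again as the first
    // document of the next page.
    let mut buggy = docs.iter();
    assert_eq!(buggy.nth(2 - 1), Some(&"doc_1"));
}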
4 changed files with 126 additions and 5 deletions


@@ -231,8 +231,14 @@ pub fn setup_meilisearch(
         max_number_of_tasks: 1_000_000,
         max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
         batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.map_or_else(
-            // By default, we use half of the available memory to determine the size of batched tasks
-            || opt.indexer_options.max_indexing_memory.map_or(u64::MAX, |mem| mem.as_u64() / 2),
+            || {
+                opt.indexer_options
+                    .max_indexing_memory
+                    // By default, we use half of the available memory to determine the size of batched tasks
+                    .map_or(u64::MAX, |mem| mem.as_u64() / 2)
+                    // And never exceed 10 GiB when we infer the limit
+                    .min(10 * 1024 * 1024 * 1024)
+            },
             |size| size.as_u64(),
         ),
         index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
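
Pulled out of the options plumbing, the inferred default above is a two-step computation: halve the available indexing memory, then clamp. Below is a minimal sketch with a hypothetical `default_batched_tasks_size_limit` helper standing in for the closure; note that the `u64::MAX` fallback is clamped as well, so the inferred limit is at most 10 GiB even when the indexing memory is unknown.

const GIB: u64 = 1024 * 1024 * 1024;

// Hypothetical standalone form of the closure in the diff above.
fn default_batched_tasks_size_limit(max_indexing_memory: Option<u64>) -> u64 {
    max_indexing_memory
        // By default, half of the available indexing memory...
        .map_or(u64::MAX, |mem| mem / 2)
        // ...and never more than 10 GiB when the limit is inferred.
        .min(10 * GIB)
}

fn main() {
    // 8 GiB of indexing memory: half of it stays under the clamp.
    assert_eq!(default_batched_tasks_size_limit(Some(8 * GIB)), 4 * GIB);
    // 64 GiB of indexing memory: half (32 GiB) is clamped down to 10 GiB.
    assert_eq!(default_batched_tasks_size_limit(Some(64 * GIB)), 10 * GIB);
    // Unknown indexing memory: the u64::MAX fallback is clamped too.
    assert_eq!(default_batched_tasks_size_limit(None), 10 * GIB);
}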


@@ -474,7 +474,8 @@ pub struct Opt {
     pub experimental_max_number_of_batched_tasks: usize,
     /// Experimentally controls the maximum total size, in bytes, of tasks that will be processed
-    /// simultaneously. When unspecified, defaults to half of the maximum indexing memory.
+    /// simultaneously. When unspecified, defaults to half of the maximum indexing memory and is
+    /// clamped to 10 GiB.
     ///
     /// See: <https://github.com/orgs/meilisearch/discussions/801>
     #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE)]
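
The option takes a size in bytes; for reference, the 10 GiB ceiling corresponds to the byte count computed below with the same `byte_unit` crate the setup code uses. The standalone program is illustrative only.

use std::str::FromStr;

fn main() {
    // "10GiB" is a binary unit: 10 * 1024^3 bytes.
    let limit = byte_unit::Byte::from_str("10GiB").unwrap().as_u64();
    assert_eq!(limit, 10 * 1024 * 1024 * 1024);
}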


@@ -1339,3 +1339,117 @@ async fn get_document_with_vectors() {
       }
     "###);
 }
+
+#[actix_rt::test]
+async fn test_fetch_documents_pagination_with_sorting() {
+    let server = Server::new_shared();
+    let index = server.unique_index();
+    let (task, _code) = index.create(None).await;
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Set name as sortable attribute
+    let (task, code) = index.update_settings_sortable_attributes(json!(["name"])).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    let documents = json!((0..50)
+        .map(|i| json!({"id": i, "name": format!("doc_{:05}", std::cmp::min(i, 5))}))
+        .collect::<Vec<_>>());
+
+    // Add documents as described in the bug report
+    let (task, code) = index.add_documents(documents, None).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Request 1 (first page): offset 0, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 0,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 0,
+        "name": "doc_00000"
+      },
+      {
+        "id": 1,
+        "name": "doc_00001"
+      }
+    ]
+    "###);
+
+    // Request 2 (second page): offset 2, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 2,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 2,
+        "name": "doc_00002"
+      },
+      {
+        "id": 3,
+        "name": "doc_00003"
+      }
+    ]
+    "###);
+
+    // Request 3 (third page): offset 4, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 4,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 4,
+        "name": "doc_00004"
+      },
+      {
+        "id": 5,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+
+    // Request 4 (fourth page): offset 6, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 6,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 6,
+        "name": "doc_00005"
+      },
+      {
+        "id": 7,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+}
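
Across the four snapshots, ids 0 through 7 each appear exactly once; with the buggy skip count, the last document of one page would have reappeared as the first document of the next. The invariant the test exercises can be stated compactly; the sketch below is illustrative only, with a hypothetical `assert_no_boundary_duplicates` helper operating on plain id lists rather than HTTP responses.

// Hypothetical helper: paginating with a stable sort must never repeat a
// document across a page boundary.
fn assert_no_boundary_duplicates(pages: &[Vec<u64>]) {
    for window in pages.windows(2) {
        // The last id of one page must differ from the first id of the next.
        assert_ne!(window[0].last(), window[1].first(), "boundary document duplicated");
    }
}

fn main() {
    // Page ids as returned after the fix (see the snapshots above).
    let pages = vec![vec![0, 1], vec![2, 3], vec![4, 5], vec![6, 7]];
    assert_no_boundary_duplicates(&pages);
}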


@@ -87,7 +87,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
         };
         // Otherwise don't directly iterate over children, skip them if we know we will go further
-        let mut to_skip = n - 1;
+        let mut to_skip = n;
         while to_skip > 0 {
             if let Err(e) = SortedDocumentsIterator::update_current(
                 current_child,
@@ -108,7 +108,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
             continue;
         } else {
             // The current iterator is large enough, so we can forward the call to it.
-            return inner.nth(to_skip + 1);
+            return inner.nth(to_skip);
         }
     }
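
The shape of the fix is easier to see outside the ranking-rule machinery. Below is a minimal sketch, not Meilisearch's actual code, of `nth`-style skipping across nested iterators: children that are too small are skipped whole and their sizes deducted, and once a child is large enough the remaining count is forwarded unchanged. The old code forwarded `to_skip + 1`, skipping one document too many at exactly that hand-off.

// `chunks` is a made-up stand-in for the sort ranking rule's child iterators.
fn nth_across_chunks(chunks: &[&[u32]], mut n: usize) -> Option<u32> {
    for chunk in chunks {
        if n >= chunk.len() {
            // This child is too small: skip it entirely and deduct its size.
            n -= chunk.len();
        } else {
            // This child is large enough: forward the unchanged remainder,
            // mirroring the corrected `inner.nth(to_skip)` above.
            return chunk.iter().copied().nth(n);
        }
    }
    None
}

fn main() {
    let chunks: &[&[u32]] = &[&[0, 1, 2], &[3, 4], &[5, 6, 7]];
    // Index 4 falls in the second chunk: skip 3 items, then take nth(1).
    assert_eq!(nth_across_chunks(chunks, 4), Some(4));
    assert_eq!(nth_across_chunks(chunks, 7), Some(7));
}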