Modify the test to test all the sort branches and fix the untested branch

Fix the pagination bug
where the last document of the previous page was duplicated as the first document of the current page. This was due to a bug on the custom nth function of the sort ranking rule skipping `n-1` documents instead of `n`
2025-11-22 04:36:32 +00:00 · 2025-11-20 10:27:24 +01:00 · 2025-11-20 10:27:24 +01:00 · 2025-11-19 16:58:02 +00:00 · 2025-11-19 16:56:34 +00:00 · 2025-11-19 16:12:52 +00:00
10 changed files with 159 additions and 38 deletions
--- a/.github/workflows/publish-release-assets.yml
+++ b/.github/workflows/publish-release-assets.yml
@@ -65,9 +65,9 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-14, windows-2022]
        include:
-          - os: macos-13
+          - os: macos-14
            artifact_name: meilisearch
            asset_name: meilisearch-macos-amd64
          - os: windows-2022
@@ -90,7 +90,7 @@ jobs:

  publish-macos-apple-silicon:
    name: Publish binary for macOS silicon
-    runs-on: macos-13
+    runs-on: macos-14
    needs: check-version
    strategy:
      matrix:
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -47,7 +47,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-14, windows-2022]
    steps:
      - uses: actions/checkout@v5
      - name: Cache dependencies
--- a/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs
+++ b/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs
@@ -438,12 +438,15 @@ async fn multipart_stream_to_s3(
    db_name: String,
    reader: std::io::PipeReader,
 ) -> Result<(), Error> {
-    use std::{collections::VecDeque, os::fd::OwnedFd, path::PathBuf};
+    use std::collections::VecDeque;
+    use std::io;
+    use std::os::fd::OwnedFd;
+    use std::path::PathBuf;

    use bytes::{Bytes, BytesMut};
    use reqwest::{Client, Response};
-    use rusty_s3::S3Action as _;
-    use rusty_s3::{actions::CreateMultipartUpload, Bucket, BucketError, Credentials, UrlStyle};
+    use rusty_s3::actions::CreateMultipartUpload;
+    use rusty_s3::{Bucket, BucketError, Credentials, S3Action as _, UrlStyle};
    use tokio::task::JoinHandle;

    let reader = OwnedFd::from(reader);
@@ -517,7 +520,6 @@ async fn multipart_stream_to_s3(
        while buffer.len() < (s3_multipart_part_size as usize / 2) {
            // Wait for the pipe to be readable

-            use std::io;
            reader.readable().await?;

            match reader.try_read_buf(&mut buffer) {
@@ -581,15 +583,17 @@ async fn multipart_stream_to_s3(
        async move {
            match client.post(url).body(body).send().await {
                Ok(resp) if resp.status().is_client_error() => {
-                    resp.error_for_status().map_err(backoff::Error::Permanent)
+                    Err(backoff::Error::Permanent(Error::S3Error {
+                        status: resp.status(),
+                        body: resp.text().await.unwrap_or_default(),
+                    }))
                }
                Ok(resp) => Ok(resp),
-                Err(e) => Err(backoff::Error::transient(e)),
+                Err(e) => Err(backoff::Error::transient(Error::S3HttpError(e))),
            }
        }
    })
-    .await
-    .map_err(Error::S3HttpError)?;
+    .await?;

    let status = resp.status();
    let body = resp.text().await.map_err(|e| Error::S3Error { status, body: e.to_string() })?;
--- a/crates/meilisearch/src/analytics/segment_analytics.rs
+++ b/crates/meilisearch/src/analytics/segment_analytics.rs
@@ -195,7 +195,7 @@ struct Infos {
    experimental_enable_logs_route: bool,
    experimental_reduce_indexing_memory_usage: bool,
    experimental_max_number_of_batched_tasks: usize,
-    experimental_limit_batched_tasks_total_size: u64,
+    experimental_limit_batched_tasks_total_size: Option<u64>,
    experimental_network: bool,
    experimental_multimodal: bool,
    experimental_chat_completions: bool,
@@ -359,7 +359,7 @@ impl Infos {
            http_payload_size_limit,
            experimental_max_number_of_batched_tasks,
            experimental_limit_batched_tasks_total_size:
-                experimental_limit_batched_tasks_total_size.into(),
+                experimental_limit_batched_tasks_total_size.map(|size| size.as_u64()),
            task_queue_webhook: task_webhook_url.is_some(),
            task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
            log_level: log_level.to_string(),
--- a/crates/meilisearch/src/lib.rs
+++ b/crates/meilisearch/src/lib.rs
@@ -230,7 +230,11 @@ pub fn setup_meilisearch(
        cleanup_enabled: !opt.experimental_replication_parameters,
        max_number_of_tasks: 1_000_000,
        max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
-        batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(),
+        batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.map_or_else(
+            // By default, we use half of the available memory to determine the size of batched tasks
+            || opt.indexer_options.max_indexing_memory.map_or(u64::MAX, |mem| mem.as_u64() / 2),
+            |size| size.as_u64(),
+        ),
        index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
        index_count: DEFAULT_INDEX_COUNT,
        instance_features: opt.to_instance_features(),
--- a/crates/meilisearch/src/option.rs
+++ b/crates/meilisearch/src/option.rs
@@ -473,11 +473,13 @@ pub struct Opt {
    #[serde(default = "default_limit_batched_tasks")]
    pub experimental_max_number_of_batched_tasks: usize,

-    /// Experimentally reduces the maximum total size, in bytes, of tasks that will be processed at once,
-    /// see: <https://github.com/orgs/meilisearch/discussions/801>
-    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())]
-    #[serde(default = "default_limit_batched_tasks_total_size")]
-    pub experimental_limit_batched_tasks_total_size: Byte,
+    /// Experimentally controls the maximum total size, in bytes, of tasks that will be processed
+    /// simultaneously. When unspecified, defaults to half of the maximum indexing memory.
+    ///
+    /// See: <https://github.com/orgs/meilisearch/discussions/801>
+    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE)]
+    #[serde(default)]
+    pub experimental_limit_batched_tasks_total_size: Option<Byte>,

    /// Enables experimental caching of search query embeddings. The value represents the maximal number of entries in the cache of each
    /// distinct embedder.
@@ -701,10 +703,12 @@ impl Opt {
            MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
            experimental_max_number_of_batched_tasks.to_string(),
        );
-        export_to_env_if_not_present(
-            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
-            experimental_limit_batched_tasks_total_size.to_string(),
-        );
+        if let Some(limit) = experimental_limit_batched_tasks_total_size {
+            export_to_env_if_not_present(
+                MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
+                limit.to_string(),
+            );
+        }
        export_to_env_if_not_present(
            MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES,
            experimental_embedding_cache_entries.to_string(),
@@ -1273,10 +1277,6 @@ fn default_limit_batched_tasks() -> usize {
    usize::MAX
 }

-fn default_limit_batched_tasks_total_size() -> Byte {
-    Byte::from_u64(u64::MAX)
-}
-
 fn default_embedding_cache_entries() -> usize {
    0
 }
--- a/crates/meilisearch/src/personalization/mod.rs
+++ b/crates/meilisearch/src/personalization/mod.rs
@@ -1,14 +1,14 @@
-use crate::search::{Personalize, SearchResult};
-use meilisearch_types::{
-    error::{Code, ErrorCode, ResponseError},
-    milli::TimeBudget,
-};
+use std::time::Duration;
+
+use meilisearch_types::error::{Code, ErrorCode, ResponseError};
+use meilisearch_types::milli::TimeBudget;
 use rand::Rng;
 use reqwest::Client;
 use serde::{Deserialize, Serialize};
-use std::time::Duration;
 use tracing::{debug, info, warn};

+use crate::search::{Personalize, SearchResult};
+
 const COHERE_API_URL: &str = "https://api.cohere.ai/v1/rerank";
 const MAX_RETRIES: u32 = 10;

--- a/crates/meilisearch/src/search/federated/types.rs
+++ b/crates/meilisearch/src/search/federated/types.rs
@@ -18,10 +18,9 @@ use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
 use uuid::Uuid;

-use crate::search::SearchMetadata;
-
 use super::super::{ComputedFacets, FacetStats, HitsInfo, SearchHit, SearchQueryWithIndex};
 use crate::milli::vector::Embedding;
+use crate::search::SearchMetadata;

 pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0;

--- a/crates/meilisearch/tests/documents/get_documents.rs
+++ b/crates/meilisearch/tests/documents/get_documents.rs
@@ -1339,3 +1339,117 @@ async fn get_document_with_vectors() {
    }
    "###);
 }
+
+#[actix_rt::test]
+async fn test_fetch_documents_pagination_with_sorting() {
+    let server = Server::new_shared();
+    let index = server.unique_index();
+    let (task, _code) = index.create(None).await;
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Set name as sortable attribute
+    let (task, code) = index.update_settings_sortable_attributes(json!(["name"])).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    let documents = json!((0..50)
+        .map(|i| json!({"id": i, "name": format!("doc_{:05}", std::cmp::min(i, 5))}))
+        .collect::<Vec<_>>());
+
+    // Add documents as described in the bug report
+    let (task, code) = index.add_documents(documents, None).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Request 1 (first page): offset 0, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 0,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 0,
+        "name": "doc_00000"
+      },
+      {
+        "id": 1,
+        "name": "doc_00001"
+      }
+    ]
+    "###);
+
+    // Request 2 (second page): offset 2, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 2,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 2,
+        "name": "doc_00002"
+      },
+      {
+        "id": 3,
+        "name": "doc_00003"
+      }
+    ]
+    "###);
+
+    // Request 3 (third page): offset 4, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 4,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 4,
+        "name": "doc_00004"
+      },
+      {
+        "id": 5,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+
+    // Request 4 (fourth page): offset 6, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 6,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 6,
+        "name": "doc_00005"
+      },
+      {
+        "id": 7,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+}
--- a/crates/milli/src/documents/sort.rs
+++ b/crates/milli/src/documents/sort.rs
@@ -87,7 +87,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
        };

        // Otherwise don't directly iterate over children, skip them if we know we will go further
-        let mut to_skip = n - 1;
+        let mut to_skip = n;
        while to_skip > 0 {
            if let Err(e) = SortedDocumentsIterator::update_current(
                current_child,
@@ -108,7 +108,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
                continue;
            } else {
                // The current iterator is large enough, so we can forward the call to it.
-                return inner.nth(to_skip + 1);
+                return inner.nth(to_skip);
            }
        }
Author	SHA1	Message	Date
ManyTheFish	925bce5fbd	Modify the test to test all the sort branches and fix the untested branch	2025-11-20 10:27:24 +01:00
ManyTheFish	62065ed30d	Fix the pagination bug where the last document of the previous page was duplicated as the first document of the current page. This was due to a bug on the custom nth function of the sort ranking rule skipping `n-1` documents instead of `n`	2025-11-20 10:27:24 +01:00
Clément Renault	97e6ae1957	Merge pull request #5994 from meilisearch/improve-s3-error-messages Improve S3 upload by showing errors in the task queue	2025-11-19 16:58:02 +00:00
Clément Renault	5ed9be0789	Merge pull request #5990 from meilisearch/default-max-batch-size Make the limit batched tasks total size defaults to half of the max indexing memory	2025-11-19 16:56:34 +00:00
Clément Renault	7597b1049f	Merge pull request #6001 from meilisearch/update-windows-macos-ci Update the macOS platform version in the CI	2025-11-19 16:12:52 +00:00
Clément Renault	d99150f21b	Improve error message extraction Co-authored-by: Many the fish <many@meilisearch.com>	2025-11-19 17:09:15 +01:00
Kerollmops	c9726674a0	Make the limit batched tasks total size default to half of max indexing memory	2025-11-19 17:04:45 +01:00
Clément Renault	205f40b3b8	Update the macOS platform version to use version 14	2025-11-19 16:10:41 +01:00
Kerollmops	361580f451	Display the error message on failure	2025-11-17 09:21:18 +01:00