Merge pull request #6005 from meilisearch/clamp-max-batch-size

Clamp max batch size to 10 GiB
Merge pull request #5999 from meilisearch/fix-document-fetch-sort
2025-11-22 12:46:53 +00:00 · 2025-11-20 10:45:23 +00:00 · 2025-11-20 10:15:04 +00:00 · 2025-11-20 10:33:02 +01:00 · 2025-11-20 10:29:50 +01:00 · 2025-11-20 10:27:24 +01:00
11 changed files with 188 additions and 144 deletions
--- a/.github/workflows/publish-docker-images.yml
+++ b/.github/workflows/publish-docker-images.yml
@@ -13,93 +13,11 @@ on:
    - cron: '0 23 * * *' # Every day at 11:00pm
  workflow_dispatch:

-env:
-  REGISTRY_IMAGE: getmeili/meilisearch
-
 jobs:
-  build:
-    runs-on: ${{ matrix.runner }}
-
-    strategy:
-      matrix:
-        platform: [amd64, arm64]
-        include:
-          - platform: amd64
-            runner: ubuntu-24.04
-          - platform: arm64
-            runner: ubuntu-24.04-arm
-
-    permissions: {}
-    steps:
-      - uses: actions/checkout@v5
-
-      - name: Prepare
-        run: |
-          platform=linux/${{ matrix.platform }}
-          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-        with:
-          platforms: linux/${{ matrix.platform }}
-          install: true
-
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: ${{ env.REGISTRY_IMAGE }}
-          # Prevent `latest` to be updated for each new tag pushed.
-          # We need latest and `vX.Y` tags to only be pushed for the stable Meilisearch releases.
-          flavor: latest=false
-          tags: |
-            type=ref,event=tag
-            type=raw,value=nightly,enable=${{ github.event_name != 'push' }}
-            type=semver,pattern=v{{major}}.{{minor}},enable=${{ steps.check-tag-format.outputs.stable == 'true' }}
-            type=semver,pattern=v{{major}},enable=${{ steps.check-tag-format.outputs.stable == 'true' }}
-            type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}
-
-      - name: Build and push by digest
-        uses: docker/build-push-action@v6
-        id: build-and-push
-        with:
-          platforms: linux/${{ matrix.platform }}
-          labels: ${{ steps.meta.outputs.labels }}
-          tags: ${{ env.REGISTRY_IMAGE }}
-          outputs: type=image,push-by-digest=true,name-canonical=true,push=true
-          build-args: |
-            COMMIT_SHA=${{ github.sha }}
-            COMMIT_DATE=${{ steps.build-metadata.outputs.date }}
-            GIT_TAG=${{ github.ref_name }}
-
-      - name: Export digest
-        run: |
-          mkdir -p ${{ runner.temp }}/digests
-          digest="${{ steps.build-and-push.outputs.digest }}"
-          touch "${{ runner.temp }}/digests/${digest#sha256:}"
-
-      - name: Upload digest
-        uses: actions/upload-artifact@v4
-        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
-          path: ${{ runner.temp }}/digests/*
-          if-no-files-found: error
-          retention-days: 1
-
-  merge:
-    runs-on: ubuntu-latest
-    needs:
-      - build
-
+  docker:
+    runs-on: docker
    permissions:
      id-token: write # This is needed to use Cosign in keyless mode
-
    steps:
      - uses: actions/checkout@v5

@@ -140,30 +58,26 @@ jobs:

          echo "date=$commit_date" >> $GITHUB_OUTPUT

+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
      - name: Install cosign
        uses: sigstore/cosign-installer@d7543c93d881b35a8faa02e8e3605f69b7a1ce62 # tag=v3.10.0

-      - name: Download digests
-        uses: actions/download-artifact@v4
-        with:
-          path: ${{ runner.temp }}/digests
-          pattern: digests-*
-          merge-multiple: true
-
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
-          images: ${{ env.REGISTRY_IMAGE }}
+          images: getmeili/meilisearch
          # Prevent `latest` to be updated for each new tag pushed.
          # We need latest and `vX.Y` tags to only be pushed for the stable Meilisearch releases.
          flavor: latest=false
@@ -174,24 +88,26 @@ jobs:
            type=semver,pattern=v{{major}},enable=${{ steps.check-tag-format.outputs.stable == 'true' }}
            type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}

-      - name: Create manifest list and push
-        working-directory: ${{ runner.temp }}/digests
-        run: |
-          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
-            $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
-
-      - name: Inspect image
-        run: |
-          digest=$(docker buildx imagetools inspect --raw ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }} | jq -r '.manifests[0].digest')
-          echo "DIGEST=${digest}" >> $GITHUB_ENV
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        id: build-and-push
+        with:
+          push: true
+          platforms: linux/amd64,linux/arm64
+          tags: ${{ steps.meta.outputs.tags }}
+          build-args: |
+            COMMIT_SHA=${{ github.sha }}
+            COMMIT_DATE=${{ steps.build-metadata.outputs.date }}
+            GIT_TAG=${{ github.ref_name }}

      - name: Sign the images with GitHub OIDC Token
        env:
+          DIGEST: ${{ steps.build-and-push.outputs.digest }}
          TAGS: ${{ steps.meta.outputs.tags }}
        run: |
          images=""
          for tag in ${TAGS}; do
-            images+="${tag}@${{ env.DIGEST }} "
+            images+="${tag}@${DIGEST} "
          done
          cosign sign --yes ${images}

--- a/.github/workflows/publish-release-assets.yml
+++ b/.github/workflows/publish-release-assets.yml
@@ -65,9 +65,9 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-14, windows-2022]
        include:
-          - os: macos-13
+          - os: macos-14
            artifact_name: meilisearch
            asset_name: meilisearch-macos-amd64
          - os: windows-2022
@@ -90,7 +90,7 @@ jobs:

  publish-macos-apple-silicon:
    name: Publish binary for macOS silicon
-    runs-on: macos-13
+    runs-on: macos-14
    needs: check-version
    strategy:
      matrix:
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -47,7 +47,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        os: [macos-13, windows-2022]
+        os: [macos-14, windows-2022]
    steps:
      - uses: actions/checkout@v5
      - name: Cache dependencies
--- a/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs
+++ b/crates/index-scheduler/src/scheduler/process_snapshot_creation.rs
@@ -438,12 +438,15 @@ async fn multipart_stream_to_s3(
    db_name: String,
    reader: std::io::PipeReader,
 ) -> Result<(), Error> {
-    use std::{collections::VecDeque, os::fd::OwnedFd, path::PathBuf};
+    use std::collections::VecDeque;
+    use std::io;
+    use std::os::fd::OwnedFd;
+    use std::path::PathBuf;

    use bytes::{Bytes, BytesMut};
    use reqwest::{Client, Response};
-    use rusty_s3::S3Action as _;
-    use rusty_s3::{actions::CreateMultipartUpload, Bucket, BucketError, Credentials, UrlStyle};
+    use rusty_s3::actions::CreateMultipartUpload;
+    use rusty_s3::{Bucket, BucketError, Credentials, S3Action as _, UrlStyle};
    use tokio::task::JoinHandle;

    let reader = OwnedFd::from(reader);
@@ -517,7 +520,6 @@ async fn multipart_stream_to_s3(
        while buffer.len() < (s3_multipart_part_size as usize / 2) {
            // Wait for the pipe to be readable

-            use std::io;
            reader.readable().await?;

            match reader.try_read_buf(&mut buffer) {
@@ -581,15 +583,17 @@ async fn multipart_stream_to_s3(
        async move {
            match client.post(url).body(body).send().await {
                Ok(resp) if resp.status().is_client_error() => {
-                    resp.error_for_status().map_err(backoff::Error::Permanent)
+                    Err(backoff::Error::Permanent(Error::S3Error {
+                        status: resp.status(),
+                        body: resp.text().await.unwrap_or_default(),
+                    }))
                }
                Ok(resp) => Ok(resp),
-                Err(e) => Err(backoff::Error::transient(e)),
+                Err(e) => Err(backoff::Error::transient(Error::S3HttpError(e))),
            }
        }
    })
-    .await
-    .map_err(Error::S3HttpError)?;
+    .await?;

    let status = resp.status();
    let body = resp.text().await.map_err(|e| Error::S3Error { status, body: e.to_string() })?;
--- a/crates/meilisearch/src/analytics/segment_analytics.rs
+++ b/crates/meilisearch/src/analytics/segment_analytics.rs
@@ -195,7 +195,7 @@ struct Infos {
    experimental_enable_logs_route: bool,
    experimental_reduce_indexing_memory_usage: bool,
    experimental_max_number_of_batched_tasks: usize,
-    experimental_limit_batched_tasks_total_size: u64,
+    experimental_limit_batched_tasks_total_size: Option<u64>,
    experimental_network: bool,
    experimental_multimodal: bool,
    experimental_chat_completions: bool,
@@ -359,7 +359,7 @@ impl Infos {
            http_payload_size_limit,
            experimental_max_number_of_batched_tasks,
            experimental_limit_batched_tasks_total_size:
-                experimental_limit_batched_tasks_total_size.into(),
+                experimental_limit_batched_tasks_total_size.map(|size| size.as_u64()),
            task_queue_webhook: task_webhook_url.is_some(),
            task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
            log_level: log_level.to_string(),
--- a/crates/meilisearch/src/lib.rs
+++ b/crates/meilisearch/src/lib.rs
@@ -230,7 +230,17 @@ pub fn setup_meilisearch(
        cleanup_enabled: !opt.experimental_replication_parameters,
        max_number_of_tasks: 1_000_000,
        max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
-        batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(),
+        batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.map_or_else(
+            || {
+                opt.indexer_options
+                    .max_indexing_memory
+                    // By default, we use half of the max indexing memory to determine the size of batched tasks
+                    .map_or(u64::MAX, |mem| mem.as_u64() / 2)
+                    // And never exceed 10 GiB when we infer the limit
+                    .min(10 * 1024 * 1024 * 1024)
+            },
+            |size| size.as_u64(),
+        ),
        index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
        index_count: DEFAULT_INDEX_COUNT,
        instance_features: opt.to_instance_features(),
--- a/crates/meilisearch/src/option.rs
+++ b/crates/meilisearch/src/option.rs
@@ -473,11 +473,14 @@ pub struct Opt {
    #[serde(default = "default_limit_batched_tasks")]
    pub experimental_max_number_of_batched_tasks: usize,

-    /// Experimentally reduces the maximum total size, in bytes, of tasks that will be processed at once,
-    /// see: <https://github.com/orgs/meilisearch/discussions/801>
-    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())]
-    #[serde(default = "default_limit_batched_tasks_total_size")]
-    pub experimental_limit_batched_tasks_total_size: Byte,
+    /// Experimentally controls the maximum total size, in bytes, of tasks that will be processed
+    /// simultaneously. When unspecified, it defaults to half of the maximum indexing memory,
+    /// clamped to at most 10 GiB.
+    ///
+    /// See: <https://github.com/orgs/meilisearch/discussions/801>
+    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE)]
+    #[serde(default)]
+    pub experimental_limit_batched_tasks_total_size: Option<Byte>,

    /// Enables experimental caching of search query embeddings. The value represents the maximal number of entries in the cache of each
    /// distinct embedder.
@@ -701,10 +704,12 @@ impl Opt {
            MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
            experimental_max_number_of_batched_tasks.to_string(),
        );
-        export_to_env_if_not_present(
-            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
-            experimental_limit_batched_tasks_total_size.to_string(),
-        );
+        if let Some(limit) = experimental_limit_batched_tasks_total_size {
+            export_to_env_if_not_present(
+                MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE,
+                limit.to_string(),
+            );
+        }
        export_to_env_if_not_present(
            MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES,
            experimental_embedding_cache_entries.to_string(),
@@ -1273,10 +1278,6 @@ fn default_limit_batched_tasks() -> usize {
    usize::MAX
 }

-fn default_limit_batched_tasks_total_size() -> Byte {
-    Byte::from_u64(u64::MAX)
-}
-
 fn default_embedding_cache_entries() -> usize {
    0
 }
--- a/crates/meilisearch/src/personalization/mod.rs
+++ b/crates/meilisearch/src/personalization/mod.rs
@@ -1,14 +1,14 @@
-use crate::search::{Personalize, SearchResult};
-use meilisearch_types::{
-    error::{Code, ErrorCode, ResponseError},
-    milli::TimeBudget,
-};
+use std::time::Duration;
+
+use meilisearch_types::error::{Code, ErrorCode, ResponseError};
+use meilisearch_types::milli::TimeBudget;
 use rand::Rng;
 use reqwest::Client;
 use serde::{Deserialize, Serialize};
-use std::time::Duration;
 use tracing::{debug, info, warn};

+use crate::search::{Personalize, SearchResult};
+
 const COHERE_API_URL: &str = "https://api.cohere.ai/v1/rerank";
 const MAX_RETRIES: u32 = 10;

--- a/crates/meilisearch/src/search/federated/types.rs
+++ b/crates/meilisearch/src/search/federated/types.rs
@@ -18,10 +18,9 @@ use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
 use uuid::Uuid;

-use crate::search::SearchMetadata;
-
 use super::super::{ComputedFacets, FacetStats, HitsInfo, SearchHit, SearchQueryWithIndex};
 use crate::milli::vector::Embedding;
+use crate::search::SearchMetadata;

 pub const DEFAULT_FEDERATED_WEIGHT: f64 = 1.0;

--- a/crates/meilisearch/tests/documents/get_documents.rs
+++ b/crates/meilisearch/tests/documents/get_documents.rs
@@ -1339,3 +1339,117 @@ async fn get_document_with_vectors() {
    }
    "###);
 }
+
+#[actix_rt::test]
+async fn test_fetch_documents_pagination_with_sorting() {
+    let server = Server::new_shared();
+    let index = server.unique_index();
+    let (task, _code) = index.create(None).await;
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Set name as sortable attribute
+    let (task, code) = index.update_settings_sortable_attributes(json!(["name"])).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    let documents = json!((0..50)
+        .map(|i| json!({"id": i, "name": format!("doc_{:05}", std::cmp::min(i, 5))}))
+        .collect::<Vec<_>>());
+
+    // Add documents as described in the bug report: documents 5 to 49 share the name `doc_00005`
+    let (task, code) = index.add_documents(documents, None).await;
+    assert_eq!(code, 202);
+    server.wait_task(task.uid()).await.succeeded();
+
+    // Request 1 (first page): offset 0, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 0,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 0,
+        "name": "doc_00000"
+      },
+      {
+        "id": 1,
+        "name": "doc_00001"
+      }
+    ]
+    "###);
+
+    // Request 2 (second page): offset 2, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 2,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 2,
+        "name": "doc_00002"
+      },
+      {
+        "id": 3,
+        "name": "doc_00003"
+      }
+    ]
+    "###);
+
+    // Request 3 (third page): offset 4, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 4,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 4,
+        "name": "doc_00004"
+      },
+      {
+        "id": 5,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+
+    // Request 4 (fourth page): offset 6, limit 2
+    let (response, code) = index
+        .fetch_documents(json!({
+            "offset": 6,
+            "limit": 2,
+            "sort": ["name:asc"]
+        }))
+        .await;
+    assert_eq!(code, 200);
+    let results = response["results"].as_array().unwrap();
+    snapshot!(json_string!(results), @r###"
+    [
+      {
+        "id": 6,
+        "name": "doc_00005"
+      },
+      {
+        "id": 7,
+        "name": "doc_00005"
+      }
+    ]
+    "###);
+}
--- a/crates/milli/src/documents/sort.rs
+++ b/crates/milli/src/documents/sort.rs
@@ -87,7 +87,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
        };

        // Otherwise don't directly iterate over children, skip them if we know we will go further
-        let mut to_skip = n - 1;
+        let mut to_skip = n;
        while to_skip > 0 {
            if let Err(e) = SortedDocumentsIterator::update_current(
                current_child,
@@ -108,7 +108,7 @@ impl Iterator for SortedDocumentsIterator<'_> {
                continue;
            } else {
                // The current iterator is large enough, so we can forward the call to it.
-                return inner.nth(to_skip + 1);
+                return inner.nth(to_skip);
            }
        }
Author	SHA1	Message	Date
Clément Renault	cf62af13e8	Merge pull request #6005 from meilisearch/clamp-max-batch-size Clamp max batch size to 10 GiB	2025-11-20 10:45:23 +00:00
Many the fish	91cf94c196	Merge pull request #5999 from meilisearch/fix-document-fetch-sort Fix the Document Fetch pagination bug when Sort is applied	2025-11-20 10:15:04 +00:00
Clément Renault	753ba39199	Update the documentation of the batch size	2025-11-20 10:33:02 +01:00
Clément Renault	3944c25853	Clamp the maximum batch size to maximum 10GiB	2025-11-20 10:29:50 +01:00
ManyTheFish	925bce5fbd	Modify the test to test all the sort branches and fix the untested branch	2025-11-20 10:27:24 +01:00
ManyTheFish	62065ed30d	Fix the pagination bug where the last document of the previous page was duplicated as the first document of the current page. This was due to a bug on the custom nth function of the sort ranking rule skipping `n-1` documents instead of `n`	2025-11-20 10:27:24 +01:00
Clément Renault	97e6ae1957	Merge pull request #5994 from meilisearch/improve-s3-error-messages Improve S3 upload by showing errors in the task queue	2025-11-19 16:58:02 +00:00
Clément Renault	5ed9be0789	Merge pull request #5990 from meilisearch/default-max-batch-size Make the limit batched tasks total size defaults to half of the max indexing memory	2025-11-19 16:56:34 +00:00
Clément Renault	7597b1049f	Merge pull request #6001 from meilisearch/update-windows-macos-ci Update the macOS platform version in the CI	2025-11-19 16:12:52 +00:00
Clément Renault	d99150f21b	Improve error message extraction Co-authored-by: Many the fish <many@meilisearch.com>	2025-11-19 17:09:15 +01:00
Kerollmops	c9726674a0	Make the limit batched tasks total size default to half of max indexing memory	2025-11-19 17:04:45 +01:00
Clément Renault	205f40b3b8	Update the macOS platform version to use version 14	2025-11-19 16:10:41 +01:00
Kerollmops	361580f451	Display the error message on failure	2025-11-17 09:21:18 +01:00