enable debug symbols in release

Some clean-up
Add reports route
2025-12-24 05:16:59 +00:00 · 2023-11-16 10:21:55 +01:00 · 2023-11-16 09:46:10 +01:00 · 2023-11-15 23:06:47 +01:00 · 2023-11-15 23:06:19 +01:00 · 2023-11-15 23:01:35 +01:00
40 changed files with 687 additions and 1494 deletions
--- a/.github/workflows/benchmarks-pr.yml
+++ b/.github/workflows/benchmarks-pr.yml
@@ -90,7 +90,8 @@ jobs:
          set -x
          export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
          export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
-          echo 'Here are your benchmarks diff 👊' >> body.txt
+          export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
+          echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
          echo '```' >> body.txt
          ./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
          echo '```' >> body.txt
--- a/.github/workflows/publish-apt-brew-pkg.yml
+++ b/.github/workflows/publish-apt-brew-pkg.yml
@@ -50,7 +50,7 @@ jobs:
    needs: check-version
    steps:
      - name: Create PR to Homebrew
-        uses: mislav/bump-homebrew-formula-action@v2
+        uses: mislav/bump-homebrew-formula-action@v3
        with:
          formula-name: meilisearch
          formula-path: Formula/m/meilisearch.rb
--- a/.github/workflows/publish-docker-images.yml
+++ b/.github/workflows/publish-docker-images.yml
@@ -63,7 +63,7 @@ jobs:
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
--- a/.github/workflows/sdks-tests.yml
+++ b/.github/workflows/sdks-tests.yml
@@ -160,7 +160,7 @@ jobs:
        with:
          repository: meilisearch/meilisearch-js
      - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
        with:
          cache: 'yarn'
      - name: Install dependencies
@@ -318,7 +318,7 @@ jobs:
        with:
          repository: meilisearch/meilisearch-js-plugins
      - name: Setup node
-        uses: actions/setup-node@v3
+        uses: actions/setup-node@v4
        with:
          cache: yarn
      - name: Install dependencies
--- a/.github/workflows/test-suite.yml
+++ b/.github/workflows/test-suite.yml
@@ -43,7 +43,7 @@ jobs:
          toolchain: nightly
          override: true
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo check without any default features
        uses: actions-rs/cargo@v1
        with:
@@ -65,7 +65,7 @@ jobs:
    steps:
      - uses: actions/checkout@v3
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo check without any default features
        uses: actions-rs/cargo@v1
        with:
@@ -149,7 +149,7 @@ jobs:
          toolchain: stable
          override: true
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run tests in debug
        uses: actions-rs/cargo@v1
        with:
@@ -168,7 +168,7 @@ jobs:
          override: true
          components: clippy
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo clippy
        uses: actions-rs/cargo@v1
        with:
@@ -187,7 +187,7 @@ jobs:
          override: true
          components: rustfmt
      - name: Cache dependencies
-        uses: Swatinem/rust-cache@v2.6.2
+        uses: Swatinem/rust-cache@v2.7.1
      - name: Run cargo fmt
        # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
        # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,6 @@
 resolver = "2"
 members = [
    "meilisearch",
-    "meilitool",
    "meilisearch-types",
    "meilisearch-auth",
    "meili-snap",
@@ -19,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.5.0"
+version = "1.4.1"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
@@ -29,6 +28,7 @@ license = "MIT"

 [profile.release]
 codegen-units = 1
+debug = true

 [profile.dev.package.flate2]
 opt-level = 3
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,7 +3,7 @@ FROM    rust:alpine3.16 AS compiler

 RUN     apk add -q --update-cache --no-cache build-base openssl-dev

-WORKDIR /
+WORKDIR /meilisearch

 ARG     COMMIT_SHA
 ARG     COMMIT_DATE
@@ -17,7 +17,7 @@ RUN     set -eux; \
        if [ "$apkArch" = "aarch64" ]; then \
            export JEMALLOC_SYS_WITH_LG_PAGE=16; \
        fi && \
-        cargo build --release -p meilisearch -p meilitool
+        cargo build --release

 # Run
 FROM    alpine:3.16
@@ -28,10 +28,9 @@ ENV     MEILI_SERVER_PROVIDER docker
 RUN     apk update --quiet \
        && apk add -q --no-cache libgcc tini curl

-# add meilisearch and meilitool to the `/bin` so you can run it from anywhere
-# and it's easy to find.
+# add meilisearch to the `/bin` so you can run it from anywhere and it's easy
+# to find.
 COPY    --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
-COPY    --from=compiler /meilisearch/target/release/meilitool /bin/meilitool
 # To stay compatible with the older version of the container (pre v0.27.0) we're
 # going to symlink the meilisearch binary in the path to `/meilisearch`
 RUN     ln -s /bin/meilisearch /meilisearch
--- a/README.md
+++ b/README.md
@@ -25,12 +25,6 @@

 <p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>

---
-
-### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
-
---
-
 Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.

 <p align="center" name="demo">
--- a/dump/src/reader/mod.rs
+++ b/dump/src/reader/mod.rs
@@ -526,12 +526,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
        }
        "###);

@@ -541,12 +541,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
        }
        "###);

@@ -571,12 +571,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
        }
        "###);

@@ -617,12 +617,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
        }
        "###);

@@ -632,12 +632,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
        }
        "###);

@@ -647,12 +647,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
        }
        "###);

--- a/dump/src/reader/v2/mod.rs
+++ b/dump/src/reader/v2/mod.rs
@@ -46,6 +46,7 @@ pub type Checked = settings::Checked;
 pub type Unchecked = settings::Unchecked;

 pub type Task = updates::UpdateEntry;
+pub type Kind = updates::UpdateMeta;

 // everything related to the errors
 pub type ResponseError = errors::ResponseError;
@@ -107,8 +108,11 @@ impl V2Reader {
    pub fn indexes(&self) -> Result<impl Iterator<Item = Result<V2IndexReader>> + '_> {
        Ok(self.index_uuid.iter().map(|index| -> Result<_> {
            V2IndexReader::new(
-                index.uid.clone(),
                &self.dump.path().join("indexes").join(format!("index-{}", index.uuid)),
+                index,
+                BufReader::new(
+                    File::open(self.dump.path().join("updates").join("data.jsonl")).unwrap(),
+                ),
            )
        }))
    }
@@ -143,16 +147,41 @@ pub struct V2IndexReader {
 }

 impl V2IndexReader {
-    pub fn new(name: String, path: &Path) -> Result<Self> {
+    pub fn new(path: &Path, index_uuid: &IndexUuid, tasks: BufReader<File>) -> Result<Self> {
        let meta = File::open(path.join("meta.json"))?;
        let meta: DumpMeta = serde_json::from_reader(meta)?;

+        let mut created_at = None;
+        let mut updated_at = None;
+
+        for line in tasks.lines() {
+            let task: Task = serde_json::from_str(&line?)?;
+            if !(task.uuid == index_uuid.uuid && task.is_finished()) {
+                continue;
+            }
+
+            let new_created_at = match task.update.meta() {
+                Kind::DocumentsAddition { .. } | Kind::Settings(_) => task.update.finished_at(),
+                _ => None,
+            };
+            let new_updated_at = task.update.finished_at();
+
+            if created_at.is_none() || created_at > new_created_at {
+                created_at = new_created_at;
+            }
+
+            if updated_at.is_none() || updated_at < new_updated_at {
+                updated_at = new_updated_at;
+            }
+        }
+
+        let current_time = OffsetDateTime::now_utc();
+
        let metadata = IndexMetadata {
-            uid: name,
+            uid: index_uuid.uid.clone(),
            primary_key: meta.primary_key,
-            // FIXME: Iterate over the whole task queue to find the creation and last update date.
-            created_at: OffsetDateTime::now_utc(),
-            updated_at: OffsetDateTime::now_utc(),
+            created_at: created_at.unwrap_or(current_time),
+            updated_at: updated_at.unwrap_or(current_time),
        };

        let ret = V2IndexReader {
@@ -248,12 +277,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.688964637Z",
+          "updatedAt": "2022-10-09T20:27:23.951017769Z"
        }
        "###);

@@ -263,12 +292,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:22.197788495Z",
+          "updatedAt": "2022-10-09T20:28:01.93111053Z"
        }
        "###);

@@ -293,12 +322,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"d751713988987e9331980363e24189ce");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2022-10-09T20:27:24.242683494Z",
+          "updatedAt": "2022-10-09T20:27:24.312809641Z"
        }
        "###);

@@ -340,12 +369,12 @@ pub(crate) mod test {
        assert!(indexes.is_empty());

        // products
-        insta::assert_json_snapshot!(products.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(products.metadata(), @r###"
        {
          "uid": "products",
          "primaryKey": "sku",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.595257Z",
+          "updatedAt": "2023-01-30T16:25:58.70348Z"
        }
        "###);

@@ -355,12 +384,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"548284a84de510f71e88e6cdea495cf5");

        // movies
-        insta::assert_json_snapshot!(movies.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(movies.metadata(), @r###"
        {
          "uid": "movies",
          "primaryKey": "id",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:56.192178Z",
+          "updatedAt": "2023-01-30T16:25:56.455714Z"
        }
        "###);

@@ -370,12 +399,12 @@ pub(crate) mod test {
        meili_snap::snapshot_hash!(format!("{:#?}", documents), @"0227598af846e574139ee0b80e03a720");

        // spells
-        insta::assert_json_snapshot!(spells.metadata(), { ".createdAt" => "[now]", ".updatedAt" => "[now]" }, @r###"
+        insta::assert_json_snapshot!(spells.metadata(), @r###"
        {
          "uid": "dnd_spells",
          "primaryKey": "index",
-          "createdAt": "[now]",
-          "updatedAt": "[now]"
+          "createdAt": "2023-01-30T16:25:58.876405Z",
+          "updatedAt": "2023-01-30T16:25:59.079906Z"
        }
        "###);

--- a/dump/src/reader/v2/updates.rs
+++ b/dump/src/reader/v2/updates.rs
@@ -227,4 +227,14 @@ impl UpdateStatus {
            _ => None,
        }
    }
+
+    /// Returns the time at which the update finished: its `processed_at` when it was
+    /// processed, its `failed_at` when it failed, and `None` for every other status.
+    pub fn finished_at(&self) -> Option<OffsetDateTime> {
+        match self {
+            UpdateStatus::Processed(processed) => Some(processed.processed_at),
+            UpdateStatus::Failed(failed) => Some(failed.failed_at),
+            UpdateStatus::Processing(_)
+            | UpdateStatus::Enqueued(_)
+            | UpdateStatus::Aborted(_) => None,
+        }
+    }
 }
--- a/index-scheduler/Cargo.toml
+++ b/index-scheduler/Cargo.toml
@@ -12,6 +12,7 @@ license.workspace = true

 [dependencies]
 anyhow = "1.0.70"
+backtrace = "0.3.69"
 bincode = "1.3.3"
 csv = "1.2.1"
 derive_builder = "0.12.0"
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -825,6 +825,10 @@ impl IndexScheduler {
                // 2. dump the tasks
                let mut dump_tasks = dump.create_tasks_queue()?;
                for ret in self.all_tasks.iter(&rtxn)? {
+                    if self.must_stop_processing.get() {
+                        return Err(Error::AbortedTask);
+                    }
+
                    let (_, mut t) = ret?;
                    let status = t.status;
                    let content_file = t.content_uuid();
@@ -845,6 +849,9 @@ impl IndexScheduler {

                    // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
                    if let Some(content_file) = content_file {
+                        if self.must_stop_processing.get() {
+                            return Err(Error::AbortedTask);
+                        }
                        if status == Status::Enqueued {
                            let content_file = self.file_store.get_update(content_file)?;

@@ -884,6 +891,9 @@ impl IndexScheduler {

                    // 3.1. Dump the documents
                    for ret in index.all_documents(&rtxn)? {
+                        if self.must_stop_processing.get() {
+                            return Err(Error::AbortedTask);
+                        }
                        let (_id, doc) = ret?;
                        let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
                        index_dumper.push_document(&document)?;
@@ -903,6 +913,9 @@ impl IndexScheduler {
                    "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
                )).unwrap();

+                if self.must_stop_processing.get() {
+                    return Err(Error::AbortedTask);
+                }
                let path = self.dumps_path.join(format!("{}.dump", dump_uid));
                let file = File::create(path)?;
                dump.persist_to(BufWriter::new(file))?;
--- a/index-scheduler/src/error.rs
+++ b/index-scheduler/src/error.rs
@@ -108,6 +108,8 @@ pub enum Error {
    TaskDeletionWithEmptyQuery,
    #[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
    TaskCancelationWithEmptyQuery,
+    #[error("Aborted task")]
+    AbortedTask,

    #[error(transparent)]
    Dump(#[from] dump::Error),
@@ -115,8 +117,13 @@ pub enum Error {
    Heed(#[from] heed::Error),
    #[error(transparent)]
    Milli(#[from] milli::Error),
-    #[error("An unexpected crash occurred when processing the task.")]
-    ProcessBatchPanicked,
+    #[error("An unexpected crash occurred when processing the task. {}", {
+        match .0 {
+            Some(report) => format!("Get /reports/{}", report),
+            None => "No report was saved.".into(),
+        }
+    })]
+    ProcessBatchPanicked(Option<uuid::Uuid>),
    #[error(transparent)]
    FileStore(#[from] file_store::Error),
    #[error(transparent)]
@@ -175,10 +182,11 @@ impl Error {
            | Error::TaskNotFound(_)
            | Error::TaskDeletionWithEmptyQuery
            | Error::TaskCancelationWithEmptyQuery
+            | Error::AbortedTask
            | Error::Dump(_)
            | Error::Heed(_)
            | Error::Milli(_)
-            | Error::ProcessBatchPanicked
+            | Error::ProcessBatchPanicked(_)
            | Error::FileStore(_)
            | Error::IoError(_)
            | Error::Persist(_)
@@ -221,7 +229,7 @@ impl ErrorCode for Error {
            Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice,
            Error::Dump(e) => e.error_code(),
            Error::Milli(e) => e.error_code(),
-            Error::ProcessBatchPanicked => Code::Internal,
+            Error::ProcessBatchPanicked(_) => Code::Internal,
            Error::Heed(e) => e.error_code(),
            Error::HeedTransaction(e) => e.error_code(),
            Error::FileStore(e) => e.error_code(),
@@ -236,6 +244,9 @@ impl ErrorCode for Error {
            Error::TaskDatabaseUpdate(_) => Code::Internal,
            Error::CreateBatch(_) => Code::Internal,

+            // This one should never be seen by the end user
+            Error::AbortedTask => Code::Internal,
+
            #[cfg(test)]
            Error::PlannedFailure => Code::Internal,
        }
--- a/index-scheduler/src/insta_snapshot.rs
+++ b/index-scheduler/src/insta_snapshot.rs
@@ -39,6 +39,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
        test_breakpoint_sdr: _,
        planned_failures: _,
        run_loop_iteration: _,
+        panic_reader: _,
    } = scheduler;

    let rtxn = env.read_txn().unwrap();
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -26,8 +26,9 @@ mod index_mapper;
 #[cfg(test)]
 mod insta_snapshot;
 mod lru;
+mod panic_hook;
 mod utils;
-pub mod uuid_codec;
+mod uuid_codec;

 pub type Result<T> = std::result::Result<T, Error>;
 pub type TaskId = u32;
@@ -53,6 +54,8 @@ use meilisearch_types::milli::documents::DocumentsBatchBuilder;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
+use panic_hook::ReportReader;
+pub use panic_hook::{Panic, Report, ReportRegistry};
 use puffin::FrameView;
 use roaring::RoaringBitmap;
 use synchronoise::SignalEvent;
@@ -331,6 +334,8 @@ pub struct IndexScheduler {
    /// The path to the version file of Meilisearch.
    pub(crate) version_file_path: PathBuf,

+    pub(crate) panic_reader: ReportReader,
+
    // ================= test
    // The next entry is dedicated to the tests.
    /// Provide a way to set a breakpoint in multiple part of the scheduler.
@@ -381,6 +386,7 @@ impl IndexScheduler {
            #[cfg(test)]
            run_loop_iteration: self.run_loop_iteration.clone(),
            features: self.features.clone(),
+            panic_reader: self.panic_reader.clone(),
        }
    }
 }
@@ -438,6 +444,12 @@ impl IndexScheduler {
        let finished_at = env.create_database(&mut wtxn, Some(db_name::FINISHED_AT))?;
        wtxn.commit()?;

+        const MAX_REPORT_COUNT: usize = 20;
+
+        let panic_reader = panic_hook::ReportReader::install_panic_hook(
+            std::num::NonZeroUsize::new(MAX_REPORT_COUNT).unwrap(),
+        );
+
        // allow unreachable_code to get rid of the warning in the case of a test build.
        let this = Self {
            must_stop_processing: MustStopProcessing::default(),
@@ -478,6 +490,7 @@ impl IndexScheduler {
            #[cfg(test)]
            run_loop_iteration: Arc::new(RwLock::new(0)),
            features,
+            panic_reader,
        };

        this.run();
@@ -1130,7 +1143,10 @@ impl IndexScheduler {
                .name(String::from("batch-operation"))
                .spawn(move || cloned_index_scheduler.process_batch(batch))
                .unwrap();
-            handle.join().unwrap_or(Err(Error::ProcessBatchPanicked))
+
+            self.panic_reader
+                .join_thread(handle)
+                .unwrap_or_else(|maybe_report| Err(Error::ProcessBatchPanicked(maybe_report)))
        };

        #[cfg(test)]
@@ -1167,7 +1183,8 @@ impl IndexScheduler {
            // If we have an abortion error we must stop the tick here and re-schedule tasks.
            Err(Error::Milli(milli::Error::InternalError(
                milli::InternalError::AbortedIndexation,
-            ))) => {
+            )))
+            | Err(Error::AbortedTask) => {
                #[cfg(test)]
                self.breakpoint(Breakpoint::AbortedIndexation);
                wtxn.abort().map_err(Error::HeedTransaction)?;
@@ -1310,6 +1327,10 @@ impl IndexScheduler {
        }
    }

+    pub fn reports(&self) -> Arc<RwLock<ReportRegistry>> {
+        self.panic_reader.registry()
+    }
+
    /// Blocks the thread until the test handle asks to progress to/through this breakpoint.
    ///
    /// Two messages are sent through the channel for each breakpoint.
@@ -4323,4 +4344,26 @@ mod tests {
        }
        "###);
    }
+
+    /// Checks that a dump creation can be canceled while it is being processed.
+    #[test]
+    fn cancel_processing_dump() {
+        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+
+        let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
+        // The cancelation targets task 0, i.e. the dump creation registered first below.
+        let dump_cancellation = KindWithContent::TaskCancelation {
+            query: "cancel dump".to_owned(),
+            tasks: RoaringBitmap::from_iter([0]),
+        };
+        let _ = index_scheduler.register(dump_creation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
+        // Advance until the scheduler is inside the processing of the dump batch.
+        handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
+
+        // Register the cancelation while the dump is still being processed.
+        let _ = index_scheduler.register(dump_cancellation).unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
+
+        // The next breakpoint reached must be the abortion one.
+        snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
+
+        // Let the next batch run to completion before taking the final snapshot.
+        handle.advance_one_successful_batch();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
+    }
 }
--- a/index-scheduler/src/panic_hook.rs
+++ b/index-scheduler/src/panic_hook.rs
@@ -0,0 +1,211 @@
+//! Panic hook designed to fetch a panic from a subthread and recover it on join.
+
+use std::collections::VecDeque;
+use std::num::NonZeroUsize;
+use std::panic::PanicInfo;
+use std::sync::{Arc, RwLock};
+use std::thread::{JoinHandle, ThreadId};
+
+use backtrace::Backtrace;
+
+/// Represents a panic in a shallowly structured fashion.
+pub struct Panic {
+    /// Panic message, when the panic payload was a `&str` or a `String`.
+    pub payload: Option<String>,
+    /// Location of the panic formatted as `file:line:column`, when available.
+    pub location: Option<String>,
+    /// Name of the thread that recorded the panic, if it has one.
+    pub thread_name: Option<String>,
+    /// Id of the thread that recorded the panic.
+    pub thread_id: ThreadId,
+    /// Backtrace captured when the panic was recorded.
+    pub backtrace: Backtrace,
+}
+
+/// A panic enriched with a unique id
+#[derive(serde::Serialize)]
+pub struct Report {
+    /// Unique identifier of this report.
+    pub id: uuid::Uuid,
+    /// The recorded panic; serialized through its JSON representation (`Panic::to_json`).
+    #[serde(serialize_with = "serialize_panic")]
+    pub panic: Panic,
+}
+
+/// Serde helper that serializes a [`Panic`] through its JSON representation.
+fn serialize_panic<S>(panic: &Panic, s: S) -> std::result::Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    let as_json = panic.to_json();
+    serde::Serialize::serialize(&as_json, s)
+}
+
+impl Report {
+    /// Wraps `panic` in a report identified by a newly generated v4 UUID.
+    pub fn new(panic: Panic) -> Self {
+        let id = uuid::Uuid::new_v4();
+        Self { id, panic }
+    }
+}
+
+impl Panic {
+    /// Returns the JSON representation of this panic: its payload, location,
+    /// thread id and name, and backtrace.
+    pub fn to_json(&self) -> serde_json::Value {
+        json::panic_to_json(self)
+    }
+}
+
+/// Conversion of a [`Panic`] and its backtrace to JSON values.
+mod json {
+    use backtrace::{Backtrace, BacktraceFrame, BacktraceSymbol};
+    use serde_json::{json, Value};
+
+    use super::Panic;
+
+    /// Converts a backtrace symbol to an object with the keys `function`, `filename`,
+    /// `line`, `column` and `address`. Missing information is serialized as `null`.
+    fn symbol_to_json(symbol: &BacktraceSymbol) -> Value {
+        let address = symbol.addr().map(|addr| format!("{:p}", addr));
+        let column = symbol.colno();
+        let line = symbol.lineno();
+        let function = symbol.name().map(|name| name.to_string());
+        // Convert the path lossily to a string: serializing a `Path` fails when it is
+        // not valid UTF-8, and `json!` would turn that failure into a panic.
+        let filename = symbol.filename().map(|path| path.to_string_lossy().into_owned());
+        json!({
+            "function": function,
+            "filename": filename,
+            "line": line,
+            "column": column,
+            "address": address,
+        })
+    }
+
+    /// Converts a backtrace frame to JSON.
+    ///
+    /// - no symbol: an object containing only the `address` of the frame,
+    /// - exactly one symbol: that symbol's object,
+    /// - several symbols: an array of symbol objects.
+    fn frame_to_json(frame: &BacktraceFrame) -> Value {
+        let symbols: Vec<_> = frame.symbols().iter().map(symbol_to_json).collect();
+        match symbols.as_slice() {
+            [] => {
+                let address = format!("{:p}", frame.ip());
+                json!({"address": address})
+            }
+            [symbol] => json!(symbol),
+            symbols => json!(symbols),
+        }
+    }
+
+    /// Converts a backtrace to a JSON array containing one entry per frame.
+    fn backtrace_to_json(backtrace: &Backtrace) -> Value {
+        let frames: Vec<_> = backtrace.frames().iter().map(frame_to_json).collect();
+        json!(frames)
+    }
+
+    /// Converts a panic to a JSON object with the keys `payload`, `location`,
+    /// `thread` (itself holding `id` and `name`) and `backtrace`.
+    pub fn panic_to_json(panic: &Panic) -> Value {
+        // `ThreadId` is rendered through its `Debug` representation.
+        let thread_id = format!("{:?}", panic.thread_id);
+        serde_json::json!({
+            "payload": panic.payload,
+            "location": panic.location,
+            "thread": {
+                "id": thread_id,
+                "name": panic.thread_name,
+            },
+            "backtrace": backtrace_to_json(&panic.backtrace),
+        })
+    }
+}
+
+/// Handle on the shared report registry used by the panic hook to store new reports.
+struct ReportWriter(Arc<RwLock<ReportRegistry>>);
+
+/// A bounded FIFO queue of reports, iterated from the most recent to the oldest.
+///
+/// Once the registry is full, pushing a new report evicts the oldest one.
+pub struct ReportRegistry {
+    reports: std::collections::VecDeque<Report>,
+    /// Maximum number of reports retained at any time.
+    ///
+    /// Tracked explicitly because `VecDeque::with_capacity` only guarantees space for
+    /// *at least* the requested number of elements, so `VecDeque::capacity` is not a
+    /// reliable bound.
+    max_reports: NonZeroUsize,
+}
+
+impl ReportRegistry {
+    /// Creates an empty registry retaining at most `capacity` reports.
+    pub fn new(capacity: NonZeroUsize) -> Self {
+        Self { reports: VecDeque::with_capacity(capacity.get()), max_reports: capacity }
+    }
+
+    /// Adds `report` as the most recent entry.
+    ///
+    /// Returns the evicted oldest report when the registry was already full,
+    /// `None` otherwise.
+    pub fn push(&mut self, report: Report) -> Option<Report> {
+        let popped = if self.reports.len() >= self.max_reports.get() {
+            self.reports.pop_back()
+        } else {
+            None
+        };
+        self.reports.push_front(report);
+        popped
+    }
+
+    /// Iterates over the reports, starting with the most recently pushed one.
+    pub fn iter(&self) -> impl Iterator<Item = &Report> {
+        self.reports.iter()
+    }
+
+    /// Returns the report whose id is `report_id`, if it is still in the registry.
+    pub fn find(&self, report_id: uuid::Uuid) -> Option<&Report> {
+        self.iter().find(|report| report.id == report_id)
+    }
+}
+
+impl ReportWriter {
+    /// Builds a [`Report`] from `panic_info`, logs it as an error and stores it in the
+    /// shared registry.
+    ///
+    /// The thread name and id recorded in the report are those of the calling thread.
+    #[track_caller]
+    fn write_panic(&self, panic_info: &PanicInfo<'_>) {
+        // Only `&str` and `String` payloads are kept; any other payload yields `None`.
+        let payload = panic_info
+            .payload()
+            .downcast_ref::<&str>()
+            .map(ToString::to_string)
+            .or_else(|| panic_info.payload().downcast_ref::<String>().cloned());
+        let location = panic_info.location().map(|loc| {
+            format!(
+                "{file}:{line}:{column}",
+                file = loc.file(),
+                line = loc.line(),
+                column = loc.column()
+            )
+        });
+
+        let thread_name = std::thread::current().name().map(ToString::to_string);
+        let thread_id = std::thread::current().id();
+        let backtrace = backtrace::Backtrace::new();
+
+        let panic = Panic { payload, location, thread_name, thread_id, backtrace };
+
+        let report = Report::new(panic);
+
+        log::error!(
+            "An unexpected panic occurred on thread {name} at {location}: {payload}. See report '{report}' for details.",
+            payload = report.panic.payload.as_deref().unwrap_or("Box<dyn Any>"),
+            name = report.panic.thread_name.as_deref().unwrap_or("<unnamed>"),
+            location = report.panic.location.as_deref().unwrap_or("<unknown>"),
+            report = report.id,
+        );
+
+        // If the lock cannot be acquired (poisoned), the report is not stored;
+        // it has still been logged above.
+        if let Ok(mut registry) = self.0.write() {
+            if let Some(old_report) = registry.push(report) {
+                log::trace!("Forgetting report {} to make space for new report.", old_report.id)
+            }
+        }
+    }
+}
+
+/// Reads the reports written in case of a panic.
+#[derive(Clone)]
+pub struct ReportReader(Arc<RwLock<ReportRegistry>>);
+
+impl ReportReader {
+    /// Installs a new global panic hook, overriding any existing hook.
+    ///
+    /// The hook writes any incoming panic in reports, keeping at most `capacity` of them.
+    /// The reports can then be read by the returned [`ReportReader`].
+    pub fn install_panic_hook(capacity: NonZeroUsize) -> Self {
+        let registry = Arc::new(RwLock::new(ReportRegistry::new(capacity)));
+        let reader = ReportReader(registry.clone());
+        let writer = ReportWriter(registry);
+
+        std::panic::set_hook(Box::new(move |panic_info| writer.write_panic(panic_info)));
+        reader
+    }
+
+    /// Join the thread corresponding to the passed handle, recovering either its value
+    /// or, in case the thread panicked, the id of the report corresponding to the panic.
+    ///
+    /// The error is `None` when no report matching the thread could be found, or when
+    /// the registry could not be read.
+    ///
+    /// The id can be used to read the report from the [`Self::registry`].
+    pub fn join_thread<T>(&self, thread: JoinHandle<T>) -> Result<T, Option<uuid::Uuid>> {
+        let thread_id = thread.thread().id();
+        thread.join().map_err(|_e| {
+            // A poisoned lock must not turn the recovery of a panic into another panic:
+            // behave as if no report had been saved, like the writer does.
+            self.0.read().ok().and_then(|registry| {
+                registry
+                    .iter()
+                    .find(|report| report.panic.thread_id == thread_id)
+                    .map(|report| report.id)
+            })
+        })
+    }
+
+    /// Returns a registry that can be used to read the reports written during a panic.
+    pub fn registry(&self) -> Arc<RwLock<ReportRegistry>> {
+        self.0.clone()
+    }
+}
--- a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/after_dump_register.snap
@@ -0,0 +1,35 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_processed.snap
@@ -0,0 +1,45 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,]
+canceled [0,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+1 [0,]
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
+++ b/index-scheduler/src/snapshots/lib.rs/cancel_processing_dump/cancel_registered.snap
@@ -0,0 +1,38 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[0,]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
+1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"taskCancelation" [1,]
+"dumpCreation" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/meilisearch-types/Cargo.toml
+++ b/meilisearch-types/Cargo.toml
@@ -50,7 +50,6 @@ hebrew = ["milli/hebrew"]
 japanese = ["milli/japanese"]
 # thai specialized tokenization
 thai = ["milli/thai"]
+
 # allow greek specialized tokenization
 greek = ["milli/greek"]
-# allow khmer specialized tokenization
-khmer = ["milli/khmer"]
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -88,7 +88,6 @@ pub trait ErrorCode {
    }
 }

-#[allow(clippy::enum_variant_names)]
 enum ErrorType {
    Internal,
    InvalidRequest,
@@ -298,6 +297,7 @@ MissingSwapIndexes                    , InvalidRequest       , BAD_REQUEST ;
 MissingTaskFilters                    , InvalidRequest       , BAD_REQUEST ;
 NoSpaceLeftOnDevice                   , System               , UNPROCESSABLE_ENTITY;
 PayloadTooLarge                       , InvalidRequest       , PAYLOAD_TOO_LARGE ;
+ReportNotFound                        , InvalidRequest       , NOT_FOUND ;
 TaskNotFound                          , InvalidRequest       , NOT_FOUND ;
 TooManyOpenFiles                      , System               , UNPROCESSABLE_ENTITY ;
 UnretrievableDocument                 , Internal             , BAD_REQUEST ;
--- a/meilisearch/Cargo.toml
+++ b/meilisearch/Cargo.toml
@@ -150,7 +150,6 @@ hebrew = ["meilisearch-types/hebrew"]
 japanese = ["meilisearch-types/japanese"]
 thai = ["meilisearch-types/thai"]
 greek = ["meilisearch-types/greek"]
-khmer = ["meilisearch-types/khmer"]

 [package.metadata.mini-dashboard]
 assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"
--- a/meilisearch/src/error.rs
+++ b/meilisearch/src/error.rs
@@ -51,6 +51,8 @@ pub enum MeilisearchHttpError {
    DocumentFormat(#[from] DocumentFormatError),
    #[error(transparent)]
    Join(#[from] JoinError),
+    #[error("Report `{0}` not found. Either its id is incorrect, or it was deleted. To save on memory, only a limited amount of reports are kept.")]
+    ReportNotFound(uuid::Uuid),
 }

 impl ErrorCode for MeilisearchHttpError {
@@ -74,6 +76,7 @@ impl ErrorCode for MeilisearchHttpError {
            MeilisearchHttpError::FileStore(_) => Code::Internal,
            MeilisearchHttpError::DocumentFormat(e) => e.error_code(),
            MeilisearchHttpError::Join(_) => Code::Internal,
+            MeilisearchHttpError::ReportNotFound(_) => Code::ReportNotFound,
        }
    }
 }
--- a/meilisearch/src/routes/mod.rs
+++ b/meilisearch/src/routes/mod.rs
@@ -24,6 +24,7 @@ pub mod features;
 pub mod indexes;
 mod metrics;
 mod multi_search;
+mod reports;
 mod snapshot;
 mod swap_indexes;
 pub mod tasks;
@@ -40,7 +41,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
        .service(web::scope("/multi-search").configure(multi_search::configure))
        .service(web::scope("/swap-indexes").configure(swap_indexes::configure))
        .service(web::scope("/metrics").configure(metrics::configure))
-        .service(web::scope("/experimental-features").configure(features::configure));
+        .service(web::scope("/experimental-features").configure(features::configure))
+        .service(web::scope("/reports").configure(reports::configure));
 }

 #[derive(Debug, Serialize)]
--- a/meilisearch/src/routes/reports.rs
+++ b/meilisearch/src/routes/reports.rs
@@ -0,0 +1,39 @@
+use actix_web::web::{self, Data};
+use actix_web::HttpResponse;
+use index_scheduler::{IndexScheduler, Report};
+use meilisearch_types::error::ResponseError;
+use meilisearch_types::keys::actions;
+
+use crate::extractors::authentication::policies::ActionPolicy;
+use crate::extractors::authentication::GuardedData;
+use crate::extractors::sequential_extractor::SeqHandler;
+
+/// Mounts the report routes: `GET ""` lists the reports, `GET "/{report_uid}"` returns one.
+pub fn configure(cfg: &mut web::ServiceConfig) {
+    let all = web::resource("").route(web::get().to(list_reports));
+    let one = web::resource("").route(web::get().to(SeqHandler(get_report)));
+    cfg.service(all).service(web::scope("/{report_uid}").service(one));
+}
+
+/// Responds with a JSON array of every report the scheduler's registry currently yields.
+pub async fn list_reports(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_ALL }>, Data<IndexScheduler>>,
+) -> Result<HttpResponse, ResponseError> {
+    let registry = index_scheduler.reports();
+    let guard = registry.read().unwrap();
+    let listed: Vec<&Report> = guard.iter().collect();
+    Ok(HttpResponse::Ok().json(listed))
+}
+
+/// Responds with the report matching `report_id` as JSON, or `ReportNotFound` if there is none.
+pub async fn get_report(
+    index_scheduler: GuardedData<ActionPolicy<{ actions::SETTINGS_ALL }>, Data<IndexScheduler>>,
+    report_id: web::Path<uuid::Uuid>,
+) -> Result<HttpResponse, ResponseError> {
+    let registry = index_scheduler.reports();
+    let guard = registry.read().unwrap();
+    match guard.find(*report_id) {
+        Some(report) => Ok(HttpResponse::Ok().json(report)),
+        None => Err(crate::error::MeilisearchHttpError::ReportNotFound(*report_id).into()),
+    }
+}
--- a/meilisearch/tests/common/mod.rs
+++ b/meilisearch/tests/common/mod.rs
@@ -5,11 +5,9 @@ pub mod service;

 use std::fmt::{self, Display};

-#[allow(unused)]
 pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
 use meili_snap::json_string;
 use serde::{Deserialize, Serialize};
-#[allow(unused)]
 pub use server::{default_settings, Server};

 #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
--- a/meilisearch/tests/search/distinct.rs
+++ b/meilisearch/tests/search/distinct.rs
@@ -6,109 +6,21 @@ use crate::json;

 pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
    json!([
-      {
-        "id": 1,
-        "description": "Leather Jacket",
-        "brand": "Lee Jeans",
-        "product_id": "123456",
-        "color": "Brown"
-      },
-      {
-        "id": 2,
-        "description": "Leather Jacket",
-        "brand": "Lee Jeans",
-        "product_id": "123456",
-        "color": "Black"
-      },
-      {
-        "id": 3,
-        "description": "Leather Jacket",
-        "brand": "Lee Jeans",
-        "product_id": "123456",
-        "color": "Blue"
-      },
-      {
-        "id": 4,
-        "description": "T-Shirt",
-        "brand": "Nike",
-        "product_id": "789012",
-        "color": "Red"
-      },
-      {
-        "id": 5,
-        "description": "T-Shirt",
-        "brand": "Nike",
-        "product_id": "789012",
-        "color": "Blue"
-      },
-      {
-        "id": 6,
-        "description": "Running Shoes",
-        "brand": "Adidas",
-        "product_id": "456789",
-        "color": "Black"
-      },
-      {
-        "id": 7,
-        "description": "Running Shoes",
-        "brand": "Adidas",
-        "product_id": "456789",
-        "color": "White"
-      },
-      {
-        "id": 8,
-        "description": "Hoodie",
-        "brand": "Puma",
-        "product_id": "987654",
-        "color": "Gray"
-      },
-      {
-        "id": 9,
-        "description": "Sweater",
-        "brand": "Gap",
-        "product_id": "234567",
-        "color": "Green"
-      },
-      {
-        "id": 10,
-        "description": "Sweater",
-        "brand": "Gap",
-        "product_id": "234567",
-        "color": "Red"
-      },
-      {
-        "id": 11,
-        "description": "Sweater",
-        "brand": "Gap",
-        "product_id": "234567",
-        "color": "Blue"
-      },
-      {
-        "id": 12,
-        "description": "Jeans",
-        "brand": "Levi's",
-        "product_id": "345678",
-        "color": "Indigo"
-      },
-      {
-        "id": 13,
-        "description": "Jeans",
-        "brand": "Levi's",
-        "product_id": "345678",
-        "color": "Black"
-      },
-      {
-        "id": 14,
-        "description": "Jeans",
-        "brand": "Levi's",
-        "product_id": "345678",
-        "color": "Stone Wash"
-      }
+        {"productId": 1, "shopId": 1},
+        {"productId": 2, "shopId": 1},
+        {"productId": 3, "shopId": 2},
+        {"productId": 4, "shopId": 2},
+        {"productId": 5, "shopId": 3},
+        {"productId": 6, "shopId": 3},
+        {"productId": 7, "shopId": 4},
+        {"productId": 8, "shopId": 4},
+        {"productId": 9, "shopId": 5},
+        {"productId": 10, "shopId": 5}
    ])
 });

-pub(self) static DOCUMENT_PRIMARY_KEY: &str = "id";
-pub(self) static DOCUMENT_DISTINCT_KEY: &str = "product_id";
+pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
+pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";

 /// testing: https://github.com/meilisearch/meilisearch/issues/4078
 #[actix_rt::test]
@@ -121,121 +33,31 @@ async fn distinct_search_with_offset_no_ranking() {
    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
    index.wait_task(1).await;

-    fn get_hits(response: &Value) -> Vec<&str> {
+    fn get_hits(Value(response): Value) -> Vec<i64> {
        let hits_array = response["hits"].as_array().unwrap();
-        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
+        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
    }

-    let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await;
-    let hits = get_hits(&response);
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
+    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
-    snapshot!(response["estimatedTotalHits"] , @"11");
+    snapshot!(format!("{:?}", hits), @"[1, 2]");

-    let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await;
-    let hits = get_hits(&response);
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
+    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
-    snapshot!(response["estimatedTotalHits"], @"10");
+    snapshot!(format!("{:?}", hits), @"[3, 4]");

-    let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
-    snapshot!(response["estimatedTotalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await;
-    let hits = get_hits(&response);
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
+    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"1");
-    snapshot!(format!("{:?}", hits), @r#"["345678"]"#);
-    snapshot!(response["estimatedTotalHits"], @"6");
+    snapshot!(format!("{:?}", hits), @"[5]");

-    let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await;
-    let hits = get_hits(&response);
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
+    let hits = get_hits(response);
    snapshot!(code, @"200 OK");
    snapshot!(hits.len(), @"0");
-    snapshot!(format!("{:?}", hits), @r#"[]"#);
-    snapshot!(response["estimatedTotalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"0");
-    snapshot!(format!("{:?}", hits), @r#"[]"#);
-    snapshot!(response["estimatedTotalHits"], @"6");
-}
-
-/// testing: https://github.com/meilisearch/meilisearch/issues/4130
-#[actix_rt::test]
-async fn distinct_search_with_pagination_no_ranking() {
-    let server = Server::new().await;
-    let index = server.index("test");
-
-    let documents = DOCUMENTS.clone();
-    index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
-    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
-    index.wait_task(1).await;
-
-    fn get_hits(response: &Value) -> Vec<&str> {
-        let hits_array = response["hits"].as_array().unwrap();
-        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
-    }
-
-    let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"0");
-    snapshot!(format!("{:?}", hits), @r#"[]"#);
-    snapshot!(response["page"], @"0");
-    snapshot!(response["totalPages"], @"3");
-    snapshot!(response["totalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
-    snapshot!(response["page"], @"1");
-    snapshot!(response["totalPages"], @"3");
-    snapshot!(response["totalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
-    snapshot!(response["page"], @"2");
-    snapshot!(response["totalPages"], @"3");
-    snapshot!(response["totalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"2");
-    snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
-    snapshot!(response["page"], @"3");
-    snapshot!(response["totalPages"], @"3");
-    snapshot!(response["totalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"0");
-    snapshot!(format!("{:?}", hits), @r#"[]"#);
-    snapshot!(response["page"], @"4");
-    snapshot!(response["totalPages"], @"3");
-    snapshot!(response["totalHits"], @"6");
-
-    let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await;
-    let hits = get_hits(&response);
-    snapshot!(code, @"200 OK");
-    snapshot!(hits.len(), @"3");
-    snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#);
-    snapshot!(response["page"], @"2");
-    snapshot!(response["totalPages"], @"2");
-    snapshot!(response["totalHits"], @"6");
 }
--- a/meilitool/Cargo.toml
+++ b/meilitool/Cargo.toml
@@ -1,19 +0,0 @@
-[package]
-name = "meilitool"
-description = "A CLI to edit a Meilisearch database from the command line"
-version.workspace = true
-authors.workspace = true
-homepage.workspace = true
-readme.workspace = true
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-anyhow = "1.0.75"
-clap = { version = "4.2.1", features = ["derive"] }
-dump = { path = "../dump" }
-file-store = { path = "../file-store" }
-meilisearch-auth = { path = "../meilisearch-auth" }
-meilisearch-types = { path = "../meilisearch-types" }
-time = { version = "0.3.30", features = ["formatting"] }
-uuid = { version = "1.5.0", features = ["v4"], default-features = false }
--- a/meilitool/src/main.rs
+++ b/meilitool/src/main.rs
@@ -1,312 +0,0 @@
-use std::fs::{read_dir, read_to_string, remove_file, File};
-use std::io::BufWriter;
-use std::path::PathBuf;
-
-use anyhow::Context;
-use clap::{Parser, Subcommand};
-use dump::{DumpWriter, IndexMetadata};
-use file_store::FileStore;
-use meilisearch_auth::AuthController;
-use meilisearch_types::heed::types::{OwnedType, SerdeJson, Str};
-use meilisearch_types::heed::{Database, Env, EnvOpenOptions, PolyDatabase, RoTxn, RwTxn};
-use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
-use meilisearch_types::milli::{obkv_to_json, BEU32};
-use meilisearch_types::tasks::{Status, Task};
-use meilisearch_types::versioning::check_version_file;
-use meilisearch_types::Index;
-use time::macros::format_description;
-use time::OffsetDateTime;
-use uuid_codec::UuidCodec;
-
-mod uuid_codec;
-
-#[derive(Parser)]
-#[command(author, version, about, long_about = None)]
-struct Cli {
-    /// The database path where the Meilisearch is running.
-    #[arg(long, default_value = "data.ms/")]
-    db_path: PathBuf,
-
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Subcommand)]
-enum Command {
-    /// Clears the task queue and make it empty.
-    ///
-    /// This command can be safely executed even if Meilisearch is running and processing tasks.
-    /// Once the task queue is empty you can restart Meilisearch and no more tasks must be visible,
-    /// even the ones that were processing. However, it's highly possible that you see the processing
-    /// tasks in the queue again with an associated internal error message.
-    ClearTaskQueue,
-
-    /// Exports a dump from the Meilisearch database.
-    ///
-    /// Make sure to run this command when Meilisearch is not running or running but not processing tasks.
-    /// If tasks are being processed while a dump is being exported there are chances for the dump to be
-    /// malformed with missing tasks.
-    ///
-    /// TODO Verify this claim or make sure it cannot happen and we can export dumps
-    ///      without caring about killing Meilisearch first!
-    ExportADump {
-        /// The directory in which the dump will be created.
-        #[arg(long, default_value = "dumps/")]
-        dump_dir: PathBuf,
-
-        /// Skip dumping the enqueued or processing tasks.
-        ///
-        /// Can be useful when there are a lot of them and it is not particularly useful
-        /// to keep them. Note that only the enqueued tasks takes up space so skipping
-        /// the processed ones is not particularly interesting.
-        #[arg(long)]
-        skip_enqueued_tasks: bool,
-    },
-}
-
-fn main() -> anyhow::Result<()> {
-    let Cli { db_path, command } = Cli::parse();
-
-    check_version_file(&db_path).context("While checking the version file")?;
-
-    match command {
-        Command::ClearTaskQueue => clear_task_queue(db_path),
-        Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
-            export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
-        }
-    }
-}
-
-/// Clears the task queue located at `db_path`.
-fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
-    let path = db_path.join("tasks");
-    let env = EnvOpenOptions::new()
-        .max_dbs(100)
-        .open(&path)
-        .with_context(|| format!("While trying to open {:?}", path.display()))?;
-
-    eprintln!("Deleting tasks from the database...");
-
-    let mut wtxn = env.write_txn()?;
-    let all_tasks = try_opening_poly_database(&env, &wtxn, "all-tasks")?;
-    let total = all_tasks.len(&wtxn)?;
-    let status = try_opening_poly_database(&env, &wtxn, "status")?;
-    let kind = try_opening_poly_database(&env, &wtxn, "kind")?;
-    let index_tasks = try_opening_poly_database(&env, &wtxn, "index-tasks")?;
-    let canceled_by = try_opening_poly_database(&env, &wtxn, "canceled_by")?;
-    let enqueued_at = try_opening_poly_database(&env, &wtxn, "enqueued-at")?;
-    let started_at = try_opening_poly_database(&env, &wtxn, "started-at")?;
-    let finished_at = try_opening_poly_database(&env, &wtxn, "finished-at")?;
-
-    try_clearing_poly_database(&mut wtxn, all_tasks, "all-tasks")?;
-    try_clearing_poly_database(&mut wtxn, status, "status")?;
-    try_clearing_poly_database(&mut wtxn, kind, "kind")?;
-    try_clearing_poly_database(&mut wtxn, index_tasks, "index-tasks")?;
-    try_clearing_poly_database(&mut wtxn, canceled_by, "canceled_by")?;
-    try_clearing_poly_database(&mut wtxn, enqueued_at, "enqueued-at")?;
-    try_clearing_poly_database(&mut wtxn, started_at, "started-at")?;
-    try_clearing_poly_database(&mut wtxn, finished_at, "finished-at")?;
-
-    wtxn.commit().context("While committing the transaction")?;
-
-    eprintln!("Successfully deleted {total} tasks from the tasks database!");
-    eprintln!("Deleting the content files from disk...");
-
-    let mut count = 0usize;
-    let update_files = db_path.join("update_files");
-    let entries = read_dir(&update_files).with_context(|| {
-        format!("While trying to read the content of {:?}", update_files.display())
-    })?;
-    for result in entries {
-        match result {
-            Ok(ent) => match remove_file(ent.path()) {
-                Ok(_) => count += 1,
-                Err(e) => eprintln!("Error while deleting {:?}: {}", ent.path().display(), e),
-            },
-            Err(e) => {
-                eprintln!("Error while reading a file in {:?}: {}", update_files.display(), e)
-            }
-        }
-    }
-
-    eprintln!("Sucessfully deleted {count} content files from disk!");
-
-    Ok(())
-}
-
-fn try_opening_database<KC: 'static, DC: 'static>(
-    env: &Env,
-    rtxn: &RoTxn,
-    db_name: &str,
-) -> anyhow::Result<Database<KC, DC>> {
-    env.open_database(rtxn, Some(db_name))
-        .with_context(|| format!("While opening the {db_name:?} database"))?
-        .with_context(|| format!("Missing the {db_name:?} database"))
-}
-
-fn try_opening_poly_database(
-    env: &Env,
-    rtxn: &RoTxn,
-    db_name: &str,
-) -> anyhow::Result<PolyDatabase> {
-    env.open_poly_database(rtxn, Some(db_name))
-        .with_context(|| format!("While opening the {db_name:?} poly database"))?
-        .with_context(|| format!("Missing the {db_name:?} poly database"))
-}
-
-fn try_clearing_poly_database(
-    wtxn: &mut RwTxn,
-    database: PolyDatabase,
-    db_name: &str,
-) -> anyhow::Result<()> {
-    database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
-}
-
-/// Exports a dump into the dump directory.
-fn export_a_dump(
-    db_path: PathBuf,
-    dump_dir: PathBuf,
-    skip_enqueued_tasks: bool,
-) -> Result<(), anyhow::Error> {
-    let started_at = OffsetDateTime::now_utc();
-
-    // 1. Extracts the instance UID from disk
-    let instance_uid_path = db_path.join("instance-uid");
-    let instance_uid = match read_to_string(&instance_uid_path) {
-        Ok(content) => match content.trim().parse() {
-            Ok(uuid) => Some(uuid),
-            Err(e) => {
-                eprintln!("Impossible to parse instance-uid: {e}");
-                None
-            }
-        },
-        Err(e) => {
-            eprintln!("Impossible to read {}: {}", instance_uid_path.display(), e);
-            None
-        }
-    };
-
-    let dump = DumpWriter::new(instance_uid).context("While creating a new dump")?;
-    let file_store =
-        FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
-
-    let index_scheduler_path = db_path.join("tasks");
-    let env = EnvOpenOptions::new()
-        .max_dbs(100)
-        .open(&index_scheduler_path)
-        .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
-
-    eprintln!("Dumping the keys...");
-
-    // 2. dump the keys
-    let auth_store = AuthController::new(&db_path, &None)
-        .with_context(|| format!("While opening the auth store at {}", db_path.display()))?;
-    let mut dump_keys = dump.create_keys()?;
-    let mut count = 0;
-    for key in auth_store.list_keys()? {
-        dump_keys.push_key(&key)?;
-        count += 1;
-    }
-    dump_keys.flush()?;
-
-    eprintln!("Successfully dumped {count} keys!");
-
-    let rtxn = env.read_txn()?;
-    let all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>> =
-        try_opening_database(&env, &rtxn, "all-tasks")?;
-    let index_mapping: Database<Str, UuidCodec> =
-        try_opening_database(&env, &rtxn, "index-mapping")?;
-
-    if skip_enqueued_tasks {
-        eprintln!("Skip dumping the enqueued tasks...");
-    } else {
-        eprintln!("Dumping the enqueued tasks...");
-
-        // 3. dump the tasks
-        let mut dump_tasks = dump.create_tasks_queue()?;
-        let mut count = 0;
-        for ret in all_tasks.iter(&rtxn)? {
-            let (_, t) = ret?;
-            let status = t.status;
-            let content_file = t.content_uuid();
-            let mut dump_content_file = dump_tasks.push_task(&t.into())?;
-
-            // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
-            if let Some(content_file_uuid) = content_file {
-                if status == Status::Enqueued {
-                    let content_file = file_store.get_update(content_file_uuid)?;
-
-                    let reader =
-                        DocumentsBatchReader::from_reader(content_file).with_context(|| {
-                            format!("While reading content file {:?}", content_file_uuid)
-                        })?;
-
-                    let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
-                    while let Some(doc) = cursor.next_document().with_context(|| {
-                        format!("While iterating on content file {:?}", content_file_uuid)
-                    })? {
-                        dump_content_file
-                            .push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
-                    }
-                    dump_content_file.flush()?;
-                    count += 1;
-                }
-            }
-        }
-        dump_tasks.flush()?;
-
-        eprintln!("Successfully dumped {count} enqueued tasks!");
-    }
-
-    eprintln!("Dumping the indexes...");
-
-    // 4. Dump the indexes
-    let mut count = 0;
-    for result in index_mapping.iter(&rtxn)? {
-        let (uid, uuid) = result?;
-        let index_path = db_path.join("indexes").join(uuid.to_string());
-        let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| {
-            format!("While trying to open the index at path {:?}", index_path.display())
-        })?;
-
-        let rtxn = index.read_txn()?;
-        let metadata = IndexMetadata {
-            uid: uid.to_owned(),
-            primary_key: index.primary_key(&rtxn)?.map(String::from),
-            created_at: index.created_at(&rtxn)?,
-            updated_at: index.updated_at(&rtxn)?,
-        };
-        let mut index_dumper = dump.create_index(uid, &metadata)?;
-
-        let fields_ids_map = index.fields_ids_map(&rtxn)?;
-        let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
-
-        // 4.1. Dump the documents
-        for ret in index.all_documents(&rtxn)? {
-            let (_id, doc) = ret?;
-            let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
-            index_dumper.push_document(&document)?;
-        }
-
-        // 4.2. Dump the settings
-        let settings = meilisearch_types::settings::settings(&index, &rtxn)?;
-        index_dumper.settings(&settings)?;
-        count += 1;
-    }
-
-    eprintln!("Successfully dumped {count} indexes!");
-    // We will not dump experimental feature settings
-    eprintln!("The tool is not dumping experimental features, please set them by hand afterward");
-
-    let dump_uid = started_at.format(format_description!(
-        "[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
-    )).unwrap();
-
-    let path = dump_dir.join(format!("{}.dump", dump_uid));
-    let file = File::create(&path)?;
-    dump.persist_to(BufWriter::new(file))?;
-
-    eprintln!("Dump exported at path {:?}", path.display());
-
-    Ok(())
-}
--- a/meilitool/src/uuid_codec.rs
+++ b/meilitool/src/uuid_codec.rs
@@ -1,24 +0,0 @@
-use std::borrow::Cow;
-use std::convert::TryInto;
-
-use meilisearch_types::heed::{BytesDecode, BytesEncode};
-use uuid::Uuid;
-
-/// A heed codec for value of struct Uuid.
-pub struct UuidCodec;
-
-impl<'a> BytesDecode<'a> for UuidCodec {
-    type DItem = Uuid;
-
-    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        bytes.try_into().ok().map(Uuid::from_bytes)
-    }
-}
-
-impl BytesEncode<'_> for UuidCodec {
-    type EItem = Uuid;
-
-    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
-        Some(Cow::Borrowed(item.as_bytes()))
-    }
-}
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.4.0"
 bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
 byteorder = "1.4.3"
-charabia = { version = "0.8.5", default-features = false }
+charabia = { version = "0.8.3", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.8"
 deserr = { version = "0.6.0", features = ["actix-web"]}
@@ -82,7 +82,7 @@ md5 = "0.7.0"
 rand = { version = "0.8.5", features = ["small_rng"] }

 [features]
-all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek", "charabia/khmer"]
+all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek"]

 # Use POSIX semaphores instead of SysV semaphores in LMDB
 # For more information on this feature, see heed's Cargo.toml
@@ -106,6 +106,3 @@ thai = ["charabia/thai"]

 # allow greek specialized tokenization
 greek = ["charabia/greek"]
-
-# allow khmer specialized tokenization
-khmer = ["charabia/khmer"]
--- a/milli/src/search/facet/filter.rs
+++ b/milli/src/search/facet/filter.rs
@@ -3,7 +3,7 @@ use std::fmt::{Debug, Display};
 use std::ops::Bound::{self, Excluded, Included};

 use either::Either;
-pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token};
+pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token};
 use roaring::RoaringBitmap;
 use serde_json::Value;

--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

 pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
-pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
+pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
 use self::new::PartialSearchResult;
 use crate::error::UserError;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -46,8 +46,9 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
        if let Some(distinct_fid) = distinct_fid {
            let mut excluded = RoaringBitmap::new();
            let mut results = vec![];
+            let mut skip = 0;
            for docid in universe.iter() {
-                if results.len() >= from + length {
+                if results.len() >= length {
                    break;
                }
                if excluded.contains(docid) {
@@ -55,19 +56,16 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
                }

                distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
+                skip += 1;
+                if skip <= from {
+                    continue;
+                }
+
                results.push(docid);
            }

            let mut all_candidates = universe - excluded;
            all_candidates.extend(results.iter().copied());
-            // drain the results of the skipped elements
-            // this **must** be done **after** writing the entire results in `all_candidates` to ensure
-            // e.g. estimatedTotalHits is correct.
-            if results.len() >= from {
-                results.drain(..from);
-            } else {
-                results.clear();
-            }

            return Ok(BucketSortOutput {
                scores: vec![Default::default(); results.len()],
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -29,7 +29,7 @@ use std::hash::Hash;
 pub use cheapest_paths::PathVisitor;
 pub use condition_docids_cache::ConditionDocIdsCache;
 pub use dead_ends_cache::DeadEndsCache;
-pub use exactness::ExactnessGraph;
+pub use exactness::{ExactnessCondition, ExactnessGraph};
 pub use fid::{FidCondition, FidGraph};
 pub use position::{PositionCondition, PositionGraph};
 pub use proximity::{ProximityCondition, ProximityGraph};
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -14,7 +14,7 @@ pub use grenad_helpers::{
 };
 pub use merge_functions::{
    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps,
+    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
    serialize_roaring_bitmap, MergeFn,
 };

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -20,7 +20,10 @@ use slice_group_by::GroupBy;
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};

 use self::enrich::enrich_documents_batch;
-pub use self::enrich::{extract_finite_float_from_value, DocumentId};
+pub use self::enrich::{
+    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
+    validate_geo_from_json, DocumentId,
+};
 pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
--- a/milli/tests/search/distinct.rs
+++ b/milli/tests/search/distinct.rs
@@ -202,7 +202,7 @@ test_distinct!(
    EXTERNAL_DOCUMENTS_IDS.len(),
    1,
    vec![],
-    3
+    2
 );
 test_distinct!(
    // testing: https://github.com/meilisearch/meilisearch/issues/4078
@@ -212,7 +212,7 @@ test_distinct!(
    1,
    2,
    vec![],
-    3
+    1
 );
 test_distinct!(
    // testing: https://github.com/meilisearch/meilisearch/issues/4078
@@ -222,7 +222,7 @@ test_distinct!(
    EXTERNAL_DOCUMENTS_IDS.len(),
    2,
    vec![],
-    7
+    5
 );
 test_distinct!(
    // testing: https://github.com/meilisearch/meilisearch/issues/4078
@@ -232,5 +232,5 @@ test_distinct!(
    2,
    4,
    vec![],
-    7
+    3
 );